From d4a8b4ebb68994ca79a7a52751cf71bfac6bdf77 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 9 May 2022 18:12:34 +0200 Subject: [PATCH 01/97] FEA add NewtonSolver, CholeskyNewtonSolver and QRCholeskyNewtonSolver --- sklearn/linear_model/_glm/glm.py | 619 +++++++++++++++++- sklearn/linear_model/_glm/tests/test_glm.py | 353 +++++++++- sklearn/linear_model/_linear_loss.py | 295 +++++++-- .../linear_model/tests/test_linear_loss.py | 16 +- 4 files changed, 1214 insertions(+), 69 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 68aa4ea0df22c..6df1d73805f5e 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -6,9 +6,12 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause +from abc import ABC, abstractmethod import numbers +import warnings import numpy as np +import scipy.linalg import scipy.optimize from ..._loss.glm_distribution import TweedieDistribution @@ -20,13 +23,519 @@ HalfTweedieLossIdentity, ) from ...base import BaseEstimator, RegressorMixin -from ...utils.optimize import _check_optimize_result +from ...exceptions import ConvergenceWarning from ...utils import check_scalar, check_array, deprecated -from ...utils.validation import check_is_fitted, _check_sample_weight from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils.optimize import _check_optimize_result +from ...utils.validation import check_is_fitted, _check_sample_weight from .._linear_loss import LinearModelLoss +class NewtonSolver(ABC): + """Newton solver for GLMs. + + This class implements Newton/2nd-order optimization for GLMs. Each Newton iteration + aims at finding the Newton step which is done by the inner solver. With hessian H, + gradient g and coefficients coef, one step solves + + H @ coef_newton = -g + + For our GLM / LinearModelLoss, we have gradient g and hessian H: + + g = X.T @ loss.gradient + l2_reg_strength * coef + H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity + + Backtracking line seach updates coef = coef_old + t * coef_newton for some t in + (0, 1]. + + This is a base class, actual implementations (child classes) may deviate from the + above pattern and use structure specific tricks. + + Usage pattern: + - initialize solver: sol = NewtonSolver(...) + - solve the problem: sol.solve(X, y, sample_weight) + + References + ---------- + - Jorge Nocedal, Stephen J. Wright. (2006) "Numerical Optimization" + 2nd edition + https://doi.org/10.1007/978-0-387-40065-5 + + - Stephen P. Boyd, Lieven Vandenberghe. (2004) "Convex Optimization." + Cambridge University Press, 2004. + https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,), \ + default=None + Start coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + If None, they are initialized with zero. + + linear_loss : LinearModelLoss + The loss to be minimized. + + l2_reg_strength : float, default=0.0 + L2 regularization strength + + tol : float, default=1e-4 + The optimization problem is solved when each of the following condition is + fulfilled: + 1. maximum |gradient| <= tol + 2. Newton decrement d: 1/2 * d^2 <= tol + + max_iter : int, default=100 + Maximum number of Newton steps allowed. + + n_threads : int, default=1 + Number of OpenMP threads to use. 
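The Newton step described in this docstring can be written out directly for a small dense problem. Below is a minimal NumPy sketch of one iteration H @ coef_newton = -g, assuming a toy, unweighted Poisson log-likelihood with log link and no intercept; it only illustrates the formulas for g and H above and is not the solver's actual code path.

```python
import numpy as np

rng = np.random.default_rng(0)
n_samples, n_features = 50, 3
X = rng.standard_normal((n_samples, n_features))
y = rng.poisson(lam=np.exp(X @ np.array([0.3, -0.2, 0.1])))
l2_reg_strength = 1.0

coef = np.zeros(n_features)
raw = X @ coef                        # raw prediction (link space)
mu = np.exp(raw)                      # inverse log link
grad_pointwise = mu - y               # d loss / d raw for Poisson with log link
hess_pointwise = mu                   # d^2 loss / d raw^2, always >= 0

g = X.T @ grad_pointwise + l2_reg_strength * coef
H = X.T @ (hess_pointwise[:, None] * X) + l2_reg_strength * np.eye(n_features)
coef_newton = np.linalg.solve(H, -g)
coef = coef + coef_newton             # full step; the solver backtracks if needed
```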
+ """ + + def __init__( + self, + *, + coef=None, + linear_loss=LinearModelLoss(base_loss=HalfSquaredError, fit_intercept=True), + l2_reg_strength=0.0, + tol=1e-4, + max_iter=100, + n_threads=1, + verbose=0, + ): + self.coef = coef + self.linear_loss = linear_loss + self.l2_reg_strength = l2_reg_strength + self.tol = tol + self.max_iter = max_iter + self.n_threads = n_threads + self.verbose = verbose + + def setup(self, X, y, sample_weight): + """Precomputations + + If None, initializes: + - self.coef + Sets: + - self.raw_prediction + - self.loss_value + """ + if self.coef is None: + self.coef = self.linear_loss.init_zero_coef(X) + self.raw_prediction = np.zeros_like(y) + else: + _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw( + self.coef, X + ) + self.loss_value = self.linear_loss.loss( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + raw_prediction=self.raw_prediction, + ) + + @abstractmethod + def update_gradient_hessian(X, y, sample_weight): + """Update gradient and hessian.""" + + @abstractmethod + def inner_solve(self): + """Compute Newton step. + + Sets self.coef_newton. + """ + + def line_search(self, X, y, sample_weight): + """Backtracking line search. + + Sets: + - self.coef_old + - self.coef + - self.loss_value_old + - self.loss_value + - self.gradient_old + - self.gradient + - self.raw_prediction + """ + # line search parameters + beta, sigma = 0.5, 0.00048828125 # 1/2, 1/2**11 + eps = 16 * np.finfo(self.loss_value.dtype).eps + t = 1 # step size + + armijo_term = sigma * self.gradient @ self.coef_newton + _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw( + self.coef_newton, X + ) + + self.coef_old = self.coef + self.loss_value_old = self.loss_value + self.gradient_old = self.gradient + + # np.sum(np.abs(self.gradient_old)) + sum_abs_grad_old = -1 + sum_abs_grad_previous = -1 # Used to track sum|gradients| of i-1 + has_improved_sum_abs_grad_previous = False + + is_verbose = self.verbose >= 2 + if is_verbose: + print(" Backtracking Line Search") + print(f" eps=10 * finfo.eps={eps}") + + for i in range(21): # until and including t = beta**20 ~ 1e-6 + self.coef = self.coef_old + t * self.coef_newton + raw = self.raw_prediction + t * raw_prediction_newton + self.loss_value, self.gradient = self.linear_loss.loss_gradient( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + raw_prediction=raw, + ) + + # 1. Check Armijo / sufficient decrease condition. + # The smaller (more negative) the better. + loss_improvement = self.loss_value - self.loss_value_old + check = loss_improvement <= t * armijo_term + if is_verbose: + print( + f" line search iteration={i+1}, step size={t}\n" + f" check loss improvement <= armijo term: {loss_improvement} " + f"<= {t * armijo_term} {check}" + ) + if check: + break + # 2. Deal with relative loss differences around machine precision. + tiny_loss = np.abs(self.loss_value_old * eps) + check = np.abs(loss_improvement) <= tiny_loss + if is_verbose: + print( + " check loss |improvement| <= eps * |loss_old|:" + f" {np.abs(loss_improvement)} <= {tiny_loss} {check}" + ) + if check: + if sum_abs_grad_old < 0: + sum_abs_grad_old = scipy.linalg.norm(self.gradient_old, ord=1) + # 2.1 Check sum of absolute gradients as alternative condition. 
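For reference, the core Armijo rule applied in this line search can be sketched on a toy objective; the sketch below is illustrative only and omits the machine-precision fallbacks that the method handles next.

```python
import numpy as np

def backtracking_step(f, grad, x, d, beta=0.5, sigma=2**-11):
    """Return step size t with f(x + t*d) <= f(x) + sigma * t * grad(x) @ d."""
    t = 1.0
    armijo_term = sigma * np.dot(grad(x), d)
    while f(x + t * d) > f(x) + t * armijo_term:
        t *= beta
    return t

f = lambda x: 0.5 * np.sum((x - 3.0) ** 2)
grad = lambda x: x - 3.0
x = np.zeros(2)
d = np.array([10.0, 10.0])            # a descent direction that overshoots
t = backtracking_step(f, grad, x, d)  # t is halved until the condition holds
x = x + t * d
```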
+ sum_abs_grad = scipy.linalg.norm(self.gradient, ord=1) + check = sum_abs_grad < sum_abs_grad_old + if is_verbose: + print( + " check sum(|gradient|) <= sum(|gradient_old|): " + f"{sum_abs_grad} <= {sum_abs_grad_old} {check}" + ) + if check: + break + # 2.2 Deal with relative gradient differences around machine precision. + tiny_grad = sum_abs_grad_old * eps + abs_grad_improvement = np.abs(sum_abs_grad - sum_abs_grad_old) + check = abs_grad_improvement <= tiny_grad + if is_verbose: + print( + " check |sum(|gradient|) - sum(|gradient_old|)| <= eps * " + "sum(|gradient_old|):" + f" {abs_grad_improvement} <= {tiny_grad} {check}" + ) + if check: + break + # 2.3 This is really the last resort. + # Check that sum(|gradient_{i-1}| < |sum(|gradient_{i-2}| + # = has_improved_sum_abs_grad_previous + # If now sum(|gradient_{i}| >= |sum(|gradient_{i-1}|, this iteration + # made things worse and we should have stoped at i-1. + check = ( + has_improved_sum_abs_grad_previous + and sum_abs_grad >= sum_abs_grad_previous + ) + if is_verbose: + print( + " check if previously " + "sum(|gradient_{i-1}|) < sum(|gradient_{i-2}|) but now " + f"sum(|gradient_{i}|) >= sum(|gradient_{i-1}|) {check}" + ) + if check: + t /= beta # we go back to i-1 + self.coef = self.coef_old + t * self.coef_newton + raw = self.raw_prediction + t * raw_prediction_newton + self.loss_value, self.gradient = self.linear_loss.loss_gradient( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + raw_prediction=raw, + ) + break + # Calculate for the next iteration + has_improved_sum_abs_grad_previous = ( + sum_abs_grad < sum_abs_grad_previous + ) + sum_abs_grad_previous = sum_abs_grad + + t *= beta + else: + warnings.warn( + f"Line search of Newton solver {self.__class__.__name__} did not " + "converge after 21 line search refinement iterations.", + ConvergenceWarning, + ) + + self.raw_prediction = raw + + def check_convergence(self, X, y, sample_weight): + """Check for convergence.""" + if self.verbose: + print(" Check Convergence") + # Note: Checking maximum relative change of coefficient <= tol is a bad + # convergence criterion because even a large step could have brought us close + # to the true minimum. + # coef_step = self.coef - self.coef_old + # check = np.max(np.abs(coef_step) / np.maximum(1, np.abs(self.coef_old))) + + # 1. Criterion: maximum |gradient| <= tol + # The gradient was already updated in line_search() + check = np.max(np.abs(self.gradient)) + if self.verbose: + print(f" 1. max |gradient| {check} <= {self.tol}") + if check > self.tol: + return + + # 2. Criterion: For Newton decrement d, check 1/2 * d^2 <= tol + # d = sqrt(grad @ hessian^-1 @ grad) + # = sqrt(coef_newton @ hessian @ coef_newton) + # See Boyd, Vanderberghe (2009) "Convex Optimization" Chapter 9.5.1. + d2 = self.coef_newton @ self.hessian @ self.coef_newton + if self.verbose: + print(f" 2. Newton decrement {0.5 * d2} <= {self.tol}") + if 0.5 * d2 > self.tol: + return + + if self.verbose: + loss_value = self.linear_loss.loss( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + ) + print(f" Solver did converge at loss = {loss_value}.") + self.converged = True + + def finalize(self, X, y, sample_weight): + """Finalize the solvers results. + + Some solvers may need this, others not. + """ + pass + + def solve(self, X, y, sample_weight): + """Solve the optimization problem. 
+ + Order of calls: + self.setup() + while iteration: + self.update_gradient_hessian() + self.inner_solve() + self.line_search() + self.check_convergence() + self.finalize() + """ + # setup usually: + # - initializes self.coef if needed + # - initializes and calculates self.raw_predictions, self.loss_value + self.setup(X, y, sample_weight) + + self.iteration = 1 + self.converged = False + + while self.iteration <= self.max_iter and not self.converged: + if self.verbose: + print(f"Newton iter={self.iteration}") + # 1. Update hessian and gradient + self.update_gradient_hessian(X, y, sample_weight) + + # TODO: + # if iteration == 1: + # We might stop early, e.g. we already are close to the optimum, + # usually detected by zero gradients at this stage. + + # 2. Inner solver + # Calculate Newton step/direction + # This usually sets self.coef_newton. + self.inner_solve() + + # 3. Backtracking line search + # This usually sets self.coef_old, self.coef, self.loss_value_old + # self.loss_value, self.gradient_old, self.gradient, + # self.raw_prediction. + self.line_search(X, y, sample_weight) + + # 4. Check convergence + # Sets self.converged. + self.check_convergence( + X=X, + y=y, + sample_weight=sample_weight, + ) + + # 5. Next iteration + self.iteration += 1 + + if not self.converged: + warnings.warn( + "Newton solver did not converge after" + f" {self.iteration - 1} iterations.", + ConvergenceWarning, + ) + + self.iteration -= 1 + self.finalize(X, y, sample_weight) + return self.coef + + +class CholeskyNewtonSolver(NewtonSolver): + """Cholesky based Newton solver. + + Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear + solver. + """ + + def setup(self, X, y, sample_weight): + super().setup(X=X, y=y, sample_weight=sample_weight) + + n_dof = X.shape[1] + if self.linear_loss.fit_intercept: + n_dof += 1 + self.gradient = np.empty_like(self.coef) + self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof)) + + def update_gradient_hessian(self, X, y, sample_weight): + self.linear_loss.gradient_hessian( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + gradient_out=self.gradient, + hessian_out=self.hessian, + raw_prediction=self.raw_prediction, # this was updated in line_search + ) + + def inner_solve(self): + try: + self.coef_newton = scipy.linalg.solve( + self.hessian, -self.gradient, check_finite=False, assume_a="sym" + ) + except np.linalg.LinAlgError: + warnings.warn( + f"Inner solver of Newton solver {self.__class__.__name__} stumbbled " + "upon a singular matrix. Using SVD based least-squares solution " + "instead." + ) + # default lapack_driver="gelsd" is SVD based. + self.coef_newton = scipy.linalg.lstsq(self.hessian, -self.gradient)[0] + + +class QRCholeskyNewtonSolver(NewtonSolver): + """QR and Cholesky based Newton solver. + + This is a good solver for n_features >> n_samples, see [1]. + + This solver uses the structure of the problem, i.e. the fact that coef enters the + loss function only as X @ coef and ||coef||_2, and starts with an economic QR + decomposition of X': + + X' = QR with Q'Q = identity(k), k = min(n_samples, n_features) + + This is the same as an LQ decomposition of X. We introduce the new variable t as, + see [1]: + + (coef, intercept) = (Q @ t, intercept) + + By using X @ coef = R' @ t and ||coef||_2 = ||t||_2, we can just replace X + by R', solve for t instead of coef, and finally get coef = Q @ t. 
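A quick numerical check of the identities used above, with hypothetical toy data; this is an illustration of the reparameterization, not part of the solver.

```python
import numpy as np
from scipy.linalg import qr

rng = np.random.default_rng(0)
X = rng.standard_normal((5, 20))          # wide case: n_features >> n_samples
Q, R = qr(X.T, mode="economic")           # X' = Q @ R, Q has orthonormal columns
t = rng.standard_normal(5)
coef = Q @ t
np.testing.assert_allclose(X @ coef, R.T @ t)                        # X @ coef = R' @ t
np.testing.assert_allclose(np.linalg.norm(coef), np.linalg.norm(t))  # norms agree
```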
+ Note that t has less elements than coef if n_features > n_sampels: + len(t) = k = min(n_samples, n_features) <= n_features = len(coef). + + [1] Hastie, T.J., & Tibshirani, R. (2003). Expression Arrays and the p n Problem. + https://web.stanford.edu/~hastie/Papers/pgtn.pdf + """ + + def setup(self, X, y, sample_weight): + n_samples, n_features = X.shape + # TODO: setting pivoting=True could improve stability + # QR of X' + self.Q, self.R = scipy.linalg.qr(X.T, mode="economic", pivoting=False) + # use k = min(n_features, n_samples) instead of n_features + k = self.R.T.shape[1] + n_dof = k + if self.linear_loss.fit_intercept: + n_dof += 1 + # store original coef + self.coef_original = self.coef + # set self.coef = t (coef_original = Q @ t) + self.coef = np.zeros_like(self.coef, shape=n_dof) + if np.sum(np.abs(self.coef_original)) > 0: + self.coef[:k] = self.Q.T @ self.coef_original[:n_features] + self.gradient = np.empty_like(self.coef) + self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof)) + + super().setup(X=self.R.T, y=y, sample_weight=sample_weight) + + def update_gradient_hessian(self, X, y, sample_weight): + # Use R' instead of X + self.linear_loss.gradient_hessian( + coef=self.coef, + X=self.R.T, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + gradient_out=self.gradient, + hessian_out=self.hessian, + raw_prediction=self.raw_prediction, # this was updated in line_search + ) + + def inner_solve(self): + try: + self.coef_newton = scipy.linalg.solve( + self.hessian, -self.gradient, check_finite=False, assume_a="sym" + ) + except np.linalg.LinAlgError: + warnings.warn( + f"Inner solver of Newton solver {self.__class__.__name__} stumbbled " + "upon a singular matrix. Using SVD based least-squares solution " + "instead." + ) + # default lapack_driver="gelsd" is SVD based. + self.coef_newton = scipy.linalg.lstsq(self.hessian, -self.gradient)[0] + + def line_search(self, X, y, sample_weight): + # Use R' instead of X + super().line_search(X=self.R.T, y=y, sample_weight=sample_weight) + + def check_convergence(self, X, y, sample_weight): + # Use R' instead of X + super().check_convergence(X=self.R.T, y=y, sample_weight=sample_weight) + + def finalize(self, X, y, sample_weight): + n_features = X.shape[1] + w, intercept = self.linear_loss.weight_intercept(self.coef) + self.coef_original[:n_features] = self.Q @ w + if self.linear_loss.fit_intercept: + self.coef_original[-1] = intercept + self.coef = self.coef_original + + class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): """Regression via a penalized Generalized Linear Model (GLM). @@ -64,12 +573,20 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - solver : 'lbfgs', default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-qr-cholesky'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' Calls scipy's L-BFGS-B optimizer. + 'newton-cholesky' + Uses Newton-Raphson steps (equals iterated reweighted least squares) with + an inner cholesky based solver. + + 'newton-qr-cholesky' + Same as 'newton-cholesky' but uses a qr decomposition of X.T. This solver + is better for n_features >> n_samples than 'newton-cholesky'. + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. 
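A hypothetical usage sketch of the solver option documented above, on toy data; it assumes the 'newton-cholesky' value is available as introduced by this patch.

```python
import numpy as np
from sklearn.linear_model import PoissonRegressor

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 3))
y = rng.poisson(lam=np.exp(X @ np.array([0.3, -0.2, 0.1])))
reg = PoissonRegressor(solver="newton-cholesky", alpha=1.0, tol=1e-8).fit(X, y)
print(reg.coef_, reg.intercept_, reg.n_iter_)
```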
@@ -173,10 +690,18 @@ def fit(self, X, y, sample_weight=None): self.fit_intercept ) ) - if self.solver not in ["lbfgs"]: + # We allow for NewtonSolver classes but do not make them public in the + # docstrings. This facilitates testing and benchmarking. + if self.solver not in [ + "lbfgs", + "newton-cholesky", + "newton-qr-cholesky", + ] and not ( + isinstance(self.solver, type) and issubclass(self.solver, NewtonSolver) + ): raise ValueError( - f"{self.__class__.__name__} supports only solvers 'lbfgs'; " - f"got {self.solver}" + f"{self.__class__.__name__} supports only solvers 'lbfgs', " + f"'newton-cholesky' and 'newton-qr-cholesky'; got {self.solver}" ) solver = self.solver check_scalar( @@ -271,12 +796,13 @@ def fit(self, X, y, sample_weight=None): else: coef = np.zeros(n_features, dtype=loss_dtype) + l2_reg_strength = self.alpha + n_threads = _openmp_effective_n_threads() + # Algorithms for optimization: # Note again that our losses implement 1/2 * deviance. if solver == "lbfgs": func = self._linear_loss.loss_gradient - l2_reg_strength = self.alpha - n_threads = _openmp_effective_n_threads() opt_res = scipy.optimize.minimize( func, @@ -285,14 +811,41 @@ def fit(self, X, y, sample_weight=None): jac=True, options={ "maxiter": self.max_iter, + "maxls": 30, # default is 20 "iprint": (self.verbose > 0) - 1, "gtol": self.tol, - "ftol": 1e3 * np.finfo(float).eps, + "ftol": 64 * np.finfo(np.float64).eps, # lbfgs is float64 land. }, args=(X, y, sample_weight, l2_reg_strength, n_threads), ) self.n_iter_ = _check_optimize_result("lbfgs", opt_res) coef = opt_res.x + elif solver in ["newton-cholesky", "newton-qr-cholesky"]: + sol_dict = { + "newton-cholesky": CholeskyNewtonSolver, + "newton-qr-cholesky": QRCholeskyNewtonSolver, + } + sol = sol_dict[solver]( + coef=coef, + linear_loss=self._linear_loss, + l2_reg_strength=l2_reg_strength, + tol=self.tol, + max_iter=self.max_iter, + n_threads=n_threads, + verbose=self.verbose, + ) + coef = sol.solve(X, y, sample_weight) + self.n_iter_ = sol.iteration + elif issubclass(solver, NewtonSolver): + sol = solver( + coef=coef, + linear_loss=self._linear_loss, + l2_reg_strength=l2_reg_strength, + tol=self.tol, + max_iter=self.max_iter, + n_threads=n_threads, + ) + coef = sol.solve(X, y, sample_weight) if self.fit_intercept: self.intercept_ = coef[-1] @@ -482,6 +1035,20 @@ class PoissonRegressor(_GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cholesky' + Uses Newton-Raphson steps (equals iterated reweighted least squares) with + an inner cholesky based solver. + + 'newton-qr-cholesky' + Same as 'newton-cholesky' but uses a qr decomposition of X.T. This solver + is better for n_features >> n_samples than 'newton-cholesky'. + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. @@ -551,6 +1118,7 @@ def __init__( *, alpha=1.0, fit_intercept=True, + solver="lbfgs", max_iter=100, tol=1e-4, warm_start=False, @@ -559,6 +1127,7 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, + solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, @@ -591,6 +1160,20 @@ class GammaRegressor(_GeneralizedLinearRegressor): Specifies if a constant (a.k.a. 
bias or intercept) should be added to the linear predictor (X @ coef + intercept). + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cholesky' + Uses Newton-Raphson steps (equals iterated reweighted least squares) with + an inner cholesky based solver. + + 'newton-qr-cholesky' + Same as 'newton-cholesky' but uses a qr decomposition of X.T. This solver + is better for n_features >> n_samples than 'newton-cholesky'. + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. @@ -661,6 +1244,7 @@ def __init__( *, alpha=1.0, fit_intercept=True, + solver="lbfgs", max_iter=100, tol=1e-4, warm_start=False, @@ -669,6 +1253,7 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, + solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, @@ -731,6 +1316,20 @@ class TweedieRegressor(_GeneralizedLinearRegressor): - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian distributions + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cholesky' + Uses Newton-Raphson steps (equals iterated reweighted least squares) with + an inner cholesky based solver. + + 'newton-qr-cholesky' + Same as 'newton-cholesky' but uses a qr decomposition of X.T. This solver + is better for n_features >> n_samples than 'newton-cholesky'. + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. @@ -803,6 +1402,7 @@ def __init__( alpha=1.0, fit_intercept=True, link="auto", + solver="lbfgs", max_iter=100, tol=1e-4, warm_start=False, @@ -811,6 +1411,7 @@ def __init__( super().__init__( alpha=alpha, fit_intercept=fit_intercept, + solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 9bfa2fe28e91a..05d72c5fd8ffe 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -2,25 +2,43 @@ # # License: BSD 3 clause +from functools import partial import re +import warnings + import numpy as np from numpy.testing import assert_allclose import pytest -import warnings +from scipy import linalg +from scipy.optimize import root from sklearn.base import clone +from sklearn._loss import HalfBinomialLoss from sklearn._loss.glm_distribution import TweedieDistribution from sklearn._loss.link import IdentityLink, LogLink -from sklearn.datasets import make_regression +from sklearn.datasets import make_low_rank_matrix, make_regression +from sklearn.linear_model import ( + GammaRegressor, + PoissonRegressor, + Ridge, + TweedieRegressor, +) from sklearn.linear_model._glm import _GeneralizedLinearRegressor -from sklearn.linear_model import TweedieRegressor, PoissonRegressor, GammaRegressor -from sklearn.linear_model import Ridge +from sklearn.linear_model._linear_loss import LinearModelLoss from sklearn.exceptions import ConvergenceWarning from sklearn.metrics import d2_tweedie_score from sklearn.model_selection import train_test_split +SOLVERS = ["lbfgs", "newton-cholesky", "newton-qr-cholesky"] + + +class BinomialRegressor(_GeneralizedLinearRegressor): + def _get_loss(self): + return HalfBinomialLoss() + + @pytest.fixture(scope="module") def regression_data(): X, y = make_regression( @@ -29,6 
+47,272 @@ def regression_data(): return X, y +@pytest.fixture( + params=zip( + ["long", "wide"], + [ + BinomialRegressor(), + PoissonRegressor(), + GammaRegressor(), + TweedieRegressor(power=3.0), + TweedieRegressor(power=0, link="log"), + TweedieRegressor(power=1.5), + ], + ) +) +def glm_dataset(global_random_seed, request): + """Dataset with GLM solutions, well conditioned X. + + This is inspired by ols_ridge_dataset in test_ridge.py. + + The construction is based on the SVD decomposition of X = U S V'. + + Parameters + ---------- + type : {"long", "wide"} + If "long", then n_samples > n_features. + If "wide", then n_features > n_samples. + model : a GLM model + + For "wide", we return the minimum norm solution w = X' (XX')^-1 y: + + min ||w||_2 subject to X w = y + + Returns + ------- + model : GLM model + X : ndarray + Last column of 1, i.e. intercept. + y : ndarray + coef_unpenalized : ndarray + Minimum norm solutions, i.e. min sum(loss(w)) (with mininum ||w||_2 in + case of ambiguity) + Last coefficient is intercept. + coef_penalized : ndarray + GLM solution with alpha=l2_reg_strength=1, i.e. + min 1/n * sum(loss) + ||w||_2^2. + Last coefficient is intercept. + """ + type, model = request.param + # Make larger dim more than double as big as the smaller one. + # This helps when constructing singular matrices like (X, X). + if type == "long": + n_samples, n_features = 12, 4 + else: + n_samples, n_features = 4, 12 + k = min(n_samples, n_features) + rng = np.random.RandomState(global_random_seed) + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + effective_rank=k, + tail_strength=0.1, + random_state=rng, + ) + X[:, -1] = 1 # last columns acts as intercept + U, s, Vt = linalg.svd(X) + assert np.all(s) > 1e-3 # to be sure + U1, _ = U[:, :k], U[:, k:] + Vt1, _ = Vt[:k, :], Vt[k:, :] + + if request.param == "long": + coef_unpenalized = rng.uniform(low=1, high=3, size=n_features) + coef_unpenalized *= rng.choice([-1, 1], size=n_features) + raw_prediction = X @ coef_unpenalized + else: + raw_prediction = rng.uniform(low=-3, high=3, size=n_samples) + # w = X'(XX')^-1 y = V s^-1 U' y + coef_unpenalized = Vt1.T @ np.diag(1 / s) @ U1.T @ raw_prediction + + linear_loss = LinearModelLoss(base_loss=model._get_loss(), fit_intercept=True) + sw = np.full(shape=n_samples, fill_value=1 / n_samples) + y = linear_loss.base_loss.link.inverse(raw_prediction) + + # Add penalty l2_reg_strength * ||coef||_2^2 for l2_reg_strength=1 and solve with + # optimizer. Note that the problem is well conditioned such that we get accurate + # results. + l2_reg_strength = 1 + fun = partial( + linear_loss.gradient, + X=X[:, :-1], + y=y, + sample_weight=sw, + l2_reg_strength=l2_reg_strength, + ) + # Note: Toot finding is more precise then minimizing a function. 
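As a sanity check of this approach, in the squared-error special case the root of the penalized gradient coincides with the ridge closed-form solution; the toy sketch below mirrors the call to `scipy.optimize.root` that follows.

```python
import numpy as np
from scipy.optimize import root

rng = np.random.default_rng(0)
n = 20
X = rng.standard_normal((n, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.standard_normal(n)
l2_reg_strength = 1.0

def grad(w):
    # gradient of 1/(2n) * ||X w - y||^2 + l2_reg_strength/2 * ||w||^2
    return X.T @ (X @ w - y) / n + l2_reg_strength * w

w_root = root(grad, np.zeros(3), method="lm").x
w_ridge = np.linalg.solve(
    X.T @ X / n + l2_reg_strength * np.eye(3), X.T @ y / n
)
np.testing.assert_allclose(w_root, w_ridge, rtol=1e-6)
```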
+ res = root( + fun, + coef_unpenalized, + method="lm", + options={"ftol": 1e-14, "xtol": 1e-14, "gtol": 1e-14}, + ) + coef_penalized_with_intercept = res.x + + linear_loss = LinearModelLoss(base_loss=model._get_loss(), fit_intercept=False) + fun = partial( + linear_loss.gradient, + X=X[:, :-1], + y=y, + sample_weight=sw, + l2_reg_strength=l2_reg_strength, + ) + res = root( + fun, + coef_unpenalized[:-1], + method="lm", + options={"ftol": 1e-14, "xtol": 1e-14, "gtol": 1e-14}, + ) + coef_penalized_without_intercept = res.x + + # To be sure + assert np.linalg.norm(coef_penalized_with_intercept) < np.linalg.norm( + coef_unpenalized + ) + + return ( + model, + X, + y, + coef_unpenalized, + coef_penalized_with_intercept, + coef_penalized_without_intercept, + ) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_glm_regression(solver, fit_intercept, glm_dataset): + """Test that GLM converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + """ + model, X, y, _, coef_with_intercept, coef_without_intercept = glm_dataset + alpha = 1.0 # because glm_dataset uses this. + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + model = clone(model).set_params(**params) + X = X[:, :-1] # remove intercept + if fit_intercept: + coef = coef_with_intercept + intercept = coef[-1] + coef = coef[:-1] + else: + coef = coef_without_intercept + intercept = 0 + model.fit(X, y) + + rtol = 3e-5 if solver == "lbfgs" else 1e-11 + assert model.intercept_ == pytest.approx(intercept, rel=rtol) + assert_allclose(model.coef_, coef, rtol=rtol) + + # Same with sample_weight. + model = ( + clone(model).set_params(**params).fit(X, y, sample_weight=np.ones(X.shape[0])) + ) + assert model.intercept_ == pytest.approx(intercept, rel=rtol) + assert_allclose(model.coef_, coef, rtol=rtol) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): + """Test that GLM converges for all solvers to correct solution on hstacked data. + + We work with a simple constructed data set with known solution. + Fit on [X] with alpha is the same as fit on [X, X]/2 with alpha/2. + For long X, [X, X] is a singular matrix. + """ + model, X, y, _, coef_with_intercept, coef_without_intercept = glm_dataset + n_samples, n_features = X.shape + alpha = 1.0 # because glm_dataset uses this. + params = dict( + alpha=alpha / 2, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + if ( + solver == "lbfgs" + and fit_intercept is False + and ( + isinstance(model, BinomialRegressor) + or (isinstance(model, PoissonRegressor) and n_features > n_samples) + ) + ): + # Line search cannot locate an adequate point after MAXLS + # function and gradient evaluations. + # Previous x, f and g restored. + # Possible causes: 1 error in function or gradient evaluation; + # 2 rounding error dominate computation. 
+ pytest.xfail() + + model = clone(model).set_params(**params) + X = X[:, :-1] # remove intercept + X = 0.5 * np.concatenate((X, X), axis=1) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features - 1) + if fit_intercept: + coef = coef_with_intercept + intercept = coef[-1] + coef = coef[:-1] + else: + coef = coef_without_intercept + intercept = 0 + model.fit(X, y) + + rtol = 3e-5 if solver == "lbfgs" else 1e-11 + assert model.intercept_ == pytest.approx(intercept, rel=rtol) + assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset): + """Test that GLM converges for all solvers to correct solution on vstacked data. + + We work with a simple constructed data set with known solution. + Fit on [X] with alpha is the same as fit on [X], [y] + [X], [y] with 1 * alpha. + It is the same alpha as the average loss stays the same. + For wide X, [X', X'] is a singular matrix. + """ + model, X, y, _, coef_with_intercept, coef_without_intercept = glm_dataset + n_samples, n_features = X.shape + alpha = 1.0 # because glm_dataset uses this. + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + model = clone(model).set_params(**params) + X = X[:, :-1] # remove intercept + X = np.concatenate((X, X), axis=0) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + y = np.r_[y, y] + if fit_intercept: + coef = coef_with_intercept + intercept = coef[-1] + coef = coef[:-1] + else: + coef = coef_without_intercept + intercept = 0 + model.fit(X, y) + + rtol = 3e-5 if solver == "lbfgs" else 1e-11 + assert model.intercept_ == pytest.approx(intercept, rel=rtol) + assert_allclose(model.coef_, coef, rtol=rtol) + + def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" # scalar value but not positive @@ -233,6 +517,7 @@ def test_glm_sample_weight_consistency(fit_intercept, alpha, GLMEstimator): assert_allclose(glm1.coef_, glm2.coef_) +@pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) @pytest.mark.parametrize( "estimator", @@ -245,7 +530,7 @@ def test_glm_sample_weight_consistency(fit_intercept, alpha, GLMEstimator): TweedieRegressor(power=4.5), ], ) -def test_glm_log_regression(fit_intercept, estimator): +def test_glm_log_regression(solver, fit_intercept, estimator): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[0, 1, 2, 3, 4], [1, 1, 1, 1, 1]]).T @@ -253,6 +538,7 @@ def test_glm_log_regression(fit_intercept, estimator): glm = clone(estimator).set_params( alpha=0, fit_intercept=fit_intercept, + solver=solver, tol=1e-8, ) if fit_intercept: @@ -264,38 +550,61 @@ def test_glm_log_regression(fit_intercept, estimator): assert_allclose(res.coef_, coef, rtol=2e-6) +@pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) -def test_warm_start(fit_intercept): - n_samples, n_features = 110, 10 +def test_warm_start(solver, fit_intercept, global_random_seed): + n_samples, n_features = 100, 10 X, y = make_regression( n_samples=n_samples, n_features=n_features, n_informative=n_features - 2, - noise=0.5, - random_state=42, + bias=fit_intercept * 1.0, + noise=1.0, + random_state=global_random_seed, ) + y = np.abs(y) # Poisson requires non-negative targets. 
+ params = {"solver": solver, "fit_intercept": fit_intercept, "tol": 1e-10} - glm1 = _GeneralizedLinearRegressor( - warm_start=False, fit_intercept=fit_intercept, max_iter=1000 - ) + glm1 = PoissonRegressor(warm_start=False, max_iter=1000, **params) glm1.fit(X, y) - glm2 = _GeneralizedLinearRegressor( - warm_start=True, fit_intercept=fit_intercept, max_iter=1 - ) - # As we intentionally set max_iter=1, L-BFGS-B will issue a + glm2 = PoissonRegressor(warm_start=True, max_iter=1, **params) + # As we intentionally set max_iter=1, the solver will issue a # ConvergenceWarning which we here simply ignore. with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=ConvergenceWarning) glm2.fit(X, y) - assert glm1.score(X, y) > glm2.score(X, y) + + linear_loss = LinearModelLoss( + base_loss=glm1._get_loss(), + fit_intercept=fit_intercept, + ) + sw = np.full_like(y, fill_value=1 / n_samples) + + objective_glm1 = linear_loss.loss( + coef=np.r_[glm1.coef_, glm1.intercept_] if fit_intercept else glm1.coef_, + X=X, + y=y, + sample_weight=sw, + l2_reg_strength=1.0, + ) + objective_glm2 = linear_loss.loss( + coef=np.r_[glm2.coef_, glm2.intercept_] if fit_intercept else glm2.coef_, + X=X, + y=y, + sample_weight=sw, + l2_reg_strength=1.0, + ) + assert objective_glm1 < objective_glm2 + glm2.set_params(max_iter=1000) glm2.fit(X, y) - # The two model are not exactly identical since the lbfgs solver + # The two models are not exactly identical since the lbfgs solver # computes the approximate hessian from previous iterations, which # will not be strictly identical in the case of a warm start. - assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) - assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) + rtol = 2e-4 if solver == "lbfgs" else 1e-9 + assert_allclose(glm1.coef_, glm2.coef_, rtol=rtol) + assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-5) # FIXME: 'normalize' to be removed in 1.2 in LinearRegression @@ -360,7 +669,8 @@ def test_normal_ridge_comparison( assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=2e-4) -def test_poisson_glmnet(): +@pytest.mark.parametrize("solver", ["lbfgs", "newton-cholesky", "newton-qr-cholesky"]) +def test_poisson_glmnet(solver): """Compare Poisson regression with L2 regularization and LogLink to glmnet""" # library("glmnet") # options(digits=10) @@ -380,6 +690,7 @@ def test_poisson_glmnet(): fit_intercept=True, tol=1e-7, max_iter=300, + solver=solver, ) glm.fit(X, y) assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index 64a99325dcd7a..68c79d316598f 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -67,8 +67,27 @@ def __init__(self, base_loss, fit_intercept): self.base_loss = base_loss self.fit_intercept = fit_intercept - def _w_intercept_raw(self, coef, X): - """Helper function to get coefficients, intercept and raw_prediction. + def init_zero_coef(self, X): + """Allocate coef of correct shape with zeros. + + Parameters: + ----------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. 
+ """ + n_features = X.shape[1] + n_classes = self.base_loss.n_classes + if self.fit_intercept: + n_dof = n_features + 1 + else: + n_dof = n_features + if self.base_loss.is_multiclass: + self.coef = np.zeros_like(X, shape=(n_classes, n_dof)) + else: + self.coef = np.zeros_like(X, shape=n_dof) + + def weight_intercept(self, coef): + """Helper function to get coefficients and intercept. Parameters ---------- @@ -77,8 +96,6 @@ def _w_intercept_raw(self, coef, X): If shape (n_classes * n_dof,), the classes of one feature are contiguous, i.e. one reconstructs the 2d-array via coef.reshape((n_classes, -1), order="F"). - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Training data. Returns ------- @@ -86,8 +103,6 @@ def _w_intercept_raw(self, coef, X): Coefficients without intercept term. intercept : float or ndarray of shape (n_classes,) Intercept terms. - raw_prediction : ndarray of shape (n_samples,) or \ - (n_samples, n_classes) """ if not self.base_loss.is_multiclass: if self.fit_intercept: @@ -96,7 +111,6 @@ def _w_intercept_raw(self, coef, X): else: intercept = 0.0 weights = coef - raw_prediction = X @ weights + intercept else: # reshape to (n_classes, n_dof) if coef.ndim == 1: @@ -108,11 +122,56 @@ def _w_intercept_raw(self, coef, X): weights = weights[:, :-1] else: intercept = 0.0 + + return weights, intercept + + def weight_intercept_raw(self, coef, X): + """Helper function to get coefficients, intercept and raw_prediction. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + + Returns + ------- + weights : ndarray of shape (n_features,) or (n_classes, n_features) + Coefficients without intercept term. + intercept : float or ndarray of shape (n_classes,) + Intercept terms. + raw_prediction : ndarray of shape (n_samples,) or \ + (n_samples, n_classes) + """ + weights, intercept = self.weight_intercept(coef) + + if not self.base_loss.is_multiclass: + raw_prediction = X @ weights + intercept + else: + # weights has shape to (n_classes, n_dof) raw_prediction = X @ weights.T + intercept # ndarray, likely C-contiguous return weights, intercept, raw_prediction - def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1): + def l2_penalty(self, weights, l2_reg_strength): + """Compute L2 penalty term l2_reg_strength/2 *||w||_2^2.""" + norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights) + return 0.5 * l2_reg_strength * norm2_w + + def loss( + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + raw_prediction=None, + ): """Compute the loss as sum over point-wise losses. Parameters @@ -132,13 +191,20 @@ def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1) L2 regularization strength n_threads : int, default=1 Number of OpenMP threads to use. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. Returns ------- loss : float Sum of losses per sample plus penalty. 
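The decomposition documented above is easy to state concretely for the half squared error base loss; this is a minimal sketch with toy data, with the intercept excluded from the penalty as in the code.

```python
import numpy as np

rng = np.random.default_rng(0)
X, y = rng.standard_normal((10, 3)), rng.standard_normal(10)
weights = np.array([0.5, -1.0, 2.0])
intercept, l2_reg_strength = 0.1, 1.0

raw_prediction = X @ weights + intercept           # may be precomputed and passed in
loss_pointwise = 0.5 * (raw_prediction - y) ** 2   # HalfSquaredError per sample
loss = loss_pointwise.sum() + 0.5 * l2_reg_strength * (weights @ weights)
```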
""" - weights, intercept, raw_prediction = self._w_intercept_raw(coef, X) + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) loss = self.base_loss.loss( y_true=y, @@ -148,11 +214,17 @@ def loss(self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1) ) loss = loss.sum() - norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights) - return loss + 0.5 * l2_reg_strength * norm2_w + return loss + self.l2_penalty(weights, l2_reg_strength) def loss_gradient( - self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1 + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + raw_prediction=None, ): """Computes the sum of loss and gradient w.r.t. coef. @@ -173,6 +245,10 @@ def loss_gradient( L2 regularization strength n_threads : int, default=1 Number of OpenMP threads to use. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. Returns ------- @@ -184,36 +260,46 @@ def loss_gradient( """ n_features, n_classes = X.shape[1], self.base_loss.n_classes n_dof = n_features + int(self.fit_intercept) - weights, intercept, raw_prediction = self._w_intercept_raw(coef, X) - loss, grad_per_sample = self.base_loss.loss_gradient( + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) + + loss, grad_pointwise = self.base_loss.loss_gradient( y_true=y, raw_prediction=raw_prediction, sample_weight=sample_weight, n_threads=n_threads, ) loss = loss.sum() + loss += self.l2_penalty(weights, l2_reg_strength) if not self.base_loss.is_multiclass: - loss += 0.5 * l2_reg_strength * (weights @ weights) grad = np.empty_like(coef, dtype=weights.dtype) - grad[:n_features] = X.T @ grad_per_sample + l2_reg_strength * weights + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights if self.fit_intercept: - grad[-1] = grad_per_sample.sum() + grad[-1] = grad_pointwise.sum() else: - loss += 0.5 * l2_reg_strength * squared_norm(weights) grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") - # grad_per_sample.shape = (n_samples, n_classes) - grad[:, :n_features] = grad_per_sample.T @ X + l2_reg_strength * weights + # grad_pointwise.shape = (n_samples, n_classes) + grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights if self.fit_intercept: - grad[:, -1] = grad_per_sample.sum(axis=0) + grad[:, -1] = grad_pointwise.sum(axis=0) if coef.ndim == 1: grad = grad.ravel(order="F") return loss, grad def gradient( - self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1 + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + raw_prediction=None, ): """Computes the gradient w.r.t. coef. @@ -234,6 +320,10 @@ def gradient( L2 regularization strength n_threads : int, default=1 Number of OpenMP threads to use. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. 
Returns ------- @@ -242,9 +332,13 @@ def gradient( """ n_features, n_classes = X.shape[1], self.base_loss.n_classes n_dof = n_features + int(self.fit_intercept) - weights, intercept, raw_prediction = self._w_intercept_raw(coef, X) - grad_per_sample = self.base_loss.gradient( + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) + + grad_pointwise = self.base_loss.gradient( y_true=y, raw_prediction=raw_prediction, sample_weight=sample_weight, @@ -253,21 +347,143 @@ def gradient( if not self.base_loss.is_multiclass: grad = np.empty_like(coef, dtype=weights.dtype) - grad[:n_features] = X.T @ grad_per_sample + l2_reg_strength * weights + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights if self.fit_intercept: - grad[-1] = grad_per_sample.sum() + grad[-1] = grad_pointwise.sum() return grad else: grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") # gradient.shape = (n_samples, n_classes) - grad[:, :n_features] = grad_per_sample.T @ X + l2_reg_strength * weights + grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights if self.fit_intercept: - grad[:, -1] = grad_per_sample.sum(axis=0) + grad[:, -1] = grad_pointwise.sum(axis=0) if coef.ndim == 1: return grad.ravel(order="F") else: return grad + def gradient_hessian( + self, + coef, + X, + y, + sample_weight=None, + l2_reg_strength=0.0, + n_threads=1, + gradient_out=None, + hessian_out=None, + raw_prediction=None, + ): + """Computes gradient and hessian w.r.t. coef. + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Training data. + y : contiguous array of shape (n_samples,) + Observed, true target values. + sample_weight : None or contiguous array of shape (n_samples,), default=None + Sample weights. + l2_reg_strength : float, default=0.0 + L2 regularization strength + n_threads : int, default=1 + Number of OpenMP threads to use. + gradient_out : None or ndarray of shape coef.shape + A location into which the gradient is stored. If None, a new array + might be created. + hessian_out : None or ndarray + A location into which the hessian is stored. If None, a new array + might be created. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). If provided, these are used. If + None, then raw_prediction = X @ coef + intercept is calculated. + + Returns + ------- + gradient : ndarray of shape coef.shape + The gradient of the loss. + + hessian : ndarray + Hessian matrix. + """ + n_samples, n_features = X.shape + n_dof = n_features + int(self.fit_intercept) + + if raw_prediction is None: + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + else: + weights, intercept = self.weight_intercept(coef) + + grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian( + y_true=y, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + n_threads=n_threads, + ) + + # For non-canonical link functions and far away from the optimum, we take + # care that the hessian is not negative. 
+ hess_pointwise[hess_pointwise <= 0] = 0 + + if not self.base_loss.is_multiclass: + # gradient + if gradient_out is None: + grad = np.empty_like(coef, dtype=weights.dtype) + else: + grad = gradient_out + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights + if self.fit_intercept: + grad[-1] = grad_pointwise.sum() + + # hessian + if hessian_out is None: + hess = np.empty(shape=(n_dof, n_dof), dtype=weights.dtype) + else: + hess = hessian_out + # TODO: This "sandwich product", X' diag(W) X, can be greatly improved by + # a dedicated Cython routine. + if sparse.issparse(X): + hess[:n_features, :n_features] = ( + X.T + @ sparse.dia_matrix( + (hess_pointwise, 0), shape=(n_samples, n_samples) + ) + @ X + ).toarray() + else: + # np.einsum may use less memory but the following is by far faster. + # This matrix multiplication (gemm) is most often the most time + # consuming step for solvers. + WX = hess_pointwise[:, None] * X + hess[:n_features, :n_features] = np.dot(X.T, WX) + # flattened view on the array + if l2_reg_strength > 0: + hess.reshape(-1)[ + : (n_features * n_dof) : (n_dof + 1) + ] += l2_reg_strength + + if self.fit_intercept: + # With intercept included as added column to X, the hessian becomes + # hess = (X, 1)' @ diag(h) @ (X, 1) + # = (X' @ diag(h) @ X, X' @ h) + # ( h @ X, sum(h)) + Xh = X.T @ hess_pointwise + hess[:-1, -1] = Xh + hess[-1, :-1] = Xh + hess[-1, -1] = hess_pointwise.sum() + else: + # Here we may safely assume HalfMultinomialLoss aka categorical + # cross-entropy. + raise NotImplementedError + + return grad, hess + def gradient_hessian_product( self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1 ): @@ -302,26 +518,29 @@ def gradient_hessian_product( """ (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes n_dof = n_features + int(self.fit_intercept) - weights, intercept, raw_prediction = self._w_intercept_raw(coef, X) + weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) if not self.base_loss.is_multiclass: - gradient, hessian = self.base_loss.gradient_hessian( + grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian( y_true=y, raw_prediction=raw_prediction, sample_weight=sample_weight, n_threads=n_threads, ) grad = np.empty_like(coef, dtype=weights.dtype) - grad[:n_features] = X.T @ gradient + l2_reg_strength * weights + grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights if self.fit_intercept: - grad[-1] = gradient.sum() + grad[-1] = grad_pointwise.sum() # Precompute as much as possible: hX, hX_sum and hessian_sum - hessian_sum = hessian.sum() + hessian_sum = hess_pointwise.sum() if sparse.issparse(X): - hX = sparse.dia_matrix((hessian, 0), shape=(n_samples, n_samples)) @ X + hX = ( + sparse.dia_matrix((hess_pointwise, 0), shape=(n_samples, n_samples)) + @ X + ) else: - hX = hessian[:, np.newaxis] * X + hX = hess_pointwise[:, np.newaxis] * X if self.fit_intercept: # Calculate the double derivative with respect to intercept. @@ -352,16 +571,16 @@ def hessp(s): # HalfMultinomialLoss computes only the diagonal part of the hessian, i.e. # diagonal in the classes. Here, we want the matrix-vector product of the # full hessian. Therefore, we call gradient_proba. 
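The intercept handling above can be checked numerically: after appending a column of ones to X, the sandwich product of the augmented matrix has exactly the blocks filled in by the code. A toy sketch for the single-output case:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((8, 3))
hess_pointwise = rng.uniform(0.1, 1.0, size=8)
Xa = np.hstack([X, np.ones((8, 1))])          # X with an added intercept column
H = Xa.T @ (hess_pointwise[:, None] * Xa)     # (X, 1)' @ diag(h) @ (X, 1)

np.testing.assert_allclose(H[:3, :3], X.T @ (hess_pointwise[:, None] * X))
np.testing.assert_allclose(H[:3, 3], X.T @ hess_pointwise)
np.testing.assert_allclose(H[3, 3], hess_pointwise.sum())
```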
- gradient, proba = self.base_loss.gradient_proba( + grad_pointwise, proba = self.base_loss.gradient_proba( y_true=y, raw_prediction=raw_prediction, sample_weight=sample_weight, n_threads=n_threads, ) grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") - grad[:, :n_features] = gradient.T @ X + l2_reg_strength * weights + grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights if self.fit_intercept: - grad[:, -1] = gradient.sum(axis=0) + grad[:, -1] = grad_pointwise.sum(axis=0) # Full hessian-vector product, i.e. not only the diagonal part of the # hessian. Derivation with some index battle for input vector s: diff --git a/sklearn/linear_model/tests/test_linear_loss.py b/sklearn/linear_model/tests/test_linear_loss.py index d4e20ad69ca8a..c48680a282611 100644 --- a/sklearn/linear_model/tests/test_linear_loss.py +++ b/sklearn/linear_model/tests/test_linear_loss.py @@ -81,7 +81,7 @@ def choice_vectorized(items, p): @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize("sample_weight", [None, "range"]) @pytest.mark.parametrize("l2_reg_strength", [0, 1]) -def test_loss_gradients_are_the_same( +def test_loss_grad_hess_are_the_same( base_loss, fit_intercept, sample_weight, l2_reg_strength ): """Test that loss and gradient are the same across different functions.""" @@ -105,10 +105,17 @@ def test_loss_gradients_are_the_same( g3, h3 = loss.gradient_hessian_product( coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength ) + if not base_loss.is_multiclass: + g4, h4, _ = loss.gradient_hessian( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) assert_allclose(l1, l2) assert_allclose(g1, g2) assert_allclose(g1, g3) + if not base_loss.is_multiclass: + assert_allclose(g1, g4) + assert_allclose(h4 @ g4, h3(g3)) # same for sparse X X = sparse.csr_matrix(X) @@ -124,6 +131,10 @@ def test_loss_gradients_are_the_same( g3_sp, h3_sp = loss.gradient_hessian_product( coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength ) + if not base_loss.is_multiclass: + g4_sp, h4_sp, _ = loss.gradient_hessian( + coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength + ) assert_allclose(l1, l1_sp) assert_allclose(l1, l2_sp) @@ -131,6 +142,9 @@ def test_loss_gradients_are_the_same( assert_allclose(g1, g2_sp) assert_allclose(g1, g3_sp) assert_allclose(h3(g1), h3_sp(g1_sp)) + if not base_loss.is_multiclass: + assert_allclose(g1, g4_sp) + assert_allclose(h4 @ g4, h4_sp @ g1_sp) @pytest.mark.parametrize("base_loss", LOSSES) From 267d5707a82164347c87b132ac4eaad48c5a528b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 11 May 2022 22:54:29 +0200 Subject: [PATCH 02/97] ENH better singular hessian special solve --- sklearn/linear_model/_glm/glm.py | 62 +++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index adc60ab997126..549fc1d1799b4 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -31,6 +31,35 @@ from .._linear_loss import LinearModelLoss +def _solve_singular_cholesky(H, g): + """Find Newton step with singular hessian H. 
+ + Nocedal & Wright, Chapter 3.4 subsection + "Modified symmetric indefinite factorization" + + Parameters + ---------- + H : hessian matrix + g : gradient + + Returns + ------- + x : Newton step + H x = -g + """ + # hessian = L B L' with block diagonal B, block size <= 2 + L, B, perm = scipy.linalg.ldl(H, lower=True) + U, s, Vt = scipy.linalg.svd(B) + delta = 1e-3 # TODO: Decide on size of this number + tau = (s < delta) * (delta - s) + # F = U @ (tau[:, None] * Vt) + # hessian approximation = L (B + F) L' = L U (s + tau) Vt L' + w = scipy.linalg.solve_triangular(L, -g, lower=True) + # w = scipy.linalg.solve(B + F, w) + w = Vt.T @ (1 / (s + tau) * (U.T @ w)) + return scipy.linalg.solve_triangular(L.T, w, lower=False) + + class NewtonSolver(ABC): """Newton solver for GLMs. @@ -431,18 +460,25 @@ def update_gradient_hessian(self, X, y, sample_weight): ) def inner_solve(self): + # TODO: solve(..) may give a warning like + # LinAlgWarning: Ill-conditioned matrix (rcond=9.52447e-17): result may not + # be accurate. + # Should we treat this as error and deal with it in the except, or is it fine + # as is? try: self.coef_newton = scipy.linalg.solve( self.hessian, -self.gradient, check_finite=False, assume_a="sym" ) - except np.linalg.LinAlgError: + except np.linalg.LinAlgError as e: warnings.warn( - f"Inner solver of Newton solver {self.__class__.__name__} stumbbled " - "upon a singular matrix. Using SVD based least-squares solution " - "instead." + f"The inner solver of {self.__class__.__name__} stumbled upon a " + "singular hessian matrix. Therefore, this iteration uses a step closer" + " to a gradient descent direction. Removing collinear features of X or" + " increasing the penalization strengths may resolve this issue." + " The original Linear Algebra message was:\n" + + str(e) ) - # default lapack_driver="gelsd" is SVD based. - self.coef_newton = scipy.linalg.lstsq(self.hessian, -self.gradient)[0] + self.coef_newton = _solve_singular_cholesky(self.hessian, -self.gradient) class QRCholeskyNewtonSolver(NewtonSolver): @@ -510,14 +546,16 @@ def inner_solve(self): self.coef_newton = scipy.linalg.solve( self.hessian, -self.gradient, check_finite=False, assume_a="sym" ) - except np.linalg.LinAlgError: + except np.linalg.LinAlgError as e: warnings.warn( - f"Inner solver of Newton solver {self.__class__.__name__} stumbbled " - "upon a singular matrix. Using SVD based least-squares solution " - "instead." + f"The inner solver of {self.__class__.__name__} stumbled upon a " + "singular hessian matrix. Therefore, this iteration uses a step closer" + " to a gradient descent direction. Removing collinear features of X or" + " increasing the penalization strengths may resolve this issue." + " The original Linear Algebra message was:\n" + + str(e) ) - # default lapack_driver="gelsd" is SVD based. 
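The singular-hessian fallback above can be illustrated with a simplified spectral variant of the modified factorization: the sketch below uses `eigh` instead of the LDL' factorization (toy data, not the patch's code). Lifting eigenvalues below `delta` makes the modified matrix positive definite, so the resulting step remains a descent direction.

```python
import numpy as np

rng = np.random.default_rng(0)
B = rng.standard_normal((4, 2))
H = B @ B.T                               # symmetric PSD but singular (rank 2)
g = rng.standard_normal(4)

delta = 1e-3
s, U = np.linalg.eigh(H)
tau = (s < delta) * (delta - s)           # same lifting rule as above
step = -U @ ((U.T @ g) / (s + tau))       # solves (H + U @ diag(tau) @ U.T) step = -g
assert g @ step < 0                       # modified matrix is PD => descent direction
```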
- self.coef_newton = scipy.linalg.lstsq(self.hessian, -self.gradient)[0] + self.coef_newton = _solve_singular_cholesky(self.hessian, -self.gradient) def line_search(self, X, y, sample_weight): # Use R' instead of X From dd5a8201c188ce2d1215d89e6860eb28659ee5f3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 11 May 2022 23:00:16 +0200 Subject: [PATCH 03/97] CLN fix some typos found by reviewer --- sklearn/linear_model/_glm/glm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 549fc1d1799b4..073f20e024261 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -120,7 +120,8 @@ class NewtonSolver(ABC): Maximum number of Newton steps allowed. n_threads : int, default=1 - Number of OpenMP threads to use. + Number of OpenMP threads to use for the computation of the hessian and gradient + of the loss function. """ def __init__( @@ -274,10 +275,10 @@ def line_search(self, X, y, sample_weight): if check: break # 2.3 This is really the last resort. - # Check that sum(|gradient_{i-1}| < |sum(|gradient_{i-2}| + # Check that sum(|gradient_{i-1}|) < sum(|gradient_{i-2}|) # = has_improved_sum_abs_grad_previous - # If now sum(|gradient_{i}| >= |sum(|gradient_{i-1}|, this iteration - # made things worse and we should have stoped at i-1. + # If now sum(|gradient_{i}|) >= sum(|gradient_{i-1}|), this iteration + # made things worse and we should have stopped at i-1. check = ( has_improved_sum_abs_grad_previous and sum_abs_grad >= sum_abs_grad_previous @@ -499,7 +500,7 @@ class QRCholeskyNewtonSolver(NewtonSolver): By using X @ coef = R' @ t and ||coef||_2 = ||t||_2, we can just replace X by R', solve for t instead of coef, and finally get coef = Q @ t. - Note that t has less elements than coef if n_features > n_sampels: + Note that t has less elements than coef if n_features > n_samples: len(t) = k = min(n_samples, n_features) <= n_features = len(coef). [1] Hastie, T.J., & Tibshirani, R. (2003). Expression Arrays and the p n Problem. From bf1828db7d9fc94e97fb077e1459382c1e0504ba Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 11 May 2022 23:22:50 +0200 Subject: [PATCH 04/97] TST assert ConvergenceWarning is raised --- sklearn/linear_model/_glm/tests/test_glm.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 06b0d490304ca..47a78c0951b6b 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -4,7 +4,6 @@ from functools import partial import re -import warnings import numpy as np from numpy.testing import assert_allclose @@ -569,10 +568,9 @@ def test_warm_start(solver, fit_intercept, global_random_seed): glm1.fit(X, y) glm2 = PoissonRegressor(warm_start=True, max_iter=1, **params) - # As we intentionally set max_iter=1, the solver will issue a - # ConvergenceWarning which we here simply ignore. - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=ConvergenceWarning) + # As we intentionally set max_iter=1 such that the solver should raise a + # ConvergenceWarning. 
+ with pytest.warns(ConvergenceWarning): glm2.fit(X, y) linear_loss = LinearModelLoss( From 9783e6be2647575218ea57559065febf4cd8197e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 25 May 2022 17:32:29 +0200 Subject: [PATCH 05/97] MNT add BaseCholeskyNewtonSolver --- sklearn/linear_model/_glm/glm.py | 72 +++++++++------------ sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 2 files changed, 33 insertions(+), 41 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 073f20e024261..2c253f9872db9 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -286,7 +286,7 @@ def line_search(self, X, y, sample_weight): if is_verbose: print( " check if previously " - "sum(|gradient_{i-1}|) < sum(|gradient_{i-2}|) but now " + f"sum(|gradient_{i-1}|) < sum(|gradient_{i-2}|) but now " f"sum(|gradient_{i}|) >= sum(|gradient_{i-1}|) {check}" ) if check: @@ -431,7 +431,36 @@ def solve(self, X, y, sample_weight): return self.coef -class CholeskyNewtonSolver(NewtonSolver): +class BaseCholeskyNewtonSolver(NewtonSolver): + """Cholesky based Newton solver. + + Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear + solver. + """ + + def inner_solve(self): + # TODO: solve(..) may give a warning like + # LinAlgWarning: Ill-conditioned matrix (rcond=9.52447e-17): result may not + # be accurate. + # Should we treat this as error and deal with it in the except, or is it fine + # as is? + try: + self.coef_newton = scipy.linalg.solve( + self.hessian, -self.gradient, check_finite=False, assume_a="sym" + ) + except np.linalg.LinAlgError as e: + warnings.warn( + f"The inner solver of {self.__class__.__name__} stumbled upon a " + "singular hessian matrix. Therefore, this iteration uses a step closer" + " to a gradient descent direction. Removing collinear features of X or" + " increasing the penalization strengths may resolve this issue." + " The original Linear Algebra message was:\n" + + str(e) + ) + self.coef_newton = _solve_singular_cholesky(self.hessian, -self.gradient) + + +class CholeskyNewtonSolver(BaseCholeskyNewtonSolver): """Cholesky based Newton solver. Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear @@ -460,29 +489,8 @@ def update_gradient_hessian(self, X, y, sample_weight): raw_prediction=self.raw_prediction, # this was updated in line_search ) - def inner_solve(self): - # TODO: solve(..) may give a warning like - # LinAlgWarning: Ill-conditioned matrix (rcond=9.52447e-17): result may not - # be accurate. - # Should we treat this as error and deal with it in the except, or is it fine - # as is? - try: - self.coef_newton = scipy.linalg.solve( - self.hessian, -self.gradient, check_finite=False, assume_a="sym" - ) - except np.linalg.LinAlgError as e: - warnings.warn( - f"The inner solver of {self.__class__.__name__} stumbled upon a " - "singular hessian matrix. Therefore, this iteration uses a step closer" - " to a gradient descent direction. Removing collinear features of X or" - " increasing the penalization strengths may resolve this issue." - " The original Linear Algebra message was:\n" - + str(e) - ) - self.coef_newton = _solve_singular_cholesky(self.hessian, -self.gradient) - -class QRCholeskyNewtonSolver(NewtonSolver): +class QRCholeskyNewtonSolver(BaseCholeskyNewtonSolver): """QR and Cholesky based Newton solver. This is a good solver for n_features >> n_samples, see [1]. 
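
# Small self-contained illustration (toy data, illustrative names) of the QR
# trick described in the QRCholeskyNewtonSolver docstring above: for wide X,
# an economic QR of X.T lets the solver work with the much smaller matrix R'
# and a k-dimensional vector t instead of the full-length coef.
import numpy as np
import scipy.linalg

rng = np.random.default_rng(0)
n_samples, n_features = 5, 20                      # n_features >> n_samples
X = rng.standard_normal((n_samples, n_features))

Q, R = scipy.linalg.qr(X.T, mode="economic")       # X.T = Q @ R, Q: (n_features, k)
t = rng.standard_normal(n_samples)                 # k = n_samples parameters
coef = Q @ t                                       # map back to feature space

assert np.allclose(X @ coef, R.T @ t)              # identical linear predictions
assert np.isclose(np.linalg.norm(coef), np.linalg.norm(t))  # identical L2 penalty
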
@@ -542,22 +550,6 @@ def update_gradient_hessian(self, X, y, sample_weight): raw_prediction=self.raw_prediction, # this was updated in line_search ) - def inner_solve(self): - try: - self.coef_newton = scipy.linalg.solve( - self.hessian, -self.gradient, check_finite=False, assume_a="sym" - ) - except np.linalg.LinAlgError as e: - warnings.warn( - f"The inner solver of {self.__class__.__name__} stumbled upon a " - "singular hessian matrix. Therefore, this iteration uses a step closer" - " to a gradient descent direction. Removing collinear features of X or" - " increasing the penalization strengths may resolve this issue." - " The original Linear Algebra message was:\n" - + str(e) - ) - self.coef_newton = _solve_singular_cholesky(self.hessian, -self.gradient) - def line_search(self, X, y, sample_weight): # Use R' instead of X super().line_search(X=self.R.T, y=y, sample_weight=sample_weight) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 47a78c0951b6b..c52169c935d20 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -138,7 +138,7 @@ def glm_dataset(global_random_seed, request): sample_weight=sw, l2_reg_strength=l2_reg_strength, ) - # Note: Toot finding is more precise then minimizing a function. + # Note: Root finding is more precise then minimizing a function. res = root( fun, coef_unpenalized, From d373e634328f352a85243e02e3db0a1592aa80cd Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 25 May 2022 18:01:55 +0200 Subject: [PATCH 06/97] WIP colinear design in GLMs --- sklearn/linear_model/_glm/glm.py | 13 ++++++++----- sklearn/linear_model/_glm/tests/test_glm.py | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 073f20e024261..65a6b2c900e41 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -467,17 +467,20 @@ def inner_solve(self): # Should we treat this as error and deal with it in the except, or is it fine # as is? try: - self.coef_newton = scipy.linalg.solve( - self.hessian, -self.gradient, check_finite=False, assume_a="sym" - ) - except np.linalg.LinAlgError as e: + with warnings.catch_warnings(): + warnings.simplefilter("error", scipy.linalg.LinAlgWarning) + self.coef_newton = scipy.linalg.solve( + self.hessian, -self.gradient, check_finite=False, assume_a="sym" + ) + except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: warnings.warn( f"The inner solver of {self.__class__.__name__} stumbled upon a " "singular hessian matrix. Therefore, this iteration uses a step closer" " to a gradient descent direction. Removing collinear features of X or" " increasing the penalization strengths may resolve this issue." 
" The original Linear Algebra message was:\n" - + str(e) + + str(e), + scipy.linalg.LinAlgWarning ) self.coef_newton = _solve_singular_cholesky(self.hessian, -self.gradient) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 47a78c0951b6b..617f806af4da5 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -6,6 +6,7 @@ import re import numpy as np +import scipy from numpy.testing import assert_allclose import pytest from scipy import linalg @@ -783,3 +784,18 @@ def test_family_deprecation(est, family): else: assert est.family.__class__ == family.__class__ assert est.family.power == family.power + + +def test_linalg_warning_with_newton_solver(global_random_seed): + rng = np.random.RandomState(global_random_seed) + X_orig = rng.normal(size=(100, 3)) + X_colinear = np.hstack([X_orig] * 10) # colinear design + y = rng.normal(size=X_orig.shape[0]) + y[y < 0] = 0.0 + + with pytest.warns(None): + # No warning raised on well-conditioned design + PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_orig, y) + + with pytest.warns(scipy.linalg.LinAlgWarning) as rec: + PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_colinear, y) \ No newline at end of file From c6efcef1b8821222d5ec36183cf9f56b1e6353d9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 31 May 2022 23:03:11 +0200 Subject: [PATCH 07/97] FIX _solve_singular --- sklearn/linear_model/_glm/glm.py | 28 ++++++++++++--------- sklearn/linear_model/_glm/tests/test_glm.py | 14 ++++++++--- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 00dfb96e588bc..0b3599634997a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -31,11 +31,13 @@ from .._linear_loss import LinearModelLoss -def _solve_singular_cholesky(H, g): +def _solve_singular(H, g): """Find Newton step with singular hessian H. - Nocedal & Wright, Chapter 3.4 subsection - "Modified symmetric indefinite factorization" + We could use the approach with an L D L decomposition as in + Nocedal & Wright, Chapter 3.4 subsection + "Modified symmetric indefinite factorization" + but we use the much simpler (and more expensive?) least squares solver. Parameters ---------- @@ -48,16 +50,18 @@ def _solve_singular_cholesky(H, g): H x = -g """ # hessian = L B L' with block diagonal B, block size <= 2 - L, B, perm = scipy.linalg.ldl(H, lower=True) - U, s, Vt = scipy.linalg.svd(B) - delta = 1e-3 # TODO: Decide on size of this number - tau = (s < delta) * (delta - s) + # L[perm, :] is lower triangular + # CODE: L, B, perm = scipy.linalg.ldl(H, lower=True) + # CODE: U, s, Vt = scipy.linalg.svd(B) + # CODE: delta = 1e-3 # TODO: Decide on size of this number + # CODE: tau = (s < delta) * (delta - s) # F = U @ (tau[:, None] * Vt) # hessian approximation = L (B + F) L' = L U (s + tau) Vt L' - w = scipy.linalg.solve_triangular(L, -g, lower=True) + # CODE: w = scipy.linalg.solve_triangular(L[perm], -g[perm], lower=True) # w = scipy.linalg.solve(B + F, w) - w = Vt.T @ (1 / (s + tau) * (U.T @ w)) - return scipy.linalg.solve_triangular(L.T, w, lower=False) + # CODE: w = Vt.T @ (1 / (s + tau) * (U.T @ w)) + # CODE: return scipy.linalg.solve_triangular(L.T[:, perm], w, lower=False)[perm] + return scipy.linalg.lstsq(H, -g)[0] class NewtonSolver(ABC): @@ -458,9 +462,9 @@ def inner_solve(self): " increasing the penalization strengths may resolve this issue." 
" The original Linear Algebra message was:\n" + str(e), - scipy.linalg.LinAlgWarning + scipy.linalg.LinAlgWarning, ) - self.coef_newton = _solve_singular_cholesky(self.hessian, -self.gradient) + self.coef_newton = _solve_singular(self.hessian, -self.gradient) class CholeskyNewtonSolver(BaseCholeskyNewtonSolver): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index dc06f3f51bd56..dcae2b59a3781 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -4,6 +4,7 @@ from functools import partial import re +import warnings import numpy as np import scipy @@ -788,14 +789,19 @@ def test_family_deprecation(est, family): def test_linalg_warning_with_newton_solver(global_random_seed): rng = np.random.RandomState(global_random_seed) - X_orig = rng.normal(size=(100, 3)) + X_orig = rng.normal(size=(10, 3)) X_colinear = np.hstack([X_orig] * 10) # colinear design y = rng.normal(size=X_orig.shape[0]) y[y < 0] = 0.0 - with pytest.warns(None): + with warnings.catch_warnings(): + warnings.simplefilter("error") # No warning raised on well-conditioned design PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_orig, y) - with pytest.warns(scipy.linalg.LinAlgWarning) as rec: - PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_colinear, y) \ No newline at end of file + msg = ( + "The inner solver of CholeskyNewtonSolver stumbled upon a " + "singular hessian matrix. " + ) + with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): + PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_colinear, y) From d2063f7ed8d4915409660ec30ccf323b3c16d224 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 2 Jun 2022 18:11:19 +0200 Subject: [PATCH 08/97] FIX false unpacking in --- sklearn/linear_model/tests/test_linear_loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_linear_loss.py b/sklearn/linear_model/tests/test_linear_loss.py index c48680a282611..eb35dd8f08d65 100644 --- a/sklearn/linear_model/tests/test_linear_loss.py +++ b/sklearn/linear_model/tests/test_linear_loss.py @@ -106,7 +106,7 @@ def test_loss_grad_hess_are_the_same( coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength ) if not base_loss.is_multiclass: - g4, h4, _ = loss.gradient_hessian( + g4, h4 = loss.gradient_hessian( coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength ) @@ -132,7 +132,7 @@ def test_loss_grad_hess_are_the_same( coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength ) if not base_loss.is_multiclass: - g4_sp, h4_sp, _ = loss.gradient_hessian( + g4_sp, h4_sp = loss.gradient_hessian( coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength ) From e6684c6bd1c42074216fe6d8fe51f60caff05c2e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 3 Jun 2022 15:31:35 +0200 Subject: [PATCH 09/97] TST add tests for unpenalized GLMs --- sklearn/linear_model/_glm/tests/test_glm.py | 166 +++++++++++++++++++- 1 file changed, 163 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index dcae2b59a3781..504c523373a0f 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -94,10 +94,10 @@ def glm_dataset(global_random_seed, request): min 1/n * sum(loss) + ||w||_2^2. Last coefficient is intercept. 
""" - type, model = request.param + data_type, model = request.param # Make larger dim more than double as big as the smaller one. # This helps when constructing singular matrices like (X, X). - if type == "long": + if data_type == "long": n_samples, n_features = 12, 4 else: n_samples, n_features = 4, 12 @@ -116,7 +116,7 @@ def glm_dataset(global_random_seed, request): U1, _ = U[:, :k], U[:, k:] Vt1, _ = Vt[:k, :], Vt[k:, :] - if request.param == "long": + if data_type == "long": coef_unpenalized = rng.uniform(low=1, high=3, size=n_features) coef_unpenalized *= rng.choice([-1, 1], size=n_features) raw_prediction = X @ coef_unpenalized @@ -314,6 +314,166 @@ def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset): assert_allclose(model.coef_, coef, rtol=rtol) +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): + """Test that unpenalized GLM converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + Note: This checks the minimum norm solution for wide X, i.e. + n_samples < n_features: + min ||w||_2 subject to w minimizing the mean deviviance. + """ + model, X, y, coef, _, _ = glm_dataset + n_samples, n_features = X.shape + alpha = 0 # unpenalized + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + model = clone(model).set_params(**params) + # Note that newton-cholesky might give a warning: XXX + if fit_intercept: + X = X[:, :-1] # remove intercept + intercept = coef[-1] + coef = coef[:-1] + else: + intercept = 0 + model.fit(X, y) + + # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails + # for the wide/fat case with n_features > n_samples. The current Ridge solvers do + # NOT return the minimum norm solution with fit_intercept=True. + if n_samples > n_features or not fit_intercept: + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + else: + # As it is an underdetermined problem, prediction = y. This shows that we get + # a solution, i.e. a (non-unique) minimum of the objective function ... + assert_allclose(model.predict(X), y) + assert_allclose(model._get_loss().link.inverse(X @ coef + intercept), y) + # But it is not the minimum norm solution. (This should be equal.) + assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( + np.r_[intercept, coef] + ) + + pytest.xfail(reason="GLM does not provide the minimum norm solution.") + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_dataset): + """Test that unpenalized GLM converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + GLM fit on [X] is the same as fit on [X, X]/2. 
+ For long X, [X, X] is a singular matrix and we check against the minimum norm + solution: + min ||w||_2 subject to min deviance + """ + model, X, y, coef, _, _ = glm_dataset + n_samples, n_features = X.shape + alpha = 0 # unpenalized + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + model = clone(model).set_params(**params) + if fit_intercept: + X = X[:, :-1] # remove intercept + intercept = coef[-1] + coef = coef[:-1] + else: + intercept = 0 + X = 0.5 * np.concatenate((X, X), axis=1) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + model.fit(X, y) + + if n_samples > n_features or not fit_intercept: + assert model.intercept_ == pytest.approx(intercept) + if solver in ["newton-cholesky"]: + # Cholesky is a bad choice for singular X. + pytest.skip() + rtol = 3e-5 if solver == "lbfgs" else 1e-11 + assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol) + else: + # FIXME: Same as in test_glm_regression_unpenalized. + # As it is an underdetermined problem, prediction = y. This shows that we get + # a solution, i.e. a (non-unique) minimum of the objective function ... + assert_allclose(model.predict(X), y) + # But it is not the minimum norm solution. (This should be equal.) + assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( + np.r_[intercept, coef, coef] + ) + + pytest.xfail(reason="GLM does not provide the minimum norm solution.") + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, np.r_[coef, coef]) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_dataset): + """Test that unpenalized GLM converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + GLM fit on [X] is the same as fit on [X], [y] + [X], [y]. + For wide X, [X', X'] is a singular matrix and we check against the minimum norm + solution: + min ||w||_2 subject to min deviance + """ + model, X, y, coef, _, _ = glm_dataset + n_samples, n_features = X.shape + alpha = 0 # unpenalized + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + model = clone(model).set_params(**params) + if fit_intercept: + X = X[:, :-1] # remove intercept + intercept = coef[-1] + coef = coef[:-1] + else: + intercept = 0 + X = np.concatenate((X, X), axis=0) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + y = np.r_[y, y] + model.fit(X, y) + + if n_samples > n_features or not fit_intercept: + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + else: + # FIXME: Same as in test_glm_regression_unpenalized. + # As it is an underdetermined problem, prediction = y. This shows that we get + # a solution, i.e. a (non-unique) minimum of the objective function ... + assert_allclose(model.predict(X), y) + # But it is not the minimum norm solution. (This should be equal.) 
+ assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( + np.r_[intercept, coef] + ) + + pytest.xfail(reason="GLM does not provide the minimum norm solution.") + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef) + + def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" # scalar value but not positive From 3fb36954b75492ee94f249ce0a6638cf48d2ed64 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 10 Jun 2022 22:16:00 +0200 Subject: [PATCH 10/97] TST fix solutions of glm_dataset --- sklearn/linear_model/_glm/tests/test_glm.py | 182 +++++++++++++------- 1 file changed, 119 insertions(+), 63 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 504c523373a0f..8cc99b3d0f3db 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -3,6 +3,7 @@ # License: BSD 3 clause from functools import partial +import itertools import re import warnings @@ -11,7 +12,7 @@ from numpy.testing import assert_allclose import pytest from scipy import linalg -from scipy.optimize import root +from scipy.optimize import minimize, root from sklearn.base import clone from sklearn._loss import HalfBinomialLoss @@ -40,6 +41,29 @@ def _get_loss(self): return HalfBinomialLoss() +def is_canonical(model): + """True if model's link function is canonical to loss""" + if isinstance(model, (BinomialRegressor, PoissonRegressor)): + return True + elif isinstance(model, TweedieRegressor): + return model.power == 0 and model.link in ["auto", "identity"] + + +def _special_minimize(fun, grad, x, tol_NM, tol): + # Find good starting point by Nelder-Mead + res_NM = minimize( + fun, x, method="Nelder-Mead", options={"xatol": tol_NM, "fatol": tol_NM} + ) + # Now refine via root finding, wich is more precise then minimizing a function. + res = root( + grad, + res_NM.x, + method="lm", + options={"ftol": tol, "xtol": tol, "gtol": tol}, + ) + return res.x + + @pytest.fixture(scope="module") def regression_data(): X, y = make_regression( @@ -49,7 +73,7 @@ def regression_data(): @pytest.fixture( - params=zip( + params=itertools.product( ["long", "wide"], [ BinomialRegressor(), @@ -134,36 +158,41 @@ def glm_dataset(global_random_seed, request): # results. l2_reg_strength = 1 fun = partial( + linear_loss.loss, + X=X[:, :-1], + y=y, + sample_weight=sw, + l2_reg_strength=l2_reg_strength, + ) + grad = partial( linear_loss.gradient, X=X[:, :-1], y=y, sample_weight=sw, l2_reg_strength=l2_reg_strength, ) - # Note: Root finding is more precise then minimizing a function. 
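
# Toy, made-up example of the two-stage strategy used by _special_minimize
# above: a coarse Nelder-Mead start, then root finding on the gradient, which
# is more precise than minimizing the objective directly. The ridge objective
# and all names here are illustrative only.
import numpy as np
from scipy.optimize import minimize, root

rng = np.random.default_rng(0)
X = rng.standard_normal((20, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.standard_normal(20)
lam = 1.0

def fun(w):   # 1/2 ||X w - y||^2 + lam/2 ||w||^2
    r = X @ w - y
    return 0.5 * r @ r + 0.5 * lam * w @ w

def grad(w):
    return X.T @ (X @ w - y) + lam * w

w0 = minimize(fun, np.zeros(3), method="Nelder-Mead").x  # rough starting point
w = root(grad, w0, method="lm").x                        # high-precision refinement
assert np.linalg.norm(grad(w)) < 1e-8
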
- res = root( - fun, - coef_unpenalized, - method="lm", - options={"ftol": 1e-14, "xtol": 1e-14, "gtol": 1e-14}, + coef_penalized_with_intercept = _special_minimize( + fun, grad, coef_unpenalized, tol_NM=1e-6, tol=1e-14 ) - coef_penalized_with_intercept = res.x linear_loss = LinearModelLoss(base_loss=model._get_loss(), fit_intercept=False) fun = partial( + linear_loss.loss, + X=X[:, :-1], + y=y, + sample_weight=sw, + l2_reg_strength=l2_reg_strength, + ) + grad = partial( linear_loss.gradient, X=X[:, :-1], y=y, sample_weight=sw, l2_reg_strength=l2_reg_strength, ) - res = root( - fun, - coef_unpenalized[:-1], - method="lm", - options={"ftol": 1e-14, "xtol": 1e-14, "gtol": 1e-14}, + coef_penalized_without_intercept = _special_minimize( + fun, grad, coef_unpenalized[:-1], tol_NM=1e-6, tol=1e-14 ) - coef_penalized_without_intercept = res.x # To be sure assert np.linalg.norm(coef_penalized_with_intercept) < np.linalg.norm( @@ -208,7 +237,7 @@ def test_glm_regression(solver, fit_intercept, glm_dataset): intercept = 0 model.fit(X, y) - rtol = 3e-5 if solver == "lbfgs" else 1e-11 + rtol = 3e-5 if solver == "lbfgs" else 1e-10 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) @@ -268,7 +297,7 @@ def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): intercept = 0 model.fit(X, y) - rtol = 3e-5 if solver == "lbfgs" else 1e-11 + rtol = 3e-5 if solver == "lbfgs" else 1e-10 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol) @@ -309,11 +338,13 @@ def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset): intercept = 0 model.fit(X, y) - rtol = 3e-5 if solver == "lbfgs" else 1e-11 + rtol = 3e-5 if solver == "lbfgs" else 1e-10 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) +@pytest.mark.filterwarnings("ignore::scipy.linalg.misc.LinAlgWarning") +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") @pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): @@ -336,7 +367,6 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): ) model = clone(model).set_params(**params) - # Note that newton-cholesky might give a warning: XXX if fit_intercept: X = X[:, :-1] # remove intercept intercept = coef[-1] @@ -346,26 +376,30 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): model.fit(X, y) # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails - # for the wide/fat case with n_features > n_samples. The current Ridge solvers do + # for the wide/fat case with n_features > n_samples. Most current GLM solvers do # NOT return the minimum norm solution with fit_intercept=True. if n_samples > n_features or not fit_intercept: assert model.intercept_ == pytest.approx(intercept) - assert_allclose(model.coef_, coef) + rtol = 5e-6 if solver == "lbfgs" else 1e-7 + assert_allclose(model.coef_, coef, rtol=rtol) else: - # As it is an underdetermined problem, prediction = y. This shows that we get - # a solution, i.e. a (non-unique) minimum of the objective function ... - assert_allclose(model.predict(X), y) - assert_allclose(model._get_loss().link.inverse(X @ coef + intercept), y) - # But it is not the minimum norm solution. (This should be equal.) 
- assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( - np.r_[intercept, coef] - ) + # As it is an underdetermined problem, prediction = y. The following shows that + # we get a solution, i.e. a (non-unique) minimum of the objective function ... + if is_canonical(model): + assert_allclose(model.predict(X), y) + assert_allclose(model._get_loss().link.inverse(X @ coef + intercept), y) + if solver in ["lbfgs"]: + # But it is not the minimum norm solution. (This should be equal.) + assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > ( + 1 + 1e-12 + ) * np.linalg.norm(np.r_[intercept, coef]) + pytest.xfail(reason="GLM does not provide the minimum norm solution.") - pytest.xfail(reason="GLM does not provide the minimum norm solution.") assert model.intercept_ == pytest.approx(intercept) assert_allclose(model.coef_, coef) +@pytest.mark.filterwarnings("ignore::scipy.linalg.misc.LinAlgWarning") @pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_dataset): @@ -375,7 +409,7 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase GLM fit on [X] is the same as fit on [X, X]/2. For long X, [X, X] is a singular matrix and we check against the minimum norm solution: - min ||w||_2 subject to min deviance + min ||w||_2 subject to w = argmin deviance(w) """ model, X, y, coef, _, _ = glm_dataset n_samples, n_features = X.shape @@ -390,37 +424,55 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase model = clone(model).set_params(**params) if fit_intercept: - X = X[:, :-1] # remove intercept intercept = coef[-1] coef = coef[:-1] + if n_samples > n_features: + X = X[:, :-1] # remove intercept + X = 0.5 * np.concatenate((X, X), axis=1) + else: + # To know the minimum norm solution, we keep one intercept column and do + # not divide by 2. Later on, we must take special care. + X = np.c_[X[:, :-1], X[:, :-1], X[:, -1]] else: intercept = 0 - X = 0.5 * np.concatenate((X, X), axis=1) + X = 0.5 * np.concatenate((X, X), axis=1) assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) model.fit(X, y) + if fit_intercept and n_samples < n_features: + # Here we take special care. + model_intercept = 2 * model.intercept_ + model_coef = 2 * model.coef_[:-1] # exclude the other intercept term. + # For minimum norm solution, we would have + # assert model.intercept_ == pytest.approx(model.coef_[-1]) + else: + model_intercept = model.intercept_ + model_coef = model.coef_ + if n_samples > n_features or not fit_intercept: - assert model.intercept_ == pytest.approx(intercept) - if solver in ["newton-cholesky"]: - # Cholesky is a bad choice for singular X. - pytest.skip() - rtol = 3e-5 if solver == "lbfgs" else 1e-11 - assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol) + assert model_intercept == pytest.approx(intercept) + rtol = 3e-5 if solver == "lbfgs" else 1e-6 + assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) else: - # FIXME: Same as in test_glm_regression_unpenalized. - # As it is an underdetermined problem, prediction = y. This shows that we get - # a solution, i.e. a (non-unique) minimum of the objective function ... + # As it is an underdetermined problem, prediction = y. The following shows that + # we get a solution, i.e. a (non-unique) minimum of the objective function ... assert_allclose(model.predict(X), y) - # But it is not the minimum norm solution. (This should be equal.) 
- assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( - np.r_[intercept, coef, coef] - ) - - pytest.xfail(reason="GLM does not provide the minimum norm solution.") - assert model.intercept_ == pytest.approx(intercept) - assert_allclose(model.coef_, np.r_[coef, coef]) - - + if solver in ["lbfgs", "newton-cholesky"]: + # FIXME: Same as in test_glm_regression_unpenalized. + # But it is not the minimum norm solution. (This should be equal.) + assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > ( + 1 + 1e-12 + ) * np.linalg.norm(0.5 * np.r_[intercept, intercept, coef, coef]) + pytest.xfail( + reason=f"GLM with {solver} does not provide the minimum norm solution." + ) + + assert model_intercept == pytest.approx(intercept) + assert model.intercept_ == pytest.approx(model.coef_[-1]) + assert_allclose(model_coef, np.r_[coef, coef]) + + +@pytest.mark.filterwarnings("ignore::scipy.linalg.misc.LinAlgWarning") @pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_dataset): @@ -431,7 +483,7 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase [X], [y]. For wide X, [X', X'] is a singular matrix and we check against the minimum norm solution: - min ||w||_2 subject to min deviance + min ||w||_2 subject to w = argmin deviance(w) """ model, X, y, coef, _, _ = glm_dataset n_samples, n_features = X.shape @@ -458,20 +510,24 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase if n_samples > n_features or not fit_intercept: assert model.intercept_ == pytest.approx(intercept) - assert_allclose(model.coef_, coef) + rtol = 3e-5 if solver == "lbfgs" else 1e-6 + assert_allclose(model.coef_, coef, rtol=rtol) else: - # FIXME: Same as in test_glm_regression_unpenalized. - # As it is an underdetermined problem, prediction = y. This shows that we get - # a solution, i.e. a (non-unique) minimum of the objective function ... + # As it is an underdetermined problem, prediction = y. The following shows that + # we get a solution, i.e. a (non-unique) minimum of the objective function ... assert_allclose(model.predict(X), y) - # But it is not the minimum norm solution. (This should be equal.) - assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > np.linalg.norm( - np.r_[intercept, coef] - ) + if solver in ["lbfgs", "newton-cholesky"]: + # FIXME: Same as in test_glm_regression_unpenalized. + # But it is not the minimum norm solution. (This should be equal.) + assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > ( + 1 + 1e-12 + ) * np.linalg.norm(np.r_[intercept, coef]) + pytest.xfail( + reason=f"GLM with {solver} does not provide the minimum norm solution." 
+ ) - pytest.xfail(reason="GLM does not provide the minimum norm solution.") assert model.intercept_ == pytest.approx(intercept) - assert_allclose(model.coef_, coef) + assert_allclose(model.coef_, coef, rtol=5e-5) def test_sample_weights_validation(): From 2b6485e001c7fbe7e3b0546af4ebd0f1c08c41d5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 10 Jun 2022 22:18:58 +0200 Subject: [PATCH 11/97] ENH add SVDFallbackSolver --- sklearn/linear_model/_glm/glm.py | 189 +++++++++++++++++++++---------- 1 file changed, 131 insertions(+), 58 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0b3599634997a..bbca2573ed34f 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -13,6 +13,7 @@ import numpy as np import scipy.linalg import scipy.optimize +import scipy.sparse from ..._loss.glm_distribution import TweedieDistribution from ..._loss.loss import ( @@ -31,39 +32,6 @@ from .._linear_loss import LinearModelLoss -def _solve_singular(H, g): - """Find Newton step with singular hessian H. - - We could use the approach with an L D L decomposition as in - Nocedal & Wright, Chapter 3.4 subsection - "Modified symmetric indefinite factorization" - but we use the much simpler (and more expensive?) least squares solver. - - Parameters - ---------- - H : hessian matrix - g : gradient - - Returns - ------- - x : Newton step - H x = -g - """ - # hessian = L B L' with block diagonal B, block size <= 2 - # L[perm, :] is lower triangular - # CODE: L, B, perm = scipy.linalg.ldl(H, lower=True) - # CODE: U, s, Vt = scipy.linalg.svd(B) - # CODE: delta = 1e-3 # TODO: Decide on size of this number - # CODE: tau = (s < delta) * (delta - s) - # F = U @ (tau[:, None] * Vt) - # hessian approximation = L (B + F) L' = L U (s + tau) Vt L' - # CODE: w = scipy.linalg.solve_triangular(L[perm], -g[perm], lower=True) - # w = scipy.linalg.solve(B + F, w) - # CODE: w = Vt.T @ (1 / (s + tau) * (U.T @ w)) - # CODE: return scipy.linalg.solve_triangular(L.T[:, perm], w, lower=False)[perm] - return scipy.linalg.lstsq(H, -g)[0] - - class NewtonSolver(ABC): """Newton solver for GLMs. @@ -174,11 +142,11 @@ def setup(self, X, y, sample_weight): ) @abstractmethod - def update_gradient_hessian(X, y, sample_weight): + def update_gradient_hessian(self, X, y, sample_weight): """Update gradient and hessian.""" @abstractmethod - def inner_solve(self): + def inner_solve(self, X, y, sample_weight): """Compute Newton step. Sets self.coef_newton. @@ -385,16 +353,17 @@ def solve(self, X, y, sample_weight): # setup usually: # - initializes self.coef if needed # - initializes and calculates self.raw_predictions, self.loss_value - self.setup(X, y, sample_weight) + self.setup(X=X, y=y, sample_weight=sample_weight) self.iteration = 1 self.converged = False + self.stop = False while self.iteration <= self.max_iter and not self.converged: if self.verbose: print(f"Newton iter={self.iteration}") # 1. Update hessian and gradient - self.update_gradient_hessian(X, y, sample_weight) + self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight) # TODO: # if iteration == 1: @@ -404,26 +373,25 @@ def solve(self, X, y, sample_weight): # 2. Inner solver # Calculate Newton step/direction # This usually sets self.coef_newton. - self.inner_solve() + # It may set self.stop = True, e.g. for ill-conditioned systems. + self.inner_solve(X=X, y=y, sample_weight=sample_weight) + if self.stop: + break # 3. 
Backtracking line search # This usually sets self.coef_old, self.coef, self.loss_value_old # self.loss_value, self.gradient_old, self.gradient, # self.raw_prediction. - self.line_search(X, y, sample_weight) + self.line_search(X=X, y=y, sample_weight=sample_weight) # 4. Check convergence # Sets self.converged. - self.check_convergence( - X=X, - y=y, - sample_weight=sample_weight, - ) + self.check_convergence(X=X, y=y, sample_weight=sample_weight) # 5. Next iteration self.iteration += 1 - if not self.converged: + if not self.converged and not self.stop: warnings.warn( "Newton solver did not converge after" f" {self.iteration - 1} iterations.", @@ -431,10 +399,76 @@ def solve(self, X, y, sample_weight): ) self.iteration -= 1 - self.finalize(X, y, sample_weight) + self.finalize(X=X, y=y, sample_weight=sample_weight) return self.coef +class SVDFallbackSolver(NewtonSolver): + """SVD based fallback Newton solver. + + Inner solver for finding the Newton step H w_newton = -g uses SVD of X and is meant + for singular problems. + """ + + def setup(self, X, y, sample_weight): + super().setup(X=X, y=y, sample_weight=sample_weight) + n_samples, n_features = X.shape + n_dof = n_features + if self.linear_loss.fit_intercept: + n_dof += 1 + self.gradient = np.empty_like(self.coef) + self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof)) + + # # SVD of X: This is expensive. + # if scipy.sparse.issparse(X): + # X = X.toarray() + # if self.linear_loss.fit_intercept: + # n_samples = X.shape[0] + # X = np.c_[X, np.ones(shape=n_samples)] + # # X = U diag(s) Vt and X' = V diag(s) U' + # U, s, Vt = scipy.linalg.svd(X, full_matrices=False) + # inv_s = np.zeros_like(s) + # inv_s[s > 0] = 1 / s[s > 0] + # # Adding a small positive constant brings us closer to the minimum + # # norm solution. + self.EPS = np.sqrt(np.finfo(X.dtype).eps) + # inv_s[s <= self.EPS] = self.EPS + # # All we need to store is this almost pseudo-inverse of X.T. See below. + # # Note that inv_Xt' = inv_X + # if n_samples > n_dof: + # self.inv_Xt = U @ (inv_s[:, None] * Vt) + # else: + # self.inv_Xt = U @ (inv_s[:, None] * Vt) + + def update_gradient_hessian(self, X, y, sample_weight): + self.linear_loss.gradient_hessian( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + gradient_out=self.gradient, + hessian_out=self.hessian, + raw_prediction=self.raw_prediction, # this was updated in line_search + ) + + def inner_solve(self, X, y, sample_weight): + # hessian = self.inv_Xt @ self.hessian @ self.inv_Xt.T + # hessian[np.diag_indices_from(hessian)] += self.EPS + # gradient = self.inv_Xt @ self.gradient + + # coef_newton = scipy.linalg.solve( + # hessian, -gradient, check_finite=False, assume_a="sym" + # ) + # self.coef_newton = self.inv_Xt.T @ coef_newton + U, s, Vt = scipy.linalg.svd(self.hessian, full_matrices=False) + inv_s = np.zeros_like(s) + inv_s[s > 0] = 1 / s[s > 0] + inv_s[s <= self.EPS] = self.EPS + self.coef_newton = -Vt.T @ (inv_s * (U.T @ self.gradient)) + + class BaseCholeskyNewtonSolver(NewtonSolver): """Cholesky based Newton solver. @@ -442,29 +476,68 @@ class BaseCholeskyNewtonSolver(NewtonSolver): solver. """ - def inner_solve(self): - # TODO: solve(..) may give a warning like - # LinAlgWarning: Ill-conditioned matrix (rcond=9.52447e-17): result may not - # be accurate. - # Should we treat this as error and deal with it in the except, or is it fine - # as is? 
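
# Compact, illustrative restatement of the SVD-based fallback step computed by
# SVDFallbackSolver.inner_solve above: invert the hessian through its SVD with
# small singular values floored, so a singular hessian still yields a finite
# step. Toy data; a sketch, not the PR's final behaviour.
import numpy as np
import scipy.linalg

def svd_fallback_step(hessian, gradient, eps=None):
    if eps is None:
        eps = np.sqrt(np.finfo(hessian.dtype).eps)
    U, s, Vt = scipy.linalg.svd(hessian, full_matrices=False)
    inv_s = np.full_like(s, eps)                 # floor for (near-)zero singular values
    np.divide(1.0, s, out=inv_s, where=s > eps)  # ordinary inverse elsewhere
    return -Vt.T @ (inv_s * (U.T @ gradient))

H = np.array([[1.0, 1.0], [1.0, 1.0]])           # rank-deficient "hessian"
g = np.array([0.5, -0.2])
step = svd_fallback_step(H, g)
assert np.isfinite(step).all()
assert g @ step < 0                              # still a descent direction
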
+ def setup(self, X, y, sample_weight): + super().setup(X=X, y=y, sample_weight=sample_weight) + + def inner_solve(self, X, y, sample_weight): try: with warnings.catch_warnings(): warnings.simplefilter("error", scipy.linalg.LinAlgWarning) self.coef_newton = scipy.linalg.solve( self.hessian, -self.gradient, check_finite=False, assume_a="sym" ) + return except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: warnings.warn( - f"The inner solver of {self.__class__.__name__} stumbled upon a " - "singular hessian matrix. Therefore, this iteration uses a step closer" - " to a gradient descent direction. Removing collinear features of X or" - " increasing the penalization strengths may resolve this issue." - " The original Linear Algebra message was:\n" + f"The inner solver of {self.__class__.__name__} stumbled upon a" + " singular hessian matrix. This is dealt with a SVD of X which may" + " slow down fitting time and use excessive memory. It is best to" + " avoid such situations in the first place. Possible remedies are" + " removing collinear features of X or increasing the penalization" + " strengths. The original Linear Algebra message was:\n" + str(e), scipy.linalg.LinAlgWarning, ) - self.coef_newton = _solve_singular(self.hessian, -self.gradient) + # Possible causes: + # 1. hess_pointwise is negative. But this is already taken care in + # LinearModelLoss such that min(hess_pointwise) >= 0. + # 2. X is singular + # This might be the most probable cause. + # We assume X singular and proceed with the slogan: + # BETTER SAFE THAN EFFICIENT. + # There are many possible ways to deal with this situation (most of + # them adding, explicit or implicit, a matrix to the hessian to make it + # positive definite), confer to Chapter 3.4 of Nocedal & Wright 2nd ed. + # Instead, we employ the structure of the problem and do once an + # economic SVD of X. + if self.verbose >= 1: + print( + "The inner solver detected a singular hessian matrix.\n From " + "here on, we switch to the SVDFallbackSolver which uses a safer " + "method to find a Newton step based on a SVD of X. " + "Sparse X will be densified for this purpose." + ) + # We stop self.solve(...) early + self.stop = True + + def solve(self, X, y, sample_weight): + super().solve(X=X, y=y, sample_weight=sample_weight) + if not self.stop: + return self.coef + else: + # Fallback solver method for singular problems. + if self.verbose >= 1: + print("Call SVDFallbackSolver.solve(..)") + SVD_solver = SVDFallbackSolver( + coef=np.zeros_like(self.coef), + linear_loss=self.linear_loss, + l2_reg_strength=self.l2_reg_strength, + tol=self.tol, + max_iter=self.max_iter, + n_threads=self.n_threads, + verbose=self.verbose, + ) + return SVD_solver.solve(X=X, y=y, sample_weight=sample_weight) class CholeskyNewtonSolver(BaseCholeskyNewtonSolver): From 59989f3ffc5eb6b07560fd1f1040e4c938deff5f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 11 Jun 2022 10:38:39 +0200 Subject: [PATCH 12/97] CLN remove SVDFallbackSolver --- sklearn/linear_model/_glm/glm.py | 112 +++---------------------------- 1 file changed, 8 insertions(+), 104 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index bbca2573ed34f..4a1b0c2908f61 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -391,7 +391,7 @@ def solve(self, X, y, sample_weight): # 5. 
Next iteration self.iteration += 1 - if not self.converged and not self.stop: + if not self.converged: warnings.warn( "Newton solver did not converge after" f" {self.iteration - 1} iterations.", @@ -403,72 +403,6 @@ def solve(self, X, y, sample_weight): return self.coef -class SVDFallbackSolver(NewtonSolver): - """SVD based fallback Newton solver. - - Inner solver for finding the Newton step H w_newton = -g uses SVD of X and is meant - for singular problems. - """ - - def setup(self, X, y, sample_weight): - super().setup(X=X, y=y, sample_weight=sample_weight) - n_samples, n_features = X.shape - n_dof = n_features - if self.linear_loss.fit_intercept: - n_dof += 1 - self.gradient = np.empty_like(self.coef) - self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof)) - - # # SVD of X: This is expensive. - # if scipy.sparse.issparse(X): - # X = X.toarray() - # if self.linear_loss.fit_intercept: - # n_samples = X.shape[0] - # X = np.c_[X, np.ones(shape=n_samples)] - # # X = U diag(s) Vt and X' = V diag(s) U' - # U, s, Vt = scipy.linalg.svd(X, full_matrices=False) - # inv_s = np.zeros_like(s) - # inv_s[s > 0] = 1 / s[s > 0] - # # Adding a small positive constant brings us closer to the minimum - # # norm solution. - self.EPS = np.sqrt(np.finfo(X.dtype).eps) - # inv_s[s <= self.EPS] = self.EPS - # # All we need to store is this almost pseudo-inverse of X.T. See below. - # # Note that inv_Xt' = inv_X - # if n_samples > n_dof: - # self.inv_Xt = U @ (inv_s[:, None] * Vt) - # else: - # self.inv_Xt = U @ (inv_s[:, None] * Vt) - - def update_gradient_hessian(self, X, y, sample_weight): - self.linear_loss.gradient_hessian( - coef=self.coef, - X=X, - y=y, - sample_weight=sample_weight, - l2_reg_strength=self.l2_reg_strength, - n_threads=self.n_threads, - gradient_out=self.gradient, - hessian_out=self.hessian, - raw_prediction=self.raw_prediction, # this was updated in line_search - ) - - def inner_solve(self, X, y, sample_weight): - # hessian = self.inv_Xt @ self.hessian @ self.inv_Xt.T - # hessian[np.diag_indices_from(hessian)] += self.EPS - # gradient = self.inv_Xt @ self.gradient - - # coef_newton = scipy.linalg.solve( - # hessian, -gradient, check_finite=False, assume_a="sym" - # ) - # self.coef_newton = self.inv_Xt.T @ coef_newton - U, s, Vt = scipy.linalg.svd(self.hessian, full_matrices=False) - inv_s = np.zeros_like(s) - inv_s[s > 0] = 1 / s[s > 0] - inv_s[s <= self.EPS] = self.EPS - self.coef_newton = -Vt.T @ (inv_s * (U.T @ self.gradient)) - - class BaseCholeskyNewtonSolver(NewtonSolver): """Cholesky based Newton solver. @@ -490,55 +424,25 @@ def inner_solve(self, X, y, sample_weight): except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: warnings.warn( f"The inner solver of {self.__class__.__name__} stumbled upon a" - " singular hessian matrix. This is dealt with a SVD of X which may" - " slow down fitting time and use excessive memory. It is best to" - " avoid such situations in the first place. Possible remedies are" - " removing collinear features of X or increasing the penalization" - " strengths. The original Linear Algebra message was:\n" + " singular hessian matrix and stopped. Your options are to use another" + " solver or to avoid such situations in the first place. Possible " + " remedies are removing collinear features of X or increasing the " + "penalization strengths. The original Linear Algebra message was:\n" + str(e), scipy.linalg.LinAlgWarning, ) # Possible causes: # 1. hess_pointwise is negative. 
But this is already taken care in # LinearModelLoss such that min(hess_pointwise) >= 0. - # 2. X is singular + # 2. X is singular or ill-conditioned # This might be the most probable cause. - # We assume X singular and proceed with the slogan: - # BETTER SAFE THAN EFFICIENT. + # # There are many possible ways to deal with this situation (most of # them adding, explicit or implicit, a matrix to the hessian to make it # positive definite), confer to Chapter 3.4 of Nocedal & Wright 2nd ed. - # Instead, we employ the structure of the problem and do once an - # economic SVD of X. - if self.verbose >= 1: - print( - "The inner solver detected a singular hessian matrix.\n From " - "here on, we switch to the SVDFallbackSolver which uses a safer " - "method to find a Newton step based on a SVD of X. " - "Sparse X will be densified for this purpose." - ) - # We stop self.solve(...) early + # We have throw this above warning an just stop. self.stop = True - def solve(self, X, y, sample_weight): - super().solve(X=X, y=y, sample_weight=sample_weight) - if not self.stop: - return self.coef - else: - # Fallback solver method for singular problems. - if self.verbose >= 1: - print("Call SVDFallbackSolver.solve(..)") - SVD_solver = SVDFallbackSolver( - coef=np.zeros_like(self.coef), - linear_loss=self.linear_loss, - l2_reg_strength=self.l2_reg_strength, - tol=self.tol, - max_iter=self.max_iter, - n_threads=self.n_threads, - verbose=self.verbose, - ) - return SVD_solver.solve(X=X, y=y, sample_weight=sample_weight) - class CholeskyNewtonSolver(BaseCholeskyNewtonSolver): """Cholesky based Newton solver. From d463817e97bc0d1a820c83c6fbec179c6d2b7fe0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 11 Jun 2022 12:20:41 +0200 Subject: [PATCH 13/97] ENH use gradient step for singular hessians --- sklearn/linear_model/_glm/glm.py | 45 +++++++--- sklearn/linear_model/_glm/tests/test_glm.py | 92 ++++++++++++--------- 2 files changed, 87 insertions(+), 50 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 4a1b0c2908f61..70b8341691fea 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -200,6 +200,9 @@ def line_search(self, X, y, sample_weight): n_threads=self.n_threads, raw_prediction=raw, ) + # Note: If coef_newton is too large, loss_gradient may produce inf values, + # potentially accompanied by a RuntimeWarning. + # This case will be captured by the Armijo condition. # 1. Check Armijo / sufficient decrease condition. # The smaller (more negative) the better. @@ -412,6 +415,7 @@ class BaseCholeskyNewtonSolver(NewtonSolver): def setup(self, X, y, sample_weight): super().setup(X=X, y=y, sample_weight=sample_weight) + self.count_singular = 0 def inner_solve(self, X, y, sample_weight): try: @@ -422,15 +426,23 @@ def inner_solve(self, X, y, sample_weight): ) return except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: - warnings.warn( - f"The inner solver of {self.__class__.__name__} stumbled upon a" - " singular hessian matrix and stopped. Your options are to use another" - " solver or to avoid such situations in the first place. Possible " - " remedies are removing collinear features of X or increasing the " - "penalization strengths. The original Linear Algebra message was:\n" - + str(e), - scipy.linalg.LinAlgWarning, - ) + if self.count_singular == 0: + # We only need to throw this warning once. 
+ warnings.warn( + f"The inner solver of {self.__class__.__name__} stumbled upon a" + " singular or very ill-conditioned hessian matrix. It will now try" + " a simple gradient step." + " Note that this warning is only raised once, the problem may, " + " however, occur in several or all iterations. Set verbose >= 1" + " to get more information.\n" + "Your options are to use another solver or to avoid such situation" + " in the first place. Possible remedies are removing collinear" + " features of X or increasing the penalization strengths.\n" + "The original Linear Algebra message was:\n" + + str(e), + scipy.linalg.LinAlgWarning, + ) + self.count_singular += 1 # Possible causes: # 1. hess_pointwise is negative. But this is already taken care in # LinearModelLoss such that min(hess_pointwise) >= 0. @@ -440,8 +452,21 @@ def inner_solve(self, X, y, sample_weight): # There are many possible ways to deal with this situation (most of # them adding, explicit or implicit, a matrix to the hessian to make it # positive definite), confer to Chapter 3.4 of Nocedal & Wright 2nd ed. + # Instead, we resort to a simple gradient step, taking the diagonal part + # of the hessian. + if self.verbose: + print( + " The inner solver stumbled upon an singular or ill-conditioned " + "hessian matrix and resorts to a simple gradient step." + ) + # We add 1e-3 to the diagonal hessian part to make in invertible and to + # restrict coef_newton to at most ~1e3. The line search considerst step + # sizes until 1e-6 * newton_step ~1e-3 * newton_step. + # Deviding by self.iteration ensures (slow) convergence. + eps = 1e-3 / self.iteration + self.coef_newton = -self.gradient / (np.diag(self.hessian) + eps) # We have throw this above warning an just stop. - self.stop = True + # self.stop = True class CholeskyNewtonSolver(BaseCholeskyNewtonSolver): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 8cc99b3d0f3db..5dd53a984f7b5 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -41,14 +41,6 @@ def _get_loss(self): return HalfBinomialLoss() -def is_canonical(model): - """True if model's link function is canonical to loss""" - if isinstance(model, (BinomialRegressor, PoissonRegressor)): - return True - elif isinstance(model, TweedieRegressor): - return model.power == 0 and model.link in ["auto", "identity"] - - def _special_minimize(fun, grad, x, tol_NM, tol): # Find good starting point by Nelder-Mead res_NM = minimize( @@ -343,8 +335,10 @@ def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset): assert_allclose(model.coef_, coef, rtol=rtol) +# TweedieRegressor(link='log', power=0) raises RuntimeWarning @pytest.mark.filterwarnings("ignore::scipy.linalg.misc.LinAlgWarning") @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): @@ -378,28 +372,33 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails # for the wide/fat case with n_features > n_samples. Most current GLM solvers do # NOT return the minimum norm solution with fit_intercept=True. 
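
# The FIXME above compares the fitted coefficients against the minimum L2 norm
# solution of an underdetermined problem. For a plain linear system that is
# exactly what np.linalg.lstsq returns, and adding any null-space direction
# keeps the predictions unchanged while strictly increasing the norm. Toy
# illustration with made-up data (not from the test suite):
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((4, 12))                 # wide: X @ w = y is underdetermined
y = rng.standard_normal(4)

w_min = np.linalg.lstsq(X, y, rcond=None)[0]     # minimum norm interpolating solution
null_dir = np.linalg.svd(X)[2][-1]               # right singular vector in the null space
w_other = w_min + null_dir                       # another exact solution

assert np.allclose(X @ w_min, y)
assert np.allclose(X @ w_other, y)
assert np.linalg.norm(w_other) > np.linalg.norm(w_min)
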
- if n_samples > n_features or not fit_intercept: + rtol = 5e-6 if solver == "lbfgs" else 1e-7 + if n_samples > n_features: assert model.intercept_ == pytest.approx(intercept) - rtol = 5e-6 if solver == "lbfgs" else 1e-7 assert_allclose(model.coef_, coef, rtol=rtol) else: # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... - if is_canonical(model): - assert_allclose(model.predict(X), y) - assert_allclose(model._get_loss().link.inverse(X @ coef + intercept), y) - if solver in ["lbfgs"]: - # But it is not the minimum norm solution. (This should be equal.) - assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > ( - 1 + 1e-12 - ) * np.linalg.norm(np.r_[intercept, coef]) - pytest.xfail(reason="GLM does not provide the minimum norm solution.") + assert_allclose(model.predict(X), y) + assert_allclose(model._get_loss().link.inverse(X @ coef + intercept), y) + if (solver in ["lbfgs", "newton-qr-cholesky"] and fit_intercept) or solver in [ + "newton-cholesky" + ]: + # But it is not the minimum norm solution. Otherwise the norms would be + # equal. + norm_solution = (1 + 1e-12) * np.linalg.norm(np.r_[intercept, coef]) + norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) + assert norm_model > norm_solution + pytest.xfail( + reason=f"GLM with {solver} does not provide the minimum norm solution." + ) assert model.intercept_ == pytest.approx(intercept) - assert_allclose(model.coef_, coef) + assert_allclose(model.coef_, coef, rtol=rtol) @pytest.mark.filterwarnings("ignore::scipy.linalg.misc.LinAlgWarning") +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") @pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_dataset): @@ -449,30 +448,39 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase model_intercept = model.intercept_ model_coef = model.coef_ - if n_samples > n_features or not fit_intercept: + rtol = 3e-5 if solver == "lbfgs" else 1e-6 + if n_samples > n_features: assert model_intercept == pytest.approx(intercept) - rtol = 3e-5 if solver == "lbfgs" else 1e-6 assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) else: # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... assert_allclose(model.predict(X), y) - if solver in ["lbfgs", "newton-cholesky"]: + if (solver in ["lbfgs", "newton-qr-cholesky"] and fit_intercept) or solver in [ + "newton-cholesky" + ]: # FIXME: Same as in test_glm_regression_unpenalized. - # But it is not the minimum norm solution. (This should be equal.) - assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > ( - 1 + 1e-12 - ) * np.linalg.norm(0.5 * np.r_[intercept, intercept, coef, coef]) + # But it is not the minimum norm solution. Otherwise the norms would be + # equal. + norm_solution = (1 + 1e-12) * np.linalg.norm( + 0.5 * np.r_[intercept, intercept, coef, coef] + ) + norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) + assert norm_model > norm_solution pytest.xfail( reason=f"GLM with {solver} does not provide the minimum norm solution." 
) + if fit_intercept: + assert model.intercept_ == pytest.approx(model.coef_[-1]) assert model_intercept == pytest.approx(intercept) - assert model.intercept_ == pytest.approx(model.coef_[-1]) - assert_allclose(model_coef, np.r_[coef, coef]) + assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) +# TweedieRegressor(link='log', power=0) raises RuntimeWarning @pytest.mark.filterwarnings("ignore::scipy.linalg.misc.LinAlgWarning") +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_dataset): @@ -508,26 +516,29 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase y = np.r_[y, y] model.fit(X, y) - if n_samples > n_features or not fit_intercept: + rtol = 3e-5 if solver == "lbfgs" else 1e-6 + if n_samples > n_features: assert model.intercept_ == pytest.approx(intercept) - rtol = 3e-5 if solver == "lbfgs" else 1e-6 assert_allclose(model.coef_, coef, rtol=rtol) else: # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... assert_allclose(model.predict(X), y) - if solver in ["lbfgs", "newton-cholesky"]: + if (solver in ["lbfgs", "newton-qr-cholesky"] and fit_intercept) or solver in [ + "newton-cholesky" + ]: # FIXME: Same as in test_glm_regression_unpenalized. - # But it is not the minimum norm solution. (This should be equal.) - assert np.linalg.norm(np.r_[model.intercept_, model.coef_]) > ( - 1 + 1e-12 - ) * np.linalg.norm(np.r_[intercept, coef]) + # But it is not the minimum norm solution. Otherwise the norms would be + # equal. + norm_solution = (1 + 1e-12) * np.linalg.norm(np.r_[intercept, coef]) + norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) + assert norm_model > norm_solution pytest.xfail( reason=f"GLM with {solver} does not provide the minimum norm solution." ) assert model.intercept_ == pytest.approx(intercept) - assert_allclose(model.coef_, coef, rtol=5e-5) + assert_allclose(model.coef_, coef, rtol=rtol) def test_sample_weights_validation(): @@ -1006,7 +1017,7 @@ def test_family_deprecation(est, family): def test_linalg_warning_with_newton_solver(global_random_seed): rng = np.random.RandomState(global_random_seed) X_orig = rng.normal(size=(10, 3)) - X_colinear = np.hstack([X_orig] * 10) # colinear design + X_colinear = np.hstack([X_orig] * 10) # collinear design y = rng.normal(size=X_orig.shape[0]) y[y < 0] = 0.0 @@ -1016,8 +1027,9 @@ def test_linalg_warning_with_newton_solver(global_random_seed): PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_orig, y) msg = ( - "The inner solver of CholeskyNewtonSolver stumbled upon a " - "singular hessian matrix. " + "The inner solver of CholeskyNewtonSolver stumbled upon a" + " singular or very ill-conditioned hessian matrix. It will now try" + " a simple gradient step." 
) with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_colinear, y) From 8a108bb88969341514d0730dd6b68dace5c269b3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 11 Jun 2022 12:50:26 +0200 Subject: [PATCH 14/97] ENH print iteration number in warnings --- sklearn/linear_model/_glm/glm.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 70b8341691fea..122715a0be546 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -287,8 +287,9 @@ def line_search(self, X, y, sample_weight): t *= beta else: warnings.warn( - f"Line search of Newton solver {self.__class__.__name__} did not " - "converge after 21 line search refinement iterations.", + f"Line search of Newton solver {self.__class__.__name__} at iteration " + "#{self.iteration} did no converge after 21 line search refinement " + "iterations.", ConvergenceWarning, ) @@ -430,8 +431,8 @@ def inner_solve(self, X, y, sample_weight): # We only need to throw this warning once. warnings.warn( f"The inner solver of {self.__class__.__name__} stumbled upon a" - " singular or very ill-conditioned hessian matrix. It will now try" - " a simple gradient step." + " singular or very ill-conditioned hessian matrix at iteration " + " #{self.iteration}. It will now try a simple gradient step." " Note that this warning is only raised once, the problem may, " " however, occur in several or all iterations. Set verbose >= 1" " to get more information.\n" From 82287af585a89d68b1dcf9190b37663c40b7327c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 11 Jun 2022 12:51:05 +0200 Subject: [PATCH 15/97] TST improve test_linalg_warning_with_newton_solver --- sklearn/linear_model/_glm/tests/test_glm.py | 28 +++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 5dd53a984f7b5..e226016fd56fc 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -29,7 +29,7 @@ from sklearn.linear_model._glm import _GeneralizedLinearRegressor from sklearn.linear_model._linear_loss import LinearModelLoss from sklearn.exceptions import ConvergenceWarning -from sklearn.metrics import d2_tweedie_score +from sklearn.metrics import d2_tweedie_score, mean_poisson_deviance from sklearn.model_selection import train_test_split @@ -1017,19 +1017,33 @@ def test_family_deprecation(est, family): def test_linalg_warning_with_newton_solver(global_random_seed): rng = np.random.RandomState(global_random_seed) X_orig = rng.normal(size=(10, 3)) - X_colinear = np.hstack([X_orig] * 10) # collinear design + X_collinear = np.hstack([X_orig] * 10) # collinear design y = rng.normal(size=X_orig.shape[0]) y[y < 0] = 0.0 + # No warning raised on well-conditioned design, even without regularization. 
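# Standalone sketch (toy data, illustrative only, not part of the patch code paths
# themselves) of the failure mode this test exercises and of the fallback pattern
# loosely mirrored from the warning message: on a collinear design the unpenalized
# Hessian X'X is singular, the inner linear solve warns or fails, and a simple
# scaled gradient step can be taken instead.
import warnings
import numpy as np
import scipy.linalg

rng = np.random.RandomState(0)
X_orig = rng.normal(size=(10, 3))
X_collinear = np.hstack([X_orig] * 10)     # rank 3 instead of 30
g = rng.normal(size=X_collinear.shape[1])  # stand-in for the gradient
H = X_collinear.T @ X_collinear            # singular Hessian (alpha=0)

with warnings.catch_warnings():
    warnings.simplefilter("error", scipy.linalg.LinAlgWarning)
    try:
        step = scipy.linalg.solve(H, -g, assume_a="sym")
    except (scipy.linalg.LinAlgWarning, scipy.linalg.LinAlgError):
        step = -g / (np.diag(H) + 1e-3)    # simple gradient step fallback
print(np.linalg.matrix_rank(H))            # 3, far below n_features=30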
with warnings.catch_warnings(): warnings.simplefilter("error") - # No warning raised on well-conditioned design - PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_orig, y) + reg = PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_orig, y) + reference_deviance = mean_poisson_deviance(y, reg.predict(X_orig)) + # Fitting on collinear data without regularization should raise an + # informative warning: msg = ( "The inner solver of CholeskyNewtonSolver stumbled upon a" - " singular or very ill-conditioned hessian matrix. It will now try" - " a simple gradient step." + " singular or very ill-conditioned hessian matrix" ) with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): - PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_colinear, y) + PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_collinear, y) + + msg = "Newton solver did not converge after.*iterations." + with pytest.warns(ConvergenceWarning, match=msg): + PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_collinear, y) + + # Increasing the regularization slightly should make the problem go away: + reg = PoissonRegressor(solver="newton-cholesky", alpha=1e-12).fit(X_collinear, y) + + # Since we use a small penalty, the deviance of the predictions should still + # be almost the same. + this_deviance = mean_poisson_deviance(y, reg.predict(X_collinear)) + assert this_deviance == pytest.approx(reference_deviance) From 9868a13001193bfde5bffca75441ad879c490f1e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 11 Jun 2022 14:34:05 +0200 Subject: [PATCH 16/97] CLN LinAlgWarning fron scipy.linalg --- sklearn/linear_model/_glm/tests/test_glm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index e226016fd56fc..4bb44a0131bfc 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -336,7 +336,7 @@ def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset): # TweedieRegressor(link='log', power=0) raises RuntimeWarning -@pytest.mark.filterwarnings("ignore::scipy.linalg.misc.LinAlgWarning") +@pytest.mark.filterwarnings("ignore::scipy.linalg.LinAlgWarning") @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") @pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize("solver", SOLVERS) @@ -397,7 +397,7 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): assert_allclose(model.coef_, coef, rtol=rtol) -@pytest.mark.filterwarnings("ignore::scipy.linalg.misc.LinAlgWarning") +@pytest.mark.filterwarnings("ignore::scipy.linalg.LinAlgWarning") @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") @pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) @@ -478,7 +478,7 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase # TweedieRegressor(link='log', power=0) raises RuntimeWarning -@pytest.mark.filterwarnings("ignore::scipy.linalg.misc.LinAlgWarning") +@pytest.mark.filterwarnings("ignore::scipy.linalg.LinAlgWarning") @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") @pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize("solver", SOLVERS) From e3a26276539f642c01f347be81201658111067d7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 12 Jun 2022 19:47:04 +0200 Subject: [PATCH 17/97] ENH more 
robust hessian --- sklearn/linear_model/_glm/glm.py | 4 ++-- sklearn/linear_model/_glm/tests/test_glm.py | 16 ++++++++-------- sklearn/linear_model/_linear_loss.py | 9 +++++++-- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 122715a0be546..a49187f41ba94 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -361,7 +361,7 @@ def solve(self, X, y, sample_weight): self.iteration = 1 self.converged = False - self.stop = False + self.stop = False # Can be used by inner_solve to stop iteration. while self.iteration <= self.max_iter and not self.converged: if self.verbose: @@ -432,7 +432,7 @@ def inner_solve(self, X, y, sample_weight): warnings.warn( f"The inner solver of {self.__class__.__name__} stumbled upon a" " singular or very ill-conditioned hessian matrix at iteration " - " #{self.iteration}. It will now try a simple gradient step." + f"#{self.iteration}. It will now try a simple gradient step." " Note that this warning is only raised once, the problem may, " " however, occur in several or all iterations. Set verbose >= 1" " to get more information.\n" diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 4bb44a0131bfc..f6b52477cfd57 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -335,10 +335,6 @@ def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset): assert_allclose(model.coef_, coef, rtol=rtol) -# TweedieRegressor(link='log', power=0) raises RuntimeWarning -@pytest.mark.filterwarnings("ignore::scipy.linalg.LinAlgWarning") -@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") -@pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): @@ -367,6 +363,10 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): coef = coef[:-1] else: intercept = 0 + + if n_samples < n_features: + warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) model.fit(X, y) # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails @@ -477,10 +477,6 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) -# TweedieRegressor(link='log', power=0) raises RuntimeWarning -@pytest.mark.filterwarnings("ignore::scipy.linalg.LinAlgWarning") -@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") -@pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_dataset): @@ -514,6 +510,10 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase X = np.concatenate((X, X), axis=0) assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) y = np.r_[y, y] + + if n_samples < n_features: + warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) model.fit(X, y) rtol = 3e-5 if solver == "lbfgs" else 1e-6 diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py 
index 68c79d316598f..d14b2cfb94b5d 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -428,8 +428,13 @@ def gradient_hessian( ) # For non-canonical link functions and far away from the optimum, we take - # care that the hessian is not negative. - hess_pointwise[hess_pointwise <= 0] = 0 + # care that the hessian is positive. + positive_hess = hess_pointwise[hess_pointwise > 0] + if len(positive_hess) > 0: + min_hess = np.amin(positive_hess) + else: + min_hess = 1 + hess_pointwise[hess_pointwise <= 0] = min_hess if not self.base_loss.is_multiclass: # gradient From 0276cd98a43f1f8b3a51f090eae704a3859b138f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 13 Jun 2022 08:19:32 +0200 Subject: [PATCH 18/97] ENH increase maxls for lbfgs to make it more robust --- sklearn/linear_model/_glm/glm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index a49187f41ba94..ebd40bf7f1ad8 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -852,8 +852,8 @@ def fit(self, X, y, sample_weight=None): jac=True, options={ "maxiter": self.max_iter, - "maxls": 30, # default is 20 - "iprint": (self.verbose > 0) - 1, + "maxls": 40, # default is 20 + "iprint": self.verbose - 1, "gtol": self.tol, "ftol": 64 * np.finfo(np.float64).eps, # lbfgs is float64 land. }, From b4294728dee937561979cceefcf175fa78b4b644 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 13 Jun 2022 13:49:34 +0200 Subject: [PATCH 19/97] ENH add hessian_warning for too many negative hessian values --- sklearn/linear_model/_linear_loss.py | 22 +++++++++++-------- .../linear_model/tests/test_linear_loss.py | 4 ++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index d14b2cfb94b5d..3193e229fa616 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -411,6 +411,9 @@ def gradient_hessian( hessian : ndarray Hessian matrix. + + hessian_warning : bool + True if pointwise hessian has more than half of its elements non-positive. """ n_samples, n_features = X.shape n_dof = n_features + int(self.fit_intercept) @@ -427,14 +430,10 @@ def gradient_hessian( n_threads=n_threads, ) - # For non-canonical link functions and far away from the optimum, we take - # care that the hessian is positive. - positive_hess = hess_pointwise[hess_pointwise > 0] - if len(positive_hess) > 0: - min_hess = np.amin(positive_hess) - else: - min_hess = 1 - hess_pointwise[hess_pointwise <= 0] = min_hess + # For non-canonical link functions and far away from the optimum, the pointwise + # hessian can be negative. We take care that the hessian is positive. + hessian_warning = np.sum(hess_pointwise <= 0) > len(hess_pointwise) / 2 + hess_pointwise = np.abs(hess_pointwise) if not self.base_loss.is_multiclass: # gradient @@ -451,6 +450,11 @@ def gradient_hessian( hess = np.empty(shape=(n_dof, n_dof), dtype=weights.dtype) else: hess = hessian_out + + if hessian_warning: + # Exit early without computing the hessian. + return grad, hess, hessian_warning + # TODO: This "sandwich product", X' diag(W) X, can be greatly improved by # a dedicated Cython routine. if sparse.issparse(X): @@ -487,7 +491,7 @@ def gradient_hessian( # cross-entropy. 
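# Standalone sketch (made-up numbers) of the pointwise hessian handling introduced
# here: for non-canonical links the pointwise hessian can have negative entries far
# from the optimum; taking absolute values keeps X' diag(h) X positive
# semi-definite, and a flag records whether the problem was widespread (more than
# half of the entries non-positive).
import numpy as np

hess_pointwise = np.array([0.3, -0.1, 0.7, -0.4, 0.2])
hessian_warning = np.sum(hess_pointwise <= 0) > len(hess_pointwise) / 2
hess_pointwise = np.abs(hess_pointwise)
print(hessian_warning)  # False: only 2 of 5 entries were non-positive
print(hess_pointwise)   # [0.3 0.1 0.7 0.4 0.2]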
raise NotImplementedError - return grad, hess + return grad, hess, hessian_warning def gradient_hessian_product( self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1 diff --git a/sklearn/linear_model/tests/test_linear_loss.py b/sklearn/linear_model/tests/test_linear_loss.py index eb35dd8f08d65..c48680a282611 100644 --- a/sklearn/linear_model/tests/test_linear_loss.py +++ b/sklearn/linear_model/tests/test_linear_loss.py @@ -106,7 +106,7 @@ def test_loss_grad_hess_are_the_same( coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength ) if not base_loss.is_multiclass: - g4, h4 = loss.gradient_hessian( + g4, h4, _ = loss.gradient_hessian( coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength ) @@ -132,7 +132,7 @@ def test_loss_grad_hess_are_the_same( coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength ) if not base_loss.is_multiclass: - g4_sp, h4_sp = loss.gradient_hessian( + g4_sp, h4_sp, _ = loss.gradient_hessian( coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength ) From a85f2512c448a792516bfad811faa06601457a35 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 13 Jun 2022 13:58:02 +0200 Subject: [PATCH 20/97] CLN some warning messages --- sklearn/linear_model/_glm/glm.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index ebd40bf7f1ad8..0892c137a8137 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -232,8 +232,8 @@ def line_search(self, X, y, sample_weight): check = sum_abs_grad < sum_abs_grad_old if is_verbose: print( - " check sum(|gradient|) <= sum(|gradient_old|): " - f"{sum_abs_grad} <= {sum_abs_grad_old} {check}" + " check sum(|gradient|) < sum(|gradient_old|): " + f"{sum_abs_grad} < {sum_abs_grad_old} {check}" ) if check: break @@ -288,7 +288,7 @@ def line_search(self, X, y, sample_weight): else: warnings.warn( f"Line search of Newton solver {self.__class__.__name__} at iteration " - "#{self.iteration} did no converge after 21 line search refinement " + f"#{self.iteration} did no converge after 21 line search refinement " "iterations.", ConvergenceWarning, ) @@ -433,7 +433,7 @@ def inner_solve(self, X, y, sample_weight): f"The inner solver of {self.__class__.__name__} stumbled upon a" " singular or very ill-conditioned hessian matrix at iteration " f"#{self.iteration}. It will now try a simple gradient step." - " Note that this warning is only raised once, the problem may, " + " Note that this warning is only raised once, the problem may," " however, occur in several or all iterations. Set verbose >= 1" " to get more information.\n" "Your options are to use another solver or to avoid such situation" @@ -463,11 +463,8 @@ def inner_solve(self, X, y, sample_weight): # We add 1e-3 to the diagonal hessian part to make in invertible and to # restrict coef_newton to at most ~1e3. The line search considerst step # sizes until 1e-6 * newton_step ~1e-3 * newton_step. - # Deviding by self.iteration ensures (slow) convergence. - eps = 1e-3 / self.iteration + eps = 1e-3 self.coef_newton = -self.gradient / (np.diag(self.hessian) + eps) - # We have throw this above warning an just stop. 
- # self.stop = True class CholeskyNewtonSolver(BaseCholeskyNewtonSolver): From c9b120063574bdd9d408805ce96c34ebe7722f81 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 13 Jun 2022 14:05:32 +0200 Subject: [PATCH 21/97] ENH add lbfgs_step --- sklearn/linear_model/_glm/glm.py | 45 ++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0892c137a8137..e92621aa8ba1e 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -152,6 +152,51 @@ def inner_solve(self, X, y, sample_weight): Sets self.coef_newton. """ + def lbfgs_step(self, X, y, sample_weight): + """Fallback for inner solver. + + Use 4 lbfgs steps. As in line_search sets: + - self.coef_old + - self.coef + - self.loss_value_old + - self.loss_value + - self.gradient_old + - self.gradient + - self.raw_prediction + As in inner_solver sets: + - self.coef_newton + """ + self.coef_old = self.coef + self.loss_value_old = self.loss_value + self.gradient_old = self.gradient + + opt_res = scipy.optimize.minimize( + self.linear_loss.loss_gradient, + self.coef, + method="L-BFGS-B", + jac=True, + options={ + "maxiter": 4, + "maxls": 40, # default is 20 + "iprint": self.verbose - 2, + "gtol": self.tol, + "ftol": 64 * np.finfo(np.float64).eps, # lbfgs is float64 land. + }, + args=(X, y, sample_weight, self.l2_reg_strength, self.n_threads), + ) + self.coef = opt_res.x + self.coef_newton = self.coef - self.coef_old + _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X) + self.loss_value, self.gradient = self.linear_loss.loss_gradient( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + raw_prediction=self.raw_prediction, + ) + def line_search(self, X, y, sample_weight): """Backtracking line search. From 2f0ea15a2089861170b744e8a531708100c9ff88 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 13 Jun 2022 14:17:52 +0200 Subject: [PATCH 22/97] ENH use lbfgs_step for hessian_warning --- sklearn/linear_model/_glm/glm.py | 38 +++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index e92621aa8ba1e..bb391498a375c 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -411,6 +411,9 @@ def solve(self, X, y, sample_weight): while self.iteration <= self.max_iter and not self.converged: if self.verbose: print(f"Newton iter={self.iteration}") + + self.use_lbfgs_step = False # Fallback for inner_solve. + # 1. Update hessian and gradient self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight) @@ -426,12 +429,15 @@ def solve(self, X, y, sample_weight): self.inner_solve(X=X, y=y, sample_weight=sample_weight) if self.stop: break + if self.use_lbfgs_step: + self.lbfgs_step(X=X, y=y, sample_weight=sample_weight) # 3. Backtracking line search # This usually sets self.coef_old, self.coef, self.loss_value_old # self.loss_value, self.gradient_old, self.gradient, # self.raw_prediction. - self.line_search(X=X, y=y, sample_weight=sample_weight) + if not self.use_lbfgs_step: + self.line_search(X=X, y=y, sample_weight=sample_weight) # 4. Check convergence # Sets self.converged. 
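# Schematic, standalone sketch of the iteration pattern that NewtonSolver.solve
# follows, on a generic convex objective rather than sklearn's LinearModelLoss
# (the toy loss below is an assumption for illustration): update gradient and
# Hessian, solve H @ step = -g, backtrack until an Armijo-type condition holds,
# then check convergence on the gradient.
import numpy as np

def newton_minimize(fun, grad, hess, x0, tol=1e-10, max_iter=100):
    x = np.asarray(x0, dtype=float)
    for _ in range(max_iter):
        g, H = grad(x), hess(x)
        if np.max(np.abs(g)) <= tol:           # check convergence
            break
        step = np.linalg.solve(H, -g)          # inner solve
        t, f_old = 1.0, fun(x)
        for _ in range(21):                    # backtracking line search
            if fun(x + t * step) <= f_old + 1e-4 * t * (g @ step):
                break
            t *= 0.5
        x = x + t * step
    return x

# Toy Poisson-type example: y is exactly reachable, so the gradient vanishes at
# the true coefficients and Newton recovers them.
rng = np.random.RandomState(0)
X = rng.normal(size=(20, 3))
w_true = np.array([0.5, -0.2, 0.1])
y = np.exp(X @ w_true)
fun = lambda w: np.sum(np.exp(X @ w)) - y @ (X @ w)
grad = lambda w: X.T @ (np.exp(X @ w) - y)
hess = lambda w: X.T @ (np.exp(X @ w)[:, None] * X)
print(newton_minimize(fun, grad, hess, np.zeros(3)))  # ~ [0.5, -0.2, 0.1]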
@@ -462,8 +468,30 @@ class BaseCholeskyNewtonSolver(NewtonSolver): def setup(self, X, y, sample_weight): super().setup(X=X, y=y, sample_weight=sample_weight) self.count_singular = 0 + self.count_hessian_warning = 0 def inner_solve(self, X, y, sample_weight): + if self.hessian_warning: + if self.count_bad_hessian == 0: + # We only need to throw this warning once. + warnings.warn( + f"The inner solver of {self.__class__.__name__} detected a " + " pointwise hessian with many negative values at iteration " + f"#{self.iteration}. It will now try a lbfgs step." + " Note that this warning is only raised once, the problem may," + " however, occur in several or all iterations. Set verbose >= 1" + " to get more information.\n", + ConvergenceWarning, + ) + self.count_hessian_warning += 1 + if self.verbose: + print( + " The inner solver detected a pointwise hessian with many " + "negative values and resorts to a lbfgs step." + ) + self.use_lbfgs_step = True + return + try: with warnings.catch_warnings(): warnings.simplefilter("error", scipy.linalg.LinAlgWarning) @@ -529,7 +557,7 @@ def setup(self, X, y, sample_weight): self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof)) def update_gradient_hessian(self, X, y, sample_weight): - self.linear_loss.gradient_hessian( + _, _, self.hessian_warning = self.linear_loss.gradient_hessian( coef=self.coef, X=X, y=y, @@ -590,7 +618,7 @@ def setup(self, X, y, sample_weight): def update_gradient_hessian(self, X, y, sample_weight): # Use R' instead of X - self.linear_loss.gradient_hessian( + _, _, self.hessian_warning = self.linear_loss.gradient_hessian( coef=self.coef, X=self.R.T, y=y, @@ -602,6 +630,10 @@ def update_gradient_hessian(self, X, y, sample_weight): raw_prediction=self.raw_prediction, # this was updated in line_search ) + def lbfgs_step(self, X, y, sample_weight): + # Use R' instead of X + super().lbfgs_step(X=self.R.T, y=y, sample_weight=sample_weight) + def line_search(self, X, y, sample_weight): # Use R' instead of X super().line_search(X=self.R.T, y=y, sample_weight=sample_weight) From 9ce6cf23e964f304ae4d69e28db8cc2ff047b54d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 13 Jun 2022 14:20:11 +0200 Subject: [PATCH 23/97] TST make them pass --- sklearn/linear_model/_glm/tests/test_glm.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index f6b52477cfd57..b26de17c95b8b 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -71,8 +71,8 @@ def regression_data(): BinomialRegressor(), PoissonRegressor(), GammaRegressor(), - TweedieRegressor(power=3.0), - TweedieRegressor(power=0, link="log"), + # TweedieRegressor(power=3.0), # too difficult + # TweedieRegressor(power=0, link="log"), # too difficult TweedieRegressor(power=1.5), ], ) @@ -129,6 +129,7 @@ def glm_dataset(global_random_seed, request): X[:, -1] = 1 # last columns acts as intercept U, s, Vt = linalg.svd(X) assert np.all(s) > 1e-3 # to be sure + assert np.max(s) / np.min(s) < 100 # condition number U1, _ = U[:, :k], U[:, k:] Vt1, _ = Vt[:k, :], Vt[k:, :] @@ -227,9 +228,16 @@ def test_glm_regression(solver, fit_intercept, glm_dataset): else: coef = coef_without_intercept intercept = 0 + + if solver in ["newton-cholesky", "newton-qr-cholesky"]: + warnings.filterwarnings( + action="ignore", + message=".*pointwise hessian to have many non-positive values.*", + category=ConvergenceWarning, + ) 
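# Standalone sketch (plain least squares on toy data, an assumption for
# illustration) of the idea behind the "use R' instead of X" trick in the QR based
# solver: for wide X, factor X' = Q R, work on the small matrix R', and map the
# result back with Q, which also yields the minimum norm solution.
import numpy as np
from scipy.linalg import qr, lstsq

rng = np.random.RandomState(0)
X = rng.normal(size=(4, 12))       # wide data: n_samples < n_features
y = rng.normal(size=4)

Q, R = qr(X.T, mode="economic")    # X' = Q R, Q is (12, 4), R is (4, 4)
v, *_ = lstsq(R.T, y)              # fit on the small (4, 4) design R'
w = Q @ v                          # map back to the original feature space
print(np.allclose(w, np.linalg.pinv(X) @ y))  # True: minimum norm solution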
model.fit(X, y) - rtol = 3e-5 if solver == "lbfgs" else 1e-10 + rtol = 3e-5 if solver == "lbfgs" else 1e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) @@ -265,7 +273,7 @@ def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): solver == "lbfgs" and fit_intercept is False and ( - isinstance(model, BinomialRegressor) + isinstance(model, (BinomialRegressor, GammaRegressor, TweedieRegressor)) or (isinstance(model, PoissonRegressor) and n_features > n_samples) ) ): @@ -289,7 +297,7 @@ def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): intercept = 0 model.fit(X, y) - rtol = 3e-5 if solver == "lbfgs" else 1e-10 + rtol = 1e-4 if solver == "lbfgs" else 1e-10 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol) From 221f61140320420411a6b4b6015a59a63f4268f3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 13 Jun 2022 20:28:01 +0200 Subject: [PATCH 24/97] TST tweek rtol for lbfgs --- sklearn/linear_model/_glm/tests/test_glm.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index b26de17c95b8b..ba0912ff669ef 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -380,15 +380,17 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails # for the wide/fat case with n_features > n_samples. Most current GLM solvers do # NOT return the minimum norm solution with fit_intercept=True. - rtol = 5e-6 if solver == "lbfgs" else 1e-7 + rtol = 5e-5 if solver == "lbfgs" else 1e-7 if n_samples > n_features: assert model.intercept_ == pytest.approx(intercept) assert_allclose(model.coef_, coef, rtol=rtol) else: # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... - assert_allclose(model.predict(X), y) - assert_allclose(model._get_loss().link.inverse(X @ coef + intercept), y) + assert_allclose(model.predict(X), y, rtol=1e-6) + assert_allclose( + model._get_loss().link.inverse(X @ coef + intercept), y, rtol=5e-7 + ) if (solver in ["lbfgs", "newton-qr-cholesky"] and fit_intercept) or solver in [ "newton-cholesky" ]: @@ -456,14 +458,14 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase model_intercept = model.intercept_ model_coef = model.coef_ - rtol = 3e-5 if solver == "lbfgs" else 1e-6 + rtol = 6e-5 if solver == "lbfgs" else 1e-6 if n_samples > n_features: assert model_intercept == pytest.approx(intercept) - assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) + assert_allclose(model_coef, np.r_[coef, coef], rtol=1e-4) else: # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... 
- assert_allclose(model.predict(X), y) + assert_allclose(model.predict(X), y, rtol=1e-6) if (solver in ["lbfgs", "newton-qr-cholesky"] and fit_intercept) or solver in [ "newton-cholesky" ]: @@ -524,14 +526,14 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase warnings.filterwarnings("ignore", category=ConvergenceWarning) model.fit(X, y) - rtol = 3e-5 if solver == "lbfgs" else 1e-6 + rtol = 5e-5 if solver == "lbfgs" else 1e-6 if n_samples > n_features: assert model.intercept_ == pytest.approx(intercept) assert_allclose(model.coef_, coef, rtol=rtol) else: # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... - assert_allclose(model.predict(X), y) + assert_allclose(model.predict(X), y, rtol=1e-6) if (solver in ["lbfgs", "newton-qr-cholesky"] and fit_intercept) or solver in [ "newton-cholesky" ]: From aa81fb5b8dc482bc331bad59bc87a5e7f325fe11 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Jun 2022 00:01:57 +0200 Subject: [PATCH 25/97] TST add rigoros test for GLMs --- sklearn/linear_model/_glm/tests/test_glm.py | 511 +++++++++++++++++++- 1 file changed, 507 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index d29fde2eb30d7..fb53a1c983d54 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -2,25 +2,60 @@ # # License: BSD 3 clause +from functools import partial +import itertools import re +import warnings + import numpy as np +import scipy from numpy.testing import assert_allclose import pytest -import warnings +from scipy import linalg +from scipy.optimize import minimize, root from sklearn.base import clone +from sklearn._loss import HalfBinomialLoss from sklearn._loss.glm_distribution import TweedieDistribution from sklearn._loss.link import IdentityLink, LogLink -from sklearn.datasets import make_regression +from sklearn.datasets import make_low_rank_matrix, make_regression +from sklearn.linear_model import ( + GammaRegressor, + PoissonRegressor, + Ridge, + TweedieRegressor, +) from sklearn.linear_model._glm import _GeneralizedLinearRegressor -from sklearn.linear_model import TweedieRegressor, PoissonRegressor, GammaRegressor -from sklearn.linear_model import Ridge +from sklearn.linear_model._linear_loss import LinearModelLoss from sklearn.exceptions import ConvergenceWarning from sklearn.metrics import d2_tweedie_score from sklearn.model_selection import train_test_split +SOLVERS = ["lbfgs"] + + +class BinomialRegressor(_GeneralizedLinearRegressor): + def _get_loss(self): + return HalfBinomialLoss() + + +def _special_minimize(fun, grad, x, tol_NM, tol): + # Find good starting point by Nelder-Mead + res_NM = minimize( + fun, x, method="Nelder-Mead", options={"xatol": tol_NM, "fatol": tol_NM} + ) + # Now refine via root finding, wich is more precise then minimizing a function. 
+ res = root( + grad, + res_NM.x, + method="lm", + options={"ftol": tol, "xtol": tol, "gtol": tol}, + ) + return res.x + + @pytest.fixture(scope="module") def regression_data(): X, y = make_regression( @@ -29,6 +64,474 @@ def regression_data(): return X, y +@pytest.fixture( + params=itertools.product( + ["long", "wide"], + [ + BinomialRegressor(), + PoissonRegressor(), + GammaRegressor(), + # TweedieRegressor(power=3.0), # too difficult + # TweedieRegressor(power=0, link="log"), # too difficult + TweedieRegressor(power=1.5), + ], + ) +) +def glm_dataset(global_random_seed, request): + """Dataset with GLM solutions, well conditioned X. + + This is inspired by ols_ridge_dataset in test_ridge.py. + + The construction is based on the SVD decomposition of X = U S V'. + + Parameters + ---------- + type : {"long", "wide"} + If "long", then n_samples > n_features. + If "wide", then n_features > n_samples. + model : a GLM model + + For "wide", we return the minimum norm solution w = X' (XX')^-1 y: + + min ||w||_2 subject to X w = y + + Returns + ------- + model : GLM model + X : ndarray + Last column of 1, i.e. intercept. + y : ndarray + coef_unpenalized : ndarray + Minimum norm solutions, i.e. min sum(loss(w)) (with mininum ||w||_2 in + case of ambiguity) + Last coefficient is intercept. + coef_penalized : ndarray + GLM solution with alpha=l2_reg_strength=1, i.e. + min 1/n * sum(loss) + ||w||_2^2. + Last coefficient is intercept. + """ + data_type, model = request.param + # Make larger dim more than double as big as the smaller one. + # This helps when constructing singular matrices like (X, X). + if data_type == "long": + n_samples, n_features = 12, 4 + else: + n_samples, n_features = 4, 12 + k = min(n_samples, n_features) + rng = np.random.RandomState(global_random_seed) + X = make_low_rank_matrix( + n_samples=n_samples, + n_features=n_features, + effective_rank=k, + tail_strength=0.1, + random_state=rng, + ) + X[:, -1] = 1 # last columns acts as intercept + U, s, Vt = linalg.svd(X) + assert np.all(s) > 1e-3 # to be sure + assert np.max(s) / np.min(s) < 100 # condition number + U1, _ = U[:, :k], U[:, k:] + Vt1, _ = Vt[:k, :], Vt[k:, :] + + if data_type == "long": + coef_unpenalized = rng.uniform(low=1, high=3, size=n_features) + coef_unpenalized *= rng.choice([-1, 1], size=n_features) + raw_prediction = X @ coef_unpenalized + else: + raw_prediction = rng.uniform(low=-3, high=3, size=n_samples) + # w = X'(XX')^-1 y = V s^-1 U' y + coef_unpenalized = Vt1.T @ np.diag(1 / s) @ U1.T @ raw_prediction + + linear_loss = LinearModelLoss(base_loss=model._get_loss(), fit_intercept=True) + sw = np.full(shape=n_samples, fill_value=1 / n_samples) + y = linear_loss.base_loss.link.inverse(raw_prediction) + + # Add penalty l2_reg_strength * ||coef||_2^2 for l2_reg_strength=1 and solve with + # optimizer. Note that the problem is well conditioned such that we get accurate + # results. 
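# Standalone sketch (toy quadratic objective, illustrative only) of the two-stage
# refinement used by _special_minimize: Nelder-Mead gets into the neighbourhood of
# the optimum, then root finding on the gradient polishes the solution to much
# higher precision.
import numpy as np
from scipy.optimize import minimize, root

A = np.array([[3.0, 1.0], [1.0, 2.0]])
b = np.array([1.0, -1.0])
fun = lambda x: 0.5 * x @ A @ x - b @ x
grad = lambda x: A @ x - b

res_nm = minimize(fun, np.zeros(2), method="Nelder-Mead",
                  options={"xatol": 1e-6, "fatol": 1e-6})
res = root(grad, res_nm.x, method="lm")
print(res.x, np.linalg.solve(A, b))  # both ~ the exact minimizer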
+ l2_reg_strength = 1 + fun = partial( + linear_loss.loss, + X=X[:, :-1], + y=y, + sample_weight=sw, + l2_reg_strength=l2_reg_strength, + ) + grad = partial( + linear_loss.gradient, + X=X[:, :-1], + y=y, + sample_weight=sw, + l2_reg_strength=l2_reg_strength, + ) + coef_penalized_with_intercept = _special_minimize( + fun, grad, coef_unpenalized, tol_NM=1e-6, tol=1e-14 + ) + + linear_loss = LinearModelLoss(base_loss=model._get_loss(), fit_intercept=False) + fun = partial( + linear_loss.loss, + X=X[:, :-1], + y=y, + sample_weight=sw, + l2_reg_strength=l2_reg_strength, + ) + grad = partial( + linear_loss.gradient, + X=X[:, :-1], + y=y, + sample_weight=sw, + l2_reg_strength=l2_reg_strength, + ) + coef_penalized_without_intercept = _special_minimize( + fun, grad, coef_unpenalized[:-1], tol_NM=1e-6, tol=1e-14 + ) + + # To be sure + assert np.linalg.norm(coef_penalized_with_intercept) < np.linalg.norm( + coef_unpenalized + ) + + return ( + model, + X, + y, + coef_unpenalized, + coef_penalized_with_intercept, + coef_penalized_without_intercept, + ) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [False, True]) +def test_glm_regression(solver, fit_intercept, glm_dataset): + """Test that GLM converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + """ + model, X, y, _, coef_with_intercept, coef_without_intercept = glm_dataset + alpha = 1.0 # because glm_dataset uses this. + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + model = clone(model).set_params(**params) + X = X[:, :-1] # remove intercept + if fit_intercept: + coef = coef_with_intercept + intercept = coef[-1] + coef = coef[:-1] + else: + coef = coef_without_intercept + intercept = 0 + + model.fit(X, y) + + rtol = 3e-5 + assert model.intercept_ == pytest.approx(intercept, rel=rtol) + assert_allclose(model.coef_, coef, rtol=rtol) + + # Same with sample_weight. + model = ( + clone(model).set_params(**params).fit(X, y, sample_weight=np.ones(X.shape[0])) + ) + assert model.intercept_ == pytest.approx(intercept, rel=rtol) + assert_allclose(model.coef_, coef, rtol=rtol) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): + """Test that GLM converges for all solvers to correct solution on hstacked data. + + We work with a simple constructed data set with known solution. + Fit on [X] with alpha is the same as fit on [X, X]/2 with alpha/2. + For long X, [X, X] is a singular matrix. + """ + model, X, y, _, coef_with_intercept, coef_without_intercept = glm_dataset + n_samples, n_features = X.shape + alpha = 1.0 # because glm_dataset uses this. + params = dict( + alpha=alpha / 2, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + if not fit_intercept: + # Line search cannot locate an adequate point after MAXLS + # function and gradient evaluations. + # Previous x, f and g restored. + # Possible causes: 1 error in function or gradient evaluation; + # 2 rounding error dominate computation. 
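# Standalone sketch (fixed seed chosen for illustration) of the design matrix
# construction used by the glm_dataset fixture: a well-conditioned low rank matrix
# from sklearn's helper, with an all-ones last column acting as intercept.
import numpy as np
from scipy import linalg
from sklearn.datasets import make_low_rank_matrix

n_samples, n_features = 12, 4
X = make_low_rank_matrix(
    n_samples=n_samples,
    n_features=n_features,
    effective_rank=min(n_samples, n_features),
    tail_strength=0.1,
    random_state=0,
)
X[:, -1] = 1  # last column acts as intercept
s = linalg.svd(X, compute_uv=False)
print(s.min(), s.max() / s.min())  # smallest singular value and condition number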
+ pytest.xfail() + + model = clone(model).set_params(**params) + X = X[:, :-1] # remove intercept + X = 0.5 * np.concatenate((X, X), axis=1) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features - 1) + if fit_intercept: + coef = coef_with_intercept + intercept = coef[-1] + coef = coef[:-1] + else: + coef = coef_without_intercept + intercept = 0 + model.fit(X, y) + + rtol = 1e-4 + assert model.intercept_ == pytest.approx(intercept, rel=rtol) + assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset): + """Test that GLM converges for all solvers to correct solution on vstacked data. + + We work with a simple constructed data set with known solution. + Fit on [X] with alpha is the same as fit on [X], [y] + [X], [y] with 1 * alpha. + It is the same alpha as the average loss stays the same. + For wide X, [X', X'] is a singular matrix. + """ + model, X, y, _, coef_with_intercept, coef_without_intercept = glm_dataset + n_samples, n_features = X.shape + alpha = 1.0 # because glm_dataset uses this. + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + model = clone(model).set_params(**params) + X = X[:, :-1] # remove intercept + X = np.concatenate((X, X), axis=0) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + y = np.r_[y, y] + if fit_intercept: + coef = coef_with_intercept + intercept = coef[-1] + coef = coef[:-1] + else: + coef = coef_without_intercept + intercept = 0 + model.fit(X, y) + + rtol = 3e-5 + assert model.intercept_ == pytest.approx(intercept, rel=rtol) + assert_allclose(model.coef_, coef, rtol=rtol) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): + """Test that unpenalized GLM converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + Note: This checks the minimum norm solution for wide X, i.e. + n_samples < n_features: + min ||w||_2 subject to w minimizing the mean deviviance. + """ + model, X, y, coef, _, _ = glm_dataset + n_samples, n_features = X.shape + alpha = 0 # unpenalized + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + model = clone(model).set_params(**params) + if fit_intercept: + X = X[:, :-1] # remove intercept + intercept = coef[-1] + coef = coef[:-1] + else: + intercept = 0 + + if n_samples < n_features: + warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) + model.fit(X, y) + + # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails + # for the wide/fat case with n_features > n_samples. Most current GLM solvers do + # NOT return the minimum norm solution with fit_intercept=True. + rtol = 5e-5 + if n_samples > n_features: + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef, rtol=rtol) + else: + # As it is an underdetermined problem, prediction = y. The following shows that + # we get a solution, i.e. a (non-unique) minimum of the objective function ... 
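# Standalone sketch (toy numbers) of the "minimum norm solution" notion used in the
# wide-data branches of these tests: any coefficient vector that interpolates the
# targets minimizes the deviance, the pseudoinverse gives the interpolant with the
# smallest L2 norm, and every other interpolating solution has a strictly larger
# norm.
import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(4, 12))               # n_samples < n_features
z = rng.normal(size=4)                     # target raw predictions
w_min = np.linalg.pinv(X) @ z              # minimum norm interpolant
w_other = w_min + np.linalg.svd(X)[2][-1]  # add a null-space direction of X
print(np.allclose(X @ w_min, z), np.allclose(X @ w_other, z))          # True True
print(np.linalg.norm(w_other) > (1 + 1e-12) * np.linalg.norm(w_min))   # True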
+ assert_allclose(model.predict(X), y, rtol=1e-6) + assert_allclose( + model._get_loss().link.inverse(X @ coef + intercept), y, rtol=5e-7 + ) + if fit_intercept: + # But it is not the minimum norm solution. Otherwise the norms would be + # equal. + norm_solution = (1 + 1e-12) * np.linalg.norm(np.r_[intercept, coef]) + norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) + assert norm_model > norm_solution + pytest.xfail( + reason=f"GLM with {solver} does not provide the minimum norm solution." + ) + + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef, rtol=rtol) + + +@pytest.mark.filterwarnings("ignore::scipy.linalg.LinAlgWarning") +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_dataset): + """Test that unpenalized GLM converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + GLM fit on [X] is the same as fit on [X, X]/2. + For long X, [X, X] is a singular matrix and we check against the minimum norm + solution: + min ||w||_2 subject to w = argmin deviance(w) + """ + model, X, y, coef, _, _ = glm_dataset + n_samples, n_features = X.shape + alpha = 0 # unpenalized + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + model = clone(model).set_params(**params) + if fit_intercept: + intercept = coef[-1] + coef = coef[:-1] + if n_samples > n_features: + X = X[:, :-1] # remove intercept + X = 0.5 * np.concatenate((X, X), axis=1) + else: + # To know the minimum norm solution, we keep one intercept column and do + # not divide by 2. Later on, we must take special care. + X = np.c_[X[:, :-1], X[:, :-1], X[:, -1]] + else: + intercept = 0 + X = 0.5 * np.concatenate((X, X), axis=1) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + model.fit(X, y) + + if fit_intercept and n_samples < n_features: + # Here we take special care. + model_intercept = 2 * model.intercept_ + model_coef = 2 * model.coef_[:-1] # exclude the other intercept term. + # For minimum norm solution, we would have + # assert model.intercept_ == pytest.approx(model.coef_[-1]) + else: + model_intercept = model.intercept_ + model_coef = model.coef_ + + rtol = 6e-5 + if n_samples > n_features: + assert model_intercept == pytest.approx(intercept) + assert_allclose(model_coef, np.r_[coef, coef], rtol=1e-4) + else: + # As it is an underdetermined problem, prediction = y. The following shows that + # we get a solution, i.e. a (non-unique) minimum of the objective function ... + assert_allclose(model.predict(X), y, rtol=1e-6) + if fit_intercept: + # FIXME: Same as in test_glm_regression_unpenalized. + # But it is not the minimum norm solution. Otherwise the norms would be + # equal. + norm_solution = (1 + 1e-12) * np.linalg.norm( + 0.5 * np.r_[intercept, intercept, coef, coef] + ) + norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) + assert norm_model > norm_solution + pytest.xfail( + reason=f"GLM with {solver} does not provide the minimum norm solution." 
+ ) + + if fit_intercept: + assert model.intercept_ == pytest.approx(model.coef_[-1]) + assert model_intercept == pytest.approx(intercept) + assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) + + +@pytest.mark.parametrize("solver", SOLVERS) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_dataset): + """Test that unpenalized GLM converges for all solvers to correct solution. + + We work with a simple constructed data set with known solution. + GLM fit on [X] is the same as fit on [X], [y] + [X], [y]. + For wide X, [X', X'] is a singular matrix and we check against the minimum norm + solution: + min ||w||_2 subject to w = argmin deviance(w) + """ + model, X, y, coef, _, _ = glm_dataset + n_samples, n_features = X.shape + alpha = 0 # unpenalized + params = dict( + alpha=alpha, + fit_intercept=fit_intercept, + solver=solver, + tol=1e-12, + max_iter=1000, + ) + + model = clone(model).set_params(**params) + if fit_intercept: + X = X[:, :-1] # remove intercept + intercept = coef[-1] + coef = coef[:-1] + else: + intercept = 0 + X = np.concatenate((X, X), axis=0) + assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + y = np.r_[y, y] + + if n_samples < n_features: + warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) + model.fit(X, y) + + rtol = 5e-5 + if n_samples > n_features: + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef, rtol=rtol) + else: + # As it is an underdetermined problem, prediction = y. The following shows that + # we get a solution, i.e. a (non-unique) minimum of the objective function ... + assert_allclose(model.predict(X), y, rtol=1e-6) + if fit_intercept: + # FIXME: Same as in test_glm_regression_unpenalized. + # But it is not the minimum norm solution. Otherwise the norms would be + # equal. + norm_solution = (1 + 1e-12) * np.linalg.norm(np.r_[intercept, coef]) + norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) + assert norm_model > norm_solution + pytest.xfail( + reason=f"GLM with {solver} does not provide the minimum norm solution." 
+ ) + + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef, rtol=rtol) + + def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" # scalar value but not positive From cd06ba76b1a24e1d6325ebfb923fb6b22ff45461 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Jun 2022 00:06:02 +0200 Subject: [PATCH 26/97] TST improve test_warm_start --- sklearn/linear_model/_glm/tests/test_glm.py | 72 ++++++++++++++------- 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index fb53a1c983d54..c26659b9f7284 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -214,7 +214,7 @@ def test_glm_regression(solver, fit_intercept, glm_dataset): params = dict( alpha=alpha, fit_intercept=fit_intercept, - solver=solver, + # solver=solver, # only lbfgs available tol=1e-12, max_iter=1000, ) @@ -231,7 +231,7 @@ def test_glm_regression(solver, fit_intercept, glm_dataset): model.fit(X, y) - rtol = 3e-5 + rtol = 5e-5 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) @@ -258,7 +258,7 @@ def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): params = dict( alpha=alpha / 2, fit_intercept=fit_intercept, - solver=solver, + # solver=solver, # only lbfgs available tol=1e-12, max_iter=1000, ) @@ -306,7 +306,7 @@ def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset): params = dict( alpha=alpha, fit_intercept=fit_intercept, - solver=solver, + # solver=solver, # only lbfgs available tol=1e-12, max_iter=1000, ) @@ -346,7 +346,7 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): params = dict( alpha=alpha, fit_intercept=fit_intercept, - solver=solver, + # solver=solver, # only lbfgs available tol=1e-12, max_iter=1000, ) @@ -411,7 +411,7 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase params = dict( alpha=alpha, fit_intercept=fit_intercept, - solver=solver, + # solver=solver, # only lbfgs available tol=1e-12, max_iter=1000, ) @@ -488,7 +488,7 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase params = dict( alpha=alpha, fit_intercept=fit_intercept, - solver=solver, + # solver=solver, # only lbfgs available tol=1e-12, max_iter=1000, ) @@ -767,38 +767,60 @@ def test_glm_log_regression(fit_intercept, estimator): assert_allclose(res.coef_, coef, rtol=2e-6) +@pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) -def test_warm_start(fit_intercept): - n_samples, n_features = 110, 10 +def test_warm_start(solver, fit_intercept, global_random_seed): + n_samples, n_features = 100, 10 X, y = make_regression( n_samples=n_samples, n_features=n_features, n_informative=n_features - 2, - noise=0.5, - random_state=42, + bias=fit_intercept * 1.0, + noise=1.0, + random_state=global_random_seed, ) + y = np.abs(y) # Poisson requires non-negative targets. 
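# Standalone usage sketch of the warm start behaviour this test checks, using only
# the public PoissonRegressor API (synthetic data and fixed seed are assumptions;
# how closely the two fits agree depends on the tolerances).
import warnings
import numpy as np
from sklearn.linear_model import PoissonRegressor

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 10))
y = np.exp(X @ rng.uniform(-0.3, 0.3, size=10))  # non-negative targets

cold = PoissonRegressor(max_iter=1000, tol=1e-10).fit(X, y)
warm = PoissonRegressor(warm_start=True, max_iter=1, tol=1e-10)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # a single lbfgs iteration cannot converge
    warm.fit(X, y)
warm.set_params(max_iter=1000).fit(X, y)  # resumes from the previous coef_
print(np.allclose(cold.coef_, warm.coef_, rtol=1e-4))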
+ params = {"solver": solver, "fit_intercept": fit_intercept, "tol": 1e-10} - glm1 = _GeneralizedLinearRegressor( - warm_start=False, fit_intercept=fit_intercept, max_iter=1000 - ) + glm1 = PoissonRegressor(warm_start=False, max_iter=1000, **params) glm1.fit(X, y) - glm2 = _GeneralizedLinearRegressor( - warm_start=True, fit_intercept=fit_intercept, max_iter=1 - ) - # As we intentionally set max_iter=1, L-BFGS-B will issue a - # ConvergenceWarning which we here simply ignore. - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=ConvergenceWarning) + glm2 = PoissonRegressor(warm_start=True, max_iter=1, **params) + # As we intentionally set max_iter=1 such that the solver should raise a + # ConvergenceWarning. + with pytest.warns(ConvergenceWarning): glm2.fit(X, y) - assert glm1.score(X, y) > glm2.score(X, y) + + linear_loss = LinearModelLoss( + base_loss=glm1._get_loss(), + fit_intercept=fit_intercept, + ) + sw = np.full_like(y, fill_value=1 / n_samples) + + objective_glm1 = linear_loss.loss( + coef=np.r_[glm1.coef_, glm1.intercept_] if fit_intercept else glm1.coef_, + X=X, + y=y, + sample_weight=sw, + l2_reg_strength=1.0, + ) + objective_glm2 = linear_loss.loss( + coef=np.r_[glm2.coef_, glm2.intercept_] if fit_intercept else glm2.coef_, + X=X, + y=y, + sample_weight=sw, + l2_reg_strength=1.0, + ) + assert objective_glm1 < objective_glm2 + glm2.set_params(max_iter=1000) glm2.fit(X, y) - # The two model are not exactly identical since the lbfgs solver + # The two models are not exactly identical since the lbfgs solver # computes the approximate hessian from previous iterations, which # will not be strictly identical in the case of a warm start. - assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) - assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) + rtol = 2e-4 + assert_allclose(glm1.coef_, glm2.coef_, rtol=rtol) + assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-5) # FIXME: 'normalize' to be removed in 1.2 in LinearRegression From a27c7f9848667a06336f4b67fe5167cb4c6acf00 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Jun 2022 17:12:21 +0200 Subject: [PATCH 27/97] ENH improve lbfgs options for better convergence --- sklearn/linear_model/_glm/glm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index d337eaa7a4a18..cfda1058eb373 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -285,9 +285,10 @@ def fit(self, X, y, sample_weight=None): jac=True, options={ "maxiter": self.max_iter, + "maxls": 40, # default is 20 "iprint": (self.verbose > 0) - 1, "gtol": self.tol, - "ftol": 1e3 * np.finfo(float).eps, + "ftol": 64 * np.finfo(float).eps, # lbfgs is float64 land. 
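# Standalone sketch (toy quadratic, illustrative values) of tightening scipy's
# L-BFGS-B stopping criteria in the same spirit as the options used here: a larger
# maxls allows more line search steps per iteration, and a tiny ftol lets gtol
# drive convergence.
import numpy as np
import scipy.optimize

def f_and_grad(w):
    return 0.5 * np.sum((w - 1.0) ** 2), w - 1.0

opt = scipy.optimize.minimize(
    f_and_grad,
    np.zeros(5),
    method="L-BFGS-B",
    jac=True,
    options={
        "maxiter": 1000,
        "maxls": 40,                       # default is 20
        "gtol": 1e-12,
        "ftol": 64 * np.finfo(float).eps,  # nearly disable the ftol criterion
    },
)
print(opt.x)  # ~ all ones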
}, args=(X, y, sample_weight, l2_reg_strength, n_threads), ) From a5c1fc074ad71844e8c3ba25d72428d423810f85 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 14 Jun 2022 18:47:01 +0200 Subject: [PATCH 28/97] CLN fix test_warm_start --- sklearn/linear_model/_glm/tests/test_glm.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c26659b9f7284..86e2947611114 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -780,7 +780,11 @@ def test_warm_start(solver, fit_intercept, global_random_seed): random_state=global_random_seed, ) y = np.abs(y) # Poisson requires non-negative targets. - params = {"solver": solver, "fit_intercept": fit_intercept, "tol": 1e-10} + params = { + # "solver": solver, # only lbfgs available + "fit_intercept": fit_intercept, + "tol": 1e-10, + } glm1 = PoissonRegressor(warm_start=False, max_iter=1000, **params) glm1.fit(X, y) From 9b8519d92afb12bbd4566986e4a43102dfbd3b23 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 15 Jun 2022 21:40:29 +0200 Subject: [PATCH 29/97] TST fix assert singular values in datasets --- sklearn/linear_model/_glm/tests/test_glm.py | 4 ++-- sklearn/linear_model/tests/test_ridge.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 86e2947611114..d87ccc74bc2a1 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -46,7 +46,7 @@ def _special_minimize(fun, grad, x, tol_NM, tol): res_NM = minimize( fun, x, method="Nelder-Mead", options={"xatol": tol_NM, "fatol": tol_NM} ) - # Now refine via root finding, wich is more precise then minimizing a function. + # Now refine via root finding, wich is more precise than minimizing a function. res = root( grad, res_NM.x, @@ -128,7 +128,7 @@ def glm_dataset(global_random_seed, request): ) X[:, -1] = 1 # last columns acts as intercept U, s, Vt = linalg.svd(X) - assert np.all(s) > 1e-3 # to be sure + assert np.all(s > 1e-3) # to be sure assert np.max(s) / np.min(s) < 100 # condition number U1, _ = U[:, :k], U[:, k:] Vt1, _ = Vt[:k, :], Vt[k:, :] diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 1f05d821efed4..6a999e23b2db2 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -125,7 +125,7 @@ def ols_ridge_dataset(global_random_seed, request): ) X[:, -1] = 1 # last columns acts as intercept U, s, Vt = linalg.svd(X) - assert np.all(s) > 1e-3 # to be sure + assert np.all(s > 1e-3) # to be sure U1, U2 = U[:, :k], U[:, k:] Vt1, _ = Vt[:k, :], Vt[k:, :] From 68947b613f49ca452822eedf50903887620aee69 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 15 Jun 2022 22:25:15 +0200 Subject: [PATCH 30/97] CLN address most review comments --- sklearn/linear_model/_glm/tests/test_glm.py | 58 ++++++++++++--------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index d87ccc74bc2a1..65d8b6dd1cb6b 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -91,9 +91,15 @@ def glm_dataset(global_random_seed, request): If "wide", then n_features > n_samples. 
model : a GLM model - For "wide", we return the minimum norm solution w = X' (XX')^-1 y: + For "wide", we return the minimum norm solution: - min ||w||_2 subject to X w = y + min ||w||_2 subject to w = argmin deviance(X, y, w) + + Note that the deviance is always minimized if y = inverse_link(X w) is possible to + achieve, which it is in the wide data case. Therefore, we can construct the + solution with minimum norm like (wide) OLS: + + min ||w||_2 subject to link(y) = raw_prediction = X w Returns ------- @@ -107,7 +113,7 @@ def glm_dataset(global_random_seed, request): Last coefficient is intercept. coef_penalized : ndarray GLM solution with alpha=l2_reg_strength=1, i.e. - min 1/n * sum(loss) + ||w||_2^2. + min 1/n * sum(loss) + ||w[:-1]||_2^2. Last coefficient is intercept. """ data_type, model = request.param @@ -127,11 +133,9 @@ def glm_dataset(global_random_seed, request): random_state=rng, ) X[:, -1] = 1 # last columns acts as intercept - U, s, Vt = linalg.svd(X) + U, s, Vt = linalg.svd(X, full_matrices=False) assert np.all(s > 1e-3) # to be sure - assert np.max(s) / np.min(s) < 100 # condition number - U1, _ = U[:, :k], U[:, k:] - Vt1, _ = Vt[:k, :], Vt[k:, :] + assert np.max(s) / np.min(s) < 100 # condition number of X if data_type == "long": coef_unpenalized = rng.uniform(low=1, high=3, size=n_features) @@ -139,8 +143,9 @@ def glm_dataset(global_random_seed, request): raw_prediction = X @ coef_unpenalized else: raw_prediction = rng.uniform(low=-3, high=3, size=n_samples) - # w = X'(XX')^-1 y = V s^-1 U' y - coef_unpenalized = Vt1.T @ np.diag(1 / s) @ U1.T @ raw_prediction + # minimum norm solution min ||w||_2 such that raw_prediction = X w: + # w = X'(XX')^-1 raw_prediction = V s^-1 U' raw_prediction + coef_unpenalized = Vt.T @ np.diag(1 / s) @ U.T @ raw_prediction linear_loss = LinearModelLoss(base_loss=model._get_loss(), fit_intercept=True) sw = np.full(shape=n_samples, fill_value=1 / n_samples) @@ -250,7 +255,7 @@ def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): We work with a simple constructed data set with known solution. Fit on [X] with alpha is the same as fit on [X, X]/2 with alpha/2. - For long X, [X, X] is a singular matrix. + For long X, [X, X] is still a long but singular matrix. """ model, X, y, _, coef_with_intercept, coef_without_intercept = glm_dataset n_samples, n_features = X.shape @@ -263,10 +268,10 @@ def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): max_iter=1000, ) - if not fit_intercept: - # Line search cannot locate an adequate point after MAXLS - # function and gradient evaluations. - # Previous x, f and g restored. + if solver == "lbfgs" and not fit_intercept: + # Sometimes (depending on global_random_seed) lbfgs fails with: + # Line search cannot locate an adequate point after MAXLS function and gradient + # evaluations. Previous x, f and g restored. # Possible causes: 1 error in function or gradient evaluation; # 2 rounding error dominate computation. pytest.xfail() @@ -381,9 +386,9 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): if fit_intercept: # But it is not the minimum norm solution. Otherwise the norms would be # equal. 
- norm_solution = (1 + 1e-12) * np.linalg.norm(np.r_[intercept, coef]) + norm_solution = np.linalg.norm(np.r_[intercept, coef]) norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) - assert norm_model > norm_solution + assert norm_model > (1 + 1e-12) * norm_solution pytest.xfail( reason=f"GLM with {solver} does not provide the minimum norm solution." ) @@ -403,7 +408,7 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase GLM fit on [X] is the same as fit on [X, X]/2. For long X, [X, X] is a singular matrix and we check against the minimum norm solution: - min ||w||_2 subject to w = argmin deviance(w) + min ||w||_2 subject to w = argmin deviance(X, y, w) """ model, X, y, coef, _, _ = glm_dataset n_samples, n_features = X.shape @@ -455,11 +460,11 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase # FIXME: Same as in test_glm_regression_unpenalized. # But it is not the minimum norm solution. Otherwise the norms would be # equal. - norm_solution = (1 + 1e-12) * np.linalg.norm( + norm_solution = np.linalg.norm( 0.5 * np.r_[intercept, intercept, coef, coef] ) norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) - assert norm_model > norm_solution + assert norm_model > (1 + 1e-12) * norm_solution pytest.xfail( reason=f"GLM with {solver} does not provide the minimum norm solution." ) @@ -480,7 +485,7 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase [X], [y]. For wide X, [X', X'] is a singular matrix and we check against the minimum norm solution: - min ||w||_2 subject to w = argmin deviance(w) + min ||w||_2 subject to w = argmin deviance(X, y, w) """ model, X, y, coef, _, _ = glm_dataset n_samples, n_features = X.shape @@ -521,9 +526,9 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase # FIXME: Same as in test_glm_regression_unpenalized. # But it is not the minimum norm solution. Otherwise the norms would be # equal. - norm_solution = (1 + 1e-12) * np.linalg.norm(np.r_[intercept, coef]) + norm_solution = np.linalg.norm(np.r_[intercept, coef]) norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) - assert norm_model > norm_solution + assert norm_model > (1 + 1e-12) * norm_solution pytest.xfail( reason=f"GLM with {solver} does not provide the minimum norm solution." ) @@ -780,16 +785,17 @@ def test_warm_start(solver, fit_intercept, global_random_seed): random_state=global_random_seed, ) y = np.abs(y) # Poisson requires non-negative targets. + alpha = 1 params = { # "solver": solver, # only lbfgs available "fit_intercept": fit_intercept, "tol": 1e-10, } - glm1 = PoissonRegressor(warm_start=False, max_iter=1000, **params) + glm1 = PoissonRegressor(warm_start=False, max_iter=1000, alpha=alpha, **params) glm1.fit(X, y) - glm2 = PoissonRegressor(warm_start=True, max_iter=1, **params) + glm2 = PoissonRegressor(warm_start=True, max_iter=1, alpha=alpha, **params) # As we intentionally set max_iter=1 such that the solver should raise a # ConvergenceWarning. 
with pytest.warns(ConvergenceWarning): @@ -806,14 +812,14 @@ def test_warm_start(solver, fit_intercept, global_random_seed): X=X, y=y, sample_weight=sw, - l2_reg_strength=1.0, + l2_reg_strength=alpha, ) objective_glm2 = linear_loss.loss( coef=np.r_[glm2.coef_, glm2.intercept_] if fit_intercept else glm2.coef_, X=X, y=y, sample_weight=sw, - l2_reg_strength=1.0, + l2_reg_strength=alpha, ) assert objective_glm1 < objective_glm2 From 06a0b79a885f5de41969d886761d8454d09375d7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 15 Jun 2022 22:25:59 +0200 Subject: [PATCH 31/97] ENH enable more vebosity levels for lbfgs --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index cfda1058eb373..2082387181767 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -286,7 +286,7 @@ def fit(self, X, y, sample_weight=None): options={ "maxiter": self.max_iter, "maxls": 40, # default is 20 - "iprint": (self.verbose > 0) - 1, + "iprint": self.verbose - 1, "gtol": self.tol, "ftol": 64 * np.finfo(float).eps, # lbfgs is float64 land. }, From 4d245cf247e3e900f3c120b3dd316575caadbb65 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 15 Jun 2022 22:31:21 +0200 Subject: [PATCH 32/97] DOC add whatsnew --- doc/whats_new/v1.2.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index d8cadff6e83f8..00d10d384dcc7 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -30,6 +30,12 @@ random sampling procedures. :pr:`10805` by :user:`Mathias Andersen ` and :pr:`23471` by :user:`Meekail Zain ` +- |Enhancement| :class:`linear_model.GammaRegressor`, + :class:`linear_model.PoissonRegressor` and :class:`linear_model.TweedieRegressor` + can reach higher precision with the lbfgs solver, in particular when `tol` is set + to a tiny value. + :pr:`23619` by :user:`Christian Lorentzen `. + Changes impacting all modules ----------------------------- @@ -117,6 +123,12 @@ Changelog - |Fix| Use dtype-aware tolerances for the validation of gram matrices (passed by users or precomputed). :pr:`22059` by :user:`Malte S. Kurz `. +- |Enhancement| :class:`linear_model.GammaRegressor`, + :class:`linear_model.PoissonRegressor` and :class:`linear_model.TweedieRegressor` + can reach higher precision with the lbfgs solver, in particular when `tol` is set + to a tiny value. + :pr:`23619` by :user:`Christian Lorentzen `. + :mod:`sklearn.metrics` ...................... From 382d17707457638ee936bf44a9f25bb3e296a71c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 16 Jun 2022 19:23:44 +0200 Subject: [PATCH 33/97] CLN remove xfail and clean a bit --- sklearn/linear_model/_glm/tests/test_glm.py | 48 ++++++++++----------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 65d8b6dd1cb6b..cd2d6473fa535 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -380,21 +380,27 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... 
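Aside on the warm-start behaviour exercised above: with warm_start=True and a deliberately small max_iter, the estimator keeps its coefficients and a later fit continues from them. A minimal usage sketch on synthetic data (illustration only, independent of the test suite); a ConvergenceWarning is expected from the first, under-iterated fit.

import numpy as np
from sklearn.linear_model import PoissonRegressor

rng = np.random.RandomState(42)
X = rng.normal(size=(100, 5))
y = rng.poisson(lam=np.exp(X @ np.full(5, 0.1))).astype(float)

# One lbfgs iteration only: expect a ConvergenceWarning and a rough solution.
glm = PoissonRegressor(warm_start=True, max_iter=1, tol=1e-10, alpha=1.0)
glm.fit(X, y)
coef_rough = glm.coef_.copy()

# Warm start: the next fit continues from coef_rough instead of restarting,
# so allowing more iterations refines the previous solution.
glm.set_params(max_iter=1000)
glm.fit(X, y)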
assert_allclose(model.predict(X), y, rtol=1e-6) - assert_allclose( - model._get_loss().link.inverse(X @ coef + intercept), y, rtol=5e-7 - ) if fit_intercept: # But it is not the minimum norm solution. Otherwise the norms would be # equal. norm_solution = np.linalg.norm(np.r_[intercept, coef]) norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) assert norm_model > (1 + 1e-12) * norm_solution - pytest.xfail( - reason=f"GLM with {solver} does not provide the minimum norm solution." - ) - assert model.intercept_ == pytest.approx(intercept) - assert_allclose(model.coef_, coef, rtol=rtol) + # Note: Even adding a tiny penalty does not give the minimal norm solution. + # model_pen = clone(model).set_params(**params).set_params(alpha=1e-10) + # model_pen.fit(X, y) + # assert_allclose(model_pen.predict(X), y, rtol=1e-6) # This is true. + # norm_model_pen = np.linalg.norm( + # np.r_[model_pen.intercept_, model_pen.coef_] + # ) + # All the following assertions fail. + # assert norm_model_pen == pytest.approx(norm_solution) + # assert model_pen.intercept_ == pytest.approx(intercept) + # assert_allclose(model_pen.coef_, coef, rtol=rtol) + else: + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef, rtol=rtol) @pytest.mark.filterwarnings("ignore::scipy.linalg.LinAlgWarning") @@ -457,7 +463,7 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase # we get a solution, i.e. a (non-unique) minimum of the objective function ... assert_allclose(model.predict(X), y, rtol=1e-6) if fit_intercept: - # FIXME: Same as in test_glm_regression_unpenalized. + # Same as in test_glm_regression_unpenalized. # But it is not the minimum norm solution. Otherwise the norms would be # equal. norm_solution = np.linalg.norm( @@ -465,14 +471,11 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase ) norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) assert norm_model > (1 + 1e-12) * norm_solution - pytest.xfail( - reason=f"GLM with {solver} does not provide the minimum norm solution." - ) - - if fit_intercept: - assert model.intercept_ == pytest.approx(model.coef_[-1]) - assert model_intercept == pytest.approx(intercept) - assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) + # For minimum norm solution, we would have + # assert model.intercept_ == pytest.approx(model.coef_[-1]) + else: + assert model_intercept == pytest.approx(intercept) + assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) @pytest.mark.parametrize("solver", SOLVERS) @@ -523,18 +526,15 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase # we get a solution, i.e. a (non-unique) minimum of the objective function ... assert_allclose(model.predict(X), y, rtol=1e-6) if fit_intercept: - # FIXME: Same as in test_glm_regression_unpenalized. + # Same as in test_glm_regression_unpenalized. # But it is not the minimum norm solution. Otherwise the norms would be # equal. norm_solution = np.linalg.norm(np.r_[intercept, coef]) norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) assert norm_model > (1 + 1e-12) * norm_solution - pytest.xfail( - reason=f"GLM with {solver} does not provide the minimum norm solution." 
- ) - - assert model.intercept_ == pytest.approx(intercept) - assert_allclose(model.coef_, coef, rtol=rtol) + else: + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef, rtol=rtol) def test_sample_weights_validation(): From 25fe6e126a41220fb579b3a1fdc52d0f571c84e7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 16 Jun 2022 19:25:54 +0200 Subject: [PATCH 34/97] CLN docstring about minimum norm --- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index cd2d6473fa535..b7d1d265897e1 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -343,7 +343,7 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): We work with a simple constructed data set with known solution. Note: This checks the minimum norm solution for wide X, i.e. n_samples < n_features: - min ||w||_2 subject to w minimizing the mean deviviance. + min ||w||_2 subject to w = argmin deviance(X, y, w) """ model, X, y, coef, _, _ = glm_dataset n_samples, n_features = X.shape From 4c4582da57e0eb807f4f317c05940d06bee8b259 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 16 Jun 2022 19:56:17 +0200 Subject: [PATCH 35/97] More informative repr for the glm_dataset fixture cases --- sklearn/linear_model/_glm/tests/test_glm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index b7d1d265897e1..84ea43de538cf 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -75,7 +75,8 @@ def regression_data(): # TweedieRegressor(power=0, link="log"), # too difficult TweedieRegressor(power=1.5), ], - ) + ), + ids=lambda param: f"{param[0]}-{param[1]}" ) def glm_dataset(global_random_seed, request): """Dataset with GLM solutions, well conditioned X. From 2065a9e144fe8f19eaf8791434820396fb6e61be Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 16 Jun 2022 20:07:44 +0200 Subject: [PATCH 36/97] Forgot to run black --- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 84ea43de538cf..068ee09074acc 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -76,7 +76,7 @@ def regression_data(): TweedieRegressor(power=1.5), ], ), - ids=lambda param: f"{param[0]}-{param[1]}" + ids=lambda param: f"{param[0]}-{param[1]}", ) def glm_dataset(global_random_seed, request): """Dataset with GLM solutions, well conditioned X. 
From 5aaaf21722943188760ca0787e3f05a66dd4d330 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Jun 2022 09:21:38 +0200 Subject: [PATCH 37/97] CLN remove unnecessary filterwarnings --- sklearn/linear_model/_glm/tests/test_glm.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index b7d1d265897e1..f94e7fba132ba 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -8,7 +8,6 @@ import warnings import numpy as np -import scipy from numpy.testing import assert_allclose import pytest from scipy import linalg @@ -364,9 +363,6 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): else: intercept = 0 - if n_samples < n_features: - warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) - warnings.filterwarnings("ignore", category=ConvergenceWarning) model.fit(X, y) # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails @@ -403,8 +399,6 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): assert_allclose(model.coef_, coef, rtol=rtol) -@pytest.mark.filterwarnings("ignore::scipy.linalg.LinAlgWarning") -@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") @pytest.mark.parametrize("solver", SOLVERS) @pytest.mark.parametrize("fit_intercept", [True, False]) def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_dataset): @@ -442,6 +436,9 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase intercept = 0 X = 0.5 * np.concatenate((X, X), axis=1) assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) + + if fit_intercept and n_samples < n_features: + warnings.filterwarnings("ignore", category=ConvergenceWarning) model.fit(X, y) if fit_intercept and n_samples < n_features: @@ -512,9 +509,6 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) y = np.r_[y, y] - if n_samples < n_features: - warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) - warnings.filterwarnings("ignore", category=ConvergenceWarning) model.fit(X, y) rtol = 5e-5 From 10da88004c6d2111f24401e51aa82ad0f2456b43 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Jun 2022 09:24:22 +0200 Subject: [PATCH 38/97] CLN address review comments --- sklearn/linear_model/_glm/tests/test_glm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index f94e7fba132ba..f8c97a7e2e241 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -45,7 +45,8 @@ def _special_minimize(fun, grad, x, tol_NM, tol): res_NM = minimize( fun, x, method="Nelder-Mead", options={"xatol": tol_NM, "fatol": tol_NM} ) - # Now refine via root finding, wich is more precise than minimizing a function. + # Now refine via root finding on the gradient of the function, wich is more precise + # than minimizing the function itself. res = root( grad, res_NM.x, @@ -384,6 +385,7 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): assert norm_model > (1 + 1e-12) * norm_solution # Note: Even adding a tiny penalty does not give the minimal norm solution. 
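The _special_minimize comment updated above states that root finding on the gradient is more precise than minimizing the function itself. A toy sketch of that two-stage pattern on a smooth convex function (not the GLM objective), using only scipy.optimize:

import numpy as np
from scipy.optimize import minimize, root

x_star = np.array([1.2345678, -0.7654321])

def fun(x):
    return np.sum((x - x_star) ** 4) + np.sum((x - x_star) ** 2)

def grad(x):
    return 4 * (x - x_star) ** 3 + 2 * (x - x_star)

# Stage 1: derivative-free Nelder-Mead gets close, but only to ~xatol accuracy.
res_nm = minimize(fun, x0=np.zeros(2), method="Nelder-Mead",
                  options={"xatol": 1e-4, "fatol": 1e-4})

# Stage 2: refine by solving grad(x) = 0, which converges much more tightly.
res = root(grad, res_nm.x, method="lm", options={"xtol": 1e-15})
np.testing.assert_allclose(res.x, x_star, rtol=1e-8)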
+ # XXX: # model_pen = clone(model).set_params(**params).set_params(alpha=1e-10) # model_pen.fit(X, y) # assert_allclose(model_pen.predict(X), y, rtol=1e-6) # This is true. @@ -822,8 +824,7 @@ def test_warm_start(solver, fit_intercept, global_random_seed): # The two models are not exactly identical since the lbfgs solver # computes the approximate hessian from previous iterations, which # will not be strictly identical in the case of a warm start. - rtol = 2e-4 - assert_allclose(glm1.coef_, glm2.coef_, rtol=rtol) + assert_allclose(glm1.coef_, glm2.coef_, rtol=2e-4) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-5) From e16d04e1ee16e3a26585d4c2972ebe0332525742 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 17 Jun 2022 10:41:28 +0200 Subject: [PATCH 39/97] Trigger [all random seeds] on the following tests: test_glm_regression test_glm_regression_hstacked_X test_glm_regression_vstacked_X test_glm_regression_unpenalized test_glm_regression_unpenalized_hstacked_X test_glm_regression_unpenalized_vstacked_X test_warm_start From 2fa4397bd198c932a851d63b0feb1313c2264014 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Jun 2022 11:08:31 +0200 Subject: [PATCH 40/97] CLN add comment for lbfgs ftol=64 * machine precision --- sklearn/linear_model/_glm/glm.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 2082387181767..af558030b09cd 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -288,6 +288,9 @@ def fit(self, X, y, sample_weight=None): "maxls": 40, # default is 20 "iprint": self.verbose - 1, "gtol": self.tol, + # The constant 64 was found empirically to pass the test suite. The + # point is that ftol is very small, but a bit larger than machine + # precision. "ftol": 64 * np.finfo(float).eps, # lbfgs is float64 land. }, args=(X, y, sample_weight, l2_reg_strength, n_threads), From c0e242213c13e10c1c01114133597a819ae224b1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Jun 2022 11:14:31 +0200 Subject: [PATCH 41/97] CLN XXX code comment --- sklearn/linear_model/_glm/tests/test_glm.py | 26 +++++++++++++-------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 54de64b56ab8a..9cb2bd21ea352 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -386,18 +386,24 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): assert norm_model > (1 + 1e-12) * norm_solution # Note: Even adding a tiny penalty does not give the minimal norm solution. - # XXX: - # model_pen = clone(model).set_params(**params).set_params(alpha=1e-10) - # model_pen.fit(X, y) - # assert_allclose(model_pen.predict(X), y, rtol=1e-6) # This is true. - # norm_model_pen = np.linalg.norm( - # np.r_[model_pen.intercept_, model_pen.coef_] - # ) + # XXX: We could have naively expected LBFGS to find the minimal norm + # solution by adding a very small penalty. However, as the following code + # snippet shows, this does not work for a reason we do not properly + # understand at this point. + # model_pen = clone(model).set_params(**params).set_params(alpha=1e-10) + # model_pen.fit(X, y) + # assert_allclose(model_pen.predict(X), y, rtol=1e-6) # This is true. + # norm_model_pen = np.linalg.norm( + # np.r_[model_pen.intercept_, model_pen.coef_] + # ) # All the following assertions fail. 
- # assert norm_model_pen == pytest.approx(norm_solution) - # assert model_pen.intercept_ == pytest.approx(intercept) - # assert_allclose(model_pen.coef_, coef, rtol=rtol) + # assert norm_model_pen == pytest.approx(norm_solution) + # assert model_pen.intercept_ == pytest.approx(intercept) + # assert_allclose(model_pen.coef_, coef, rtol=rtol) else: + # When `fit_intercept=False`, LBFGS naturally converges to the minimum norm + # solution on this problem. + # XXX: Do we have any theoretical guarantees why this should be the case? assert model.intercept_ == pytest.approx(intercept) assert_allclose(model.coef_, coef, rtol=rtol) From 11493428c83a2b7d899c3066511ea9fd25e14eec Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Jun 2022 11:18:13 +0200 Subject: [PATCH 42/97] Trigger [all random seeds] on the following tests: test_glm_regression test_glm_regression_hstacked_X test_glm_regression_vstacked_X test_glm_regression_unpenalized test_glm_regression_unpenalized_hstacked_X test_glm_regression_unpenalized_vstacked_X test_warm_start From 3dad44571447e5a374c2515fa69dbb83adf98b77 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Jun 2022 11:46:35 +0200 Subject: [PATCH 43/97] CLN link issue and remove code snippet in comment --- sklearn/linear_model/_glm/tests/test_glm.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 9cb2bd21ea352..42026f5a62f2b 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -385,21 +385,11 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) assert norm_model > (1 + 1e-12) * norm_solution + # See https://github.com/scikit-learn/scikit-learn/issues/23670. # Note: Even adding a tiny penalty does not give the minimal norm solution. # XXX: We could have naively expected LBFGS to find the minimal norm - # solution by adding a very small penalty. However, as the following code - # snippet shows, this does not work for a reason we do not properly - # understand at this point. - # model_pen = clone(model).set_params(**params).set_params(alpha=1e-10) - # model_pen.fit(X, y) - # assert_allclose(model_pen.predict(X), y, rtol=1e-6) # This is true. - # norm_model_pen = np.linalg.norm( - # np.r_[model_pen.intercept_, model_pen.coef_] - # ) - # All the following assertions fail. - # assert norm_model_pen == pytest.approx(norm_solution) - # assert model_pen.intercept_ == pytest.approx(intercept) - # assert_allclose(model_pen.coef_, coef, rtol=rtol) + # solution by adding a very small penalty. Even that fails for a reason we + # do not properly else: # When `fit_intercept=False`, LBFGS naturally converges to the minimum norm # solution on this problem. 
From 12525f128a77b7aaed4823d286c9e76ec04cbfc2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Jun 2022 11:46:49 +0200 Subject: [PATCH 44/97] Trigger [all random seeds] on the following tests: test_glm_regression test_glm_regression_hstacked_X test_glm_regression_vstacked_X test_glm_regression_unpenalized test_glm_regression_unpenalized_hstacked_X test_glm_regression_unpenalized_vstacked_X test_warm_start From 556164ae6c9e9cd7dfcbbaf2840ca3d35cdb9f82 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Jun 2022 14:28:02 +0200 Subject: [PATCH 45/97] CLN add catch_warnings --- sklearn/linear_model/_glm/tests/test_glm.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 42026f5a62f2b..bdc51b7b8e2e0 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -436,9 +436,12 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase X = 0.5 * np.concatenate((X, X), axis=1) assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) - if fit_intercept and n_samples < n_features: - warnings.filterwarnings("ignore", category=ConvergenceWarning) - model.fit(X, y) + with warnings.catch_warnings(): + if fit_intercept and n_samples < n_features: + # XXX: Investigate if the lack of convergence in this case should be + # considered a bug or not. + warnings.filterwarnings("ignore", category=ConvergenceWarning) + model.fit(X, y) if fit_intercept and n_samples < n_features: # Here we take special care. From 4fcc1c8473f3bd933105e7a76ff5dbce68cbc62e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Jun 2022 17:23:18 +0200 Subject: [PATCH 46/97] Trigger [all random seeds] on the following tests: test_glm_regression test_glm_regression_hstacked_X test_glm_regression_vstacked_X test_glm_regression_unpenalized test_glm_regression_unpenalized_hstacked_X test_glm_regression_unpenalized_vstacked_X test_warm_start From c723f65330518688f545451e48ddd7d3c1f8365b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 17 Jun 2022 21:20:51 +0200 Subject: [PATCH 47/97] Trigger [all random seeds] on the following tests: test_glm_regression test_glm_regression_hstacked_X test_glm_regression_vstacked_X test_glm_regression_unpenalized test_glm_regression_unpenalized_hstacked_X test_glm_regression_unpenalized_vstacked_X test_warm_start From 3458c397085585a397117ea210aa3dddbaef6a13 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Jun 2022 11:41:56 +0200 Subject: [PATCH 48/97] [all random seeds] test_glm_regression test_glm_regression_hstacked_X test_glm_regression_vstacked_X test_glm_regression_unpenalized test_glm_regression_unpenalized_hstacked_X test_glm_regression_unpenalized_vstacked_X test_warm_start From 99f4cf99ca41b4ad2bdad537ad60f936970e3a88 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Jun 2022 14:20:35 +0200 Subject: [PATCH 49/97] Trigger with -Werror [all random seeds] test_glm_regression test_glm_regression_hstacked_X test_glm_regression_vstacked_X test_glm_regression_unpenalized test_glm_regression_unpenalized_hstacked_X test_glm_regression_unpenalized_vstacked_X test_warm_start --- build_tools/azure/test_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 03e12e8ab4702..b23eb646d4174 100755 --- a/build_tools/azure/test_script.sh +++ 
b/build_tools/azure/test_script.sh @@ -38,7 +38,7 @@ python -c "import sklearn; sklearn.show_versions()" show_installed_libraries -TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" +TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML -Werror" if [[ "$COVERAGE" == "true" ]]; then # Note: --cov-report= is used to disable to long text output report in the From 79ec862e0bc0d8c6c5bde8b0253c1ebde4355ae0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Jun 2022 15:05:05 +0200 Subject: [PATCH 50/97] ENH increase maxls to 50 --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index af558030b09cd..d23d229c96457 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -285,7 +285,7 @@ def fit(self, X, y, sample_weight=None): jac=True, options={ "maxiter": self.max_iter, - "maxls": 40, # default is 20 + "maxls": 50, # default is 20 "iprint": self.verbose - 1, "gtol": self.tol, # The constant 64 was found empirically to pass the test suite. The From 904e9601ddd785a3e120165fdd3c0156ca7e77fd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Jun 2022 15:05:33 +0200 Subject: [PATCH 51/97] [all random seeds] test_glm_regression test_glm_regression_hstacked_X test_glm_regression_vstacked_X test_glm_regression_unpenalized test_glm_regression_unpenalized_hstacked_X test_glm_regression_unpenalized_vstacked_X test_warm_start From 4fd1d9bfa3e4c9bcd3b97f0c9ea748f2bec04934 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Jun 2022 15:26:46 +0200 Subject: [PATCH 52/97] Revert "Trigger with -Werror [all random seeds]" This reverts commit 99f4cf99ca41b4ad2bdad537ad60f936970e3a88. 
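Patches 40 and 50 above tune scipy's L-BFGS-B options: ftol a bit above float64 machine precision, maxls=50 line-search steps, and iprint derived from verbose. A self-contained sketch (toy quadratic, not the GLM loss) of passing those options to scipy.optimize.minimize with jac=True:

import numpy as np
import scipy.optimize

def func(w):
    # Return loss and gradient together, matching jac=True below.
    loss = 0.5 * np.sum((w - 3.0) ** 2)
    grad = w - 3.0
    return loss, grad

opt_res = scipy.optimize.minimize(
    func,
    np.zeros(4),
    method="L-BFGS-B",
    jac=True,
    options={
        "maxiter": 100,
        "maxls": 50,                       # more line-search steps than the default 20
        "iprint": -1,                      # verbose - 1 with verbose=0: silent
        "gtol": 1e-8,                      # gradient-based stopping criterion
        "ftol": 64 * np.finfo(float).eps,  # relative objective decrease threshold
    },
)
assert opt_res.success
np.testing.assert_allclose(opt_res.x, np.full(4, 3.0), rtol=1e-6)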
--- build_tools/azure/test_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index b23eb646d4174..03e12e8ab4702 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -38,7 +38,7 @@ python -c "import sklearn; sklearn.show_versions()" show_installed_libraries -TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML -Werror" +TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" if [[ "$COVERAGE" == "true" ]]; then # Note: --cov-report= is used to disable to long text output report in the From 81efa1a80fb2028196a69c4bb8a960a353ee5a1d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Jun 2022 15:57:24 +0200 Subject: [PATCH 53/97] TST add catch_warnings to filterwarnings --- sklearn/linear_model/_glm/tests/test_glm.py | 33 +++++++++++---------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index a64e7f1c4bb9a..2f617c884cf6b 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -236,13 +236,14 @@ def test_glm_regression(solver, fit_intercept, glm_dataset): coef = coef_without_intercept intercept = 0 - if solver in ["newton-cholesky", "newton-qr-cholesky"]: - warnings.filterwarnings( - action="ignore", - message=".*pointwise hessian to have many non-positive values.*", - category=ConvergenceWarning, - ) - model.fit(X, y) + with warnings.catch_warnings(): + if solver in ["newton-cholesky", "newton-qr-cholesky"]: + warnings.filterwarnings( + action="ignore", + message=".*pointwise hessian to have many non-positive values.*", + category=ConvergenceWarning, + ) + model.fit(X, y) rtol = 5e-5 if solver == "lbfgs" else 1e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) @@ -372,10 +373,11 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): else: intercept = 0 - if n_samples < n_features: - warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) - warnings.filterwarnings("ignore", category=ConvergenceWarning) - model.fit(X, y) + with warnings.catch_warnings(): + if n_samples < n_features: + warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) + model.fit(X, y) # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails # for the wide/fat case with n_features > n_samples. 
Most current GLM solvers do @@ -521,10 +523,11 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) y = np.r_[y, y] - if n_samples < n_features: - warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) - warnings.filterwarnings("ignore", category=ConvergenceWarning) - model.fit(X, y) + with warnings.catch_warnings(): + if n_samples < n_features: + warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) + warnings.filterwarnings("ignore", category=ConvergenceWarning) + model.fit(X, y) rtol = 5e-5 if solver == "lbfgs" else 1e-6 if n_samples > n_features: From fa7469c04ab1915a10ae50283797f30c2bf01c2e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Jun 2022 16:27:32 +0200 Subject: [PATCH 54/97] TST adapt tests for newton solvers --- sklearn/linear_model/_glm/tests/test_glm.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 2f617c884cf6b..75202d7e97364 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -390,7 +390,7 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... assert_allclose(model.predict(X), y, rtol=1e-6) - if fit_intercept: + if fit_intercept or solver in ["newton-cholesky"]: # But it is not the minimum norm solution. Otherwise the norms would be # equal. norm_solution = np.linalg.norm(np.r_[intercept, coef]) @@ -449,9 +449,12 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) with warnings.catch_warnings(): - if fit_intercept and n_samples < n_features: + if ( + solver == "lbfgs" and fit_intercept and n_samples < n_features + ) or solver in ["newton-cholesky", "newton-qr-cholesky"]: # XXX: Investigate if the lack of convergence in this case should be # considered a bug or not. + warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) warnings.filterwarnings("ignore", category=ConvergenceWarning) model.fit(X, y) @@ -473,7 +476,7 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... assert_allclose(model.predict(X), y, rtol=1e-6) - if fit_intercept: + if fit_intercept or solver in ["newton-cholesky"]: # Same as in test_glm_regression_unpenalized. # But it is not the minimum norm solution. Otherwise the norms would be # equal. @@ -537,7 +540,7 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... assert_allclose(model.predict(X), y, rtol=1e-6) - if fit_intercept: + if fit_intercept or solver in ["newton-cholesky"]: # Same as in test_glm_regression_unpenalized. # But it is not the minimum norm solution. Otherwise the norms would be # equal. 
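The test changes above wrap warnings.filterwarnings in warnings.catch_warnings() so the ignore filter is dropped again when the block exits instead of leaking into later tests. A minimal sketch of the scoped pattern, with an under-iterated PoissonRegressor standing in for the solvers under test:

import warnings
import numpy as np
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import PoissonRegressor

rng = np.random.RandomState(0)
X = rng.normal(size=(50, 3))
y = rng.poisson(lam=np.exp(X @ np.ones(3))).astype(float)

with warnings.catch_warnings():
    # Ignore the expected non-convergence only inside this block; the warning
    # filters are restored as soon as the block exits.
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    PoissonRegressor(max_iter=1, tol=1e-12).fit(X, y)

# Outside the block the filter is gone again, so unrelated ConvergenceWarnings
# remain visible.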
From ccb986610afd72c447d3d5a5fa4c49d76845a54a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 19 Jun 2022 10:48:46 +0200 Subject: [PATCH 55/97] CLN cleaner gradient step with gradient_times_newton --- sklearn/linear_model/_glm/glm.py | 97 +++++++++++++++++---- sklearn/linear_model/_glm/tests/test_glm.py | 4 +- sklearn/linear_model/_linear_loss.py | 5 +- 3 files changed, 83 insertions(+), 23 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 51fcb4ea110dd..31f7d65606304 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -35,9 +35,9 @@ class NewtonSolver(ABC): """Newton solver for GLMs. - This class implements Newton/2nd-order optimization for GLMs. Each Newton iteration - aims at finding the Newton step which is done by the inner solver. With hessian H, - gradient g and coefficients coef, one step solves + This class implements Newton/2nd-order optimization routines for GLMs. Each Newton + iteration aims at finding the Newton step which is done by the inner solver. With + hessian H, gradient g and coefficients coef, one step solves: H @ coef_newton = -g @@ -94,6 +94,42 @@ class NewtonSolver(ABC): n_threads : int, default=1 Number of OpenMP threads to use for the computation of the hessian and gradient of the loss function. + + Attributes + ---------- + coef_old : ndarray of shape coef.shape + Coefficient of previous iteration. + + coef_newton : ndarray of shape coef.shape + Newton step. + + gradient : ndarray of shape coef.shape + Gradient of the loss wrt. the coefficients. + + gradient_old : ndarray of shape coef.shape + Gradient of previous iteration. + + loss_value : float + Value of objective function = loss + penalty. + + loss_value_old : float + Value of objective function of previous itertion. + + raw_prediction : ndarray of shape (n_samples,) or \ + (n_samples, n_classes) + + converged : bool + Indicator for convergence of the solver. + + iteration : int + Number of Newton steps, i.e. calls to inner_solve + + use_lbfgs_step : bool + An inner solver can set this to True to resort to LBFGS for one iteration. + + gradient_times_newton : float + gradient @ coef_newton, set in inner_solve and used by line_search. If the + Newton step is a descent direction, this is negative. """ def __init__( @@ -149,12 +185,16 @@ def update_gradient_hessian(self, X, y, sample_weight): def inner_solve(self, X, y, sample_weight): """Compute Newton step. - Sets self.coef_newton. + Sets: + - self.coef_newton + - gradient_times_newton """ def lbfgs_step(self, X, y, sample_weight): """Fallback for inner solver. + This is like inner_solve and line_search together. + Use 4 lbfgs steps. As in line_search sets: - self.coef_old - self.coef @@ -214,7 +254,9 @@ def line_search(self, X, y, sample_weight): eps = 16 * np.finfo(self.loss_value.dtype).eps t = 1 # step size - armijo_term = sigma * self.gradient @ self.coef_newton + # gradient_times_newton = self.gradient @ self.coef_newton + # was computed in inner_solve. + armijo_term = sigma * self.gradient_times_newton _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw( self.coef_newton, X ) @@ -341,7 +383,10 @@ def line_search(self, X, y, sample_weight): self.raw_prediction = raw def check_convergence(self, X, y, sample_weight): - """Check for convergence.""" + """Check for convergence. + + Sets self.converged. 
+ """ if self.verbose: print(" Check Convergence") # Note: Checking maximum relative change of coefficient <= tol is a bad @@ -390,6 +435,8 @@ def finalize(self, X, y, sample_weight): def solve(self, X, y, sample_weight): """Solve the optimization problem. + This is the main routine. + Order of calls: self.setup() while iteration: @@ -398,6 +445,11 @@ def solve(self, X, y, sample_weight): self.line_search() self.check_convergence() self.finalize() + + Returns + ------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Solution of the optimization problem. """ # setup usually: # - initializes self.coef if needed @@ -406,7 +458,6 @@ def solve(self, X, y, sample_weight): self.iteration = 1 self.converged = False - self.stop = False # Can be used by inner_solve to stop iteration. while self.iteration <= self.max_iter and not self.converged: if self.verbose: @@ -425,10 +476,7 @@ def solve(self, X, y, sample_weight): # 2. Inner solver # Calculate Newton step/direction # This usually sets self.coef_newton. - # It may set self.stop = True, e.g. for ill-conditioned systems. self.inner_solve(X=X, y=y, sample_weight=sample_weight) - if self.stop: - break if self.use_lbfgs_step: self.lbfgs_step(X=X, y=y, sample_weight=sample_weight) @@ -492,13 +540,20 @@ def inner_solve(self, X, y, sample_weight): self.use_lbfgs_step = True return + gradient_step = False try: with warnings.catch_warnings(): warnings.simplefilter("error", scipy.linalg.LinAlgWarning) self.coef_newton = scipy.linalg.solve( self.hessian, -self.gradient, check_finite=False, assume_a="sym" ) - return + self.gradient_times_newton = self.gradient @ self.coef_newton + gradient_step = self.gradient_times_newton > 0 + if gradient_step and self.verbose: + print( + " The inner solver found a Newton step that is not a descent " + "direction and resorts to a simple gradient step." + ) except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: if self.count_singular == 0: # We only need to throw this warning once. @@ -519,13 +574,13 @@ def inner_solve(self, X, y, sample_weight): self.count_singular += 1 # Possible causes: # 1. hess_pointwise is negative. But this is already taken care in - # LinearModelLoss such that min(hess_pointwise) >= 0. + # LinearModelLoss.gradient_hessian. # 2. X is singular or ill-conditioned # This might be the most probable cause. # - # There are many possible ways to deal with this situation (most of - # them adding, explicit or implicit, a matrix to the hessian to make it - # positive definite), confer to Chapter 3.4 of Nocedal & Wright 2nd ed. + # There are many possible ways to deal with this situation. Most of them + # add, explicit or implicit, a matrix to the hessian to make it positive + # definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed. # Instead, we resort to a simple gradient step, taking the diagonal part # of the hessian. if self.verbose: @@ -533,11 +588,15 @@ def inner_solve(self, X, y, sample_weight): " The inner solver stumbled upon an singular or ill-conditioned " "hessian matrix and resorts to a simple gradient step." ) - # We add 1e-3 to the diagonal hessian part to make in invertible and to - # restrict coef_newton to at most ~1e3. The line search considerst step - # sizes until 1e-6 * newton_step ~1e-3 * newton_step. + gradient_step = True + + if gradient_step: + # We add 1e-3 to the diagonal hessian part to make it invertible and to + # restrict coef_newton to at most ~1e3. 
The line search considers step + # sizes until 2**-20 ~ 1e-6 * newton_step >~ 1e-3 * gradient. eps = 1e-3 - self.coef_newton = -self.gradient / (np.diag(self.hessian) + eps) + self.coef_newton = -self.gradient / (np.abs(np.diag(self.hessian)) + eps) + self.gradient_times_newton = self.gradient @ self.coef_newton class CholeskyNewtonSolver(BaseCholeskyNewtonSolver): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 75202d7e97364..2c477b3f9ce0f 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -298,7 +298,7 @@ def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): intercept = 0 model.fit(X, y) - rtol = 1e-4 if solver == "lbfgs" else 1e-10 + rtol = 1e-4 if solver == "lbfgs" else 5e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol) @@ -339,7 +339,7 @@ def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset): intercept = 0 model.fit(X, y) - rtol = 3e-5 if solver == "lbfgs" else 1e-10 + rtol = 3e-5 if solver == "lbfgs" else 5e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index dd433d91f610c..710bc6e4f71dc 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -431,8 +431,9 @@ def gradient_hessian( ) # For non-canonical link functions and far away from the optimum, the pointwise - # hessian can be negative. We take care that the hessian is positive. - hessian_warning = np.sum(hess_pointwise <= 0) > len(hess_pointwise) / 2 + # hessian can be negative. We take care that 75% ot the hessian entries are + # positive. + hessian_warning = np.sum(hess_pointwise <= 0) > len(hess_pointwise) * 0.25 hess_pointwise = np.abs(hess_pointwise) if not self.base_loss.is_multiclass: From 28f2051905d0f51e3bfbaac376fd6402b57d4dd3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 19 Jun 2022 10:54:30 +0200 Subject: [PATCH 56/97] DOC add whatsnew --- doc/whats_new/v1.2.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 85d90f7b62216..172ac7628d93b 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -145,6 +145,13 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Enhancement| :class:`linear_model.GammaRegressor`, + :class:`linear_model.PoissonRegressor` and :class:`linear_model.TweedieRegressor` got + a `solver` parameter with the two new solvers `solver="newton-cholesky"` and + `solver="newton-qr-cholesky"`. Those are 2nd order (Newton) optimisation routines + that may reach higher precision in less time than the already available `"lbfgs"`. + :pr:`23314` by :user:`Christian Lorentzen `. + - |Fix| Use dtype-aware tolerances for the validation of gram matrices (passed by users or precomputed). :pr:`22059` by :user:`Malte S. Kurz `. 
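The solver changes above compute gradient_times_newton once in inner_solve and reuse it as the Armijo term of the backtracking line search. A generic sketch of that Newton-plus-Armijo pattern on a small strictly convex quadratic; this illustrates the algorithm, not scikit-learn's actual implementation:

import numpy as np

A = np.array([[3.0, 1.0], [1.0, 2.0]])   # positive definite "hessian"
b = np.array([1.0, -1.0])

def loss(w):
    return 0.5 * w @ A @ w - b @ w

def gradient(w):
    return A @ w - b

w = np.zeros(2)
sigma, beta = 1e-2, 0.5                   # Armijo constant, backtracking factor
for _ in range(100):
    g = gradient(w)
    if np.max(np.abs(g)) <= 1e-10:        # convergence check on the gradient
        break
    coef_newton = np.linalg.solve(A, -g)  # Newton step: H @ coef_newton = -g
    gradient_times_newton = g @ coef_newton   # negative for a descent direction
    t = 1.0
    # Backtracking line search: shrink t until sufficient decrease holds.
    while loss(w + t * coef_newton) > loss(w) + t * sigma * gradient_times_newton:
        t *= beta
    w = w + t * coef_newton

np.testing.assert_allclose(w, np.linalg.solve(A, b))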
From 2d9f20566ed21180222fb738b5f6d18f9f893714 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 19 Jun 2022 16:05:46 +0200 Subject: [PATCH 57/97] ENH always use lbfgs as fallback --- sklearn/linear_model/_glm/glm.py | 38 ++++++-------- sklearn/linear_model/_glm/tests/test_glm.py | 58 +++++++++++++++------ 2 files changed, 60 insertions(+), 36 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 31f7d65606304..9973585799f01 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -194,8 +194,11 @@ def lbfgs_step(self, X, y, sample_weight): """Fallback for inner solver. This is like inner_solve and line_search together. + It uses 4 lbfgs steps such that it takes advantage of updates of the + quasi-hessian, but not more steps in the hope that the normal inner solver can + take over again. - Use 4 lbfgs steps. As in line_search sets: + As in line_search sets: - self.coef_old - self.coef - self.loss_value_old @@ -535,12 +538,11 @@ def inner_solve(self, X, y, sample_weight): if self.verbose: print( " The inner solver detected a pointwise hessian with many " - "negative values and resorts to a lbfgs step." + "negative values and resorts to a few lbfgs steps." ) self.use_lbfgs_step = True return - gradient_step = False try: with warnings.catch_warnings(): warnings.simplefilter("error", scipy.linalg.LinAlgWarning) @@ -548,12 +550,14 @@ def inner_solve(self, X, y, sample_weight): self.hessian, -self.gradient, check_finite=False, assume_a="sym" ) self.gradient_times_newton = self.gradient @ self.coef_newton - gradient_step = self.gradient_times_newton > 0 - if gradient_step and self.verbose: - print( - " The inner solver found a Newton step that is not a descent " - "direction and resorts to a simple gradient step." - ) + if self.gradient_times_newton > 0: + if self.verbose: + print( + " The inner solver found a Newton step that is not a " + "descent direction and resorts to a few lbfgs steps." + ) + self.use_lbfgs_step = True + return except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: if self.count_singular == 0: # We only need to throw this warning once. @@ -581,22 +585,14 @@ def inner_solve(self, X, y, sample_weight): # There are many possible ways to deal with this situation. Most of them # add, explicit or implicit, a matrix to the hessian to make it positive # definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed. - # Instead, we resort to a simple gradient step, taking the diagonal part - # of the hessian. + # Instead, we resort to a few lbfgs steps. if self.verbose: print( " The inner solver stumbled upon an singular or ill-conditioned " - "hessian matrix and resorts to a simple gradient step." + "hessian matrix and resorts to a few lbfgs steps." ) - gradient_step = True - - if gradient_step: - # We add 1e-3 to the diagonal hessian part to make it invertible and to - # restrict coef_newton to at most ~1e3. The line search considers step - # sizes until 2**-20 ~ 1e-6 * newton_step >~ 1e-3 * gradient. 
- eps = 1e-3 - self.coef_newton = -self.gradient / (np.abs(np.diag(self.hessian)) + eps) - self.gradient_times_newton = self.gradient @ self.coef_newton + self.use_lbfgs_step = True + return class CholeskyNewtonSolver(BaseCholeskyNewtonSolver): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 2c477b3f9ce0f..6fa2da58ef77b 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -382,19 +382,33 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails # for the wide/fat case with n_features > n_samples. Most current GLM solvers do # NOT return the minimum norm solution with fit_intercept=True. - rtol = 5e-5 if solver == "lbfgs" else 1e-7 if n_samples > n_features: + rtol = 5e-5 if solver == "lbfgs" else 1e-7 assert model.intercept_ == pytest.approx(intercept) assert_allclose(model.coef_, coef, rtol=rtol) else: # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... - assert_allclose(model.predict(X), y, rtol=1e-6) - if fit_intercept or solver in ["newton-cholesky"]: + rtol = 1e-6 + if solver == "newton-cholesky": + rtol = 5e-4 + elif solver == "newton-qr-cholesky": + rtol = 5e-5 + if isinstance(model, TweedieRegressor) and model.power == 1.5: + pytest.xfail("newton-qr-cholesky fails on TweedieRegressor(power=1.5)") + assert_allclose(model.predict(X), y, rtol=rtol) + + norm_solution = np.linalg.norm(np.r_[intercept, coef]) + norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) + if solver == "newton-cholesky": + # XXX: This solver shows random behaviour. Sometimes it finds solutions + # with norm_model <= norm_solution! So we check conditionally. + if not (norm_model > (1 + 1e-12) * norm_solution): + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef, rtol=5e-5) + elif solver == "lbfgs" and fit_intercept: # But it is not the minimum norm solution. Otherwise the norms would be # equal. - norm_solution = np.linalg.norm(np.r_[intercept, coef]) - norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) assert norm_model > (1 + 1e-12) * norm_solution # See https://github.com/scikit-learn/scikit-learn/issues/23670. @@ -406,8 +420,8 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): # When `fit_intercept=False`, LBFGS naturally converges to the minimum norm # solution on this problem. # XXX: Do we have any theoretical guarantees why this should be the case? 
- assert model.intercept_ == pytest.approx(intercept) - assert_allclose(model.coef_, coef, rtol=rtol) + assert model.intercept_ == pytest.approx(intercept, rel=5e-6) + assert_allclose(model.coef_, coef, rtol=1e-5) @pytest.mark.parametrize("solver", SOLVERS) @@ -468,15 +482,20 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase model_intercept = model.intercept_ model_coef = model.coef_ - rtol = 6e-5 if solver == "lbfgs" else 1e-6 if n_samples > n_features: assert model_intercept == pytest.approx(intercept) - assert_allclose(model_coef, np.r_[coef, coef], rtol=1e-4) + rtol = 1e-4 + if solver == "newton-qr-cholesky": + rtol = 5e-4 + if isinstance(model, TweedieRegressor) and model.power == 1.5: + pytest.xfail("newton-qr-cholesky fails on TweedieRegressor(power=1.5)") + assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) else: # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... - assert_allclose(model.predict(X), y, rtol=1e-6) - if fit_intercept or solver in ["newton-cholesky"]: + rtol = 1e-6 if solver == "lbfgs" else 5e-6 + assert_allclose(model.predict(X), y, rtol=rtol) + if (solver == "lbfgs" and fit_intercept) or solver == "newton-cholesky": # Same as in test_glm_regression_unpenalized. # But it is not the minimum norm solution. Otherwise the norms would be # equal. @@ -488,6 +507,7 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase # For minimum norm solution, we would have # assert model.intercept_ == pytest.approx(model.coef_[-1]) else: + rtol = 6e-5 if solver == "lbfgs" else 1e-6 assert model_intercept == pytest.approx(intercept) assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) @@ -539,13 +559,21 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase else: # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... - assert_allclose(model.predict(X), y, rtol=1e-6) - if fit_intercept or solver in ["newton-cholesky"]: + rtol = 1e-6 if solver == "lbfgs" else 5e-6 + assert_allclose(model.predict(X), y, rtol=rtol) + + norm_solution = np.linalg.norm(np.r_[intercept, coef]) + norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) + if solver == "newton-cholesky": + # XXX: This solver shows random behaviour. Sometimes it finds solutions + # with norm_model <= norm_solution! So we check conditionally. + if not (norm_model > (1 + 1e-12) * norm_solution): + assert model.intercept_ == pytest.approx(intercept) + assert_allclose(model.coef_, coef, rtol=1e-4) + elif solver == "lbfgs" and fit_intercept: # Same as in test_glm_regression_unpenalized. # But it is not the minimum norm solution. Otherwise the norms would be # equal. 
- norm_solution = np.linalg.norm(np.r_[intercept, coef]) - norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) assert norm_model > (1 + 1e-12) * norm_solution else: assert model.intercept_ == pytest.approx(intercept) From e70a4dfce322bdf10c65ff8860f714d063284998 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 19 Jun 2022 20:38:36 +0200 Subject: [PATCH 58/97] TST adapt rtol --- sklearn/linear_model/_glm/tests/test_glm.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 6fa2da58ef77b..fad33c1b68ca8 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -415,13 +415,13 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): # Note: Even adding a tiny penalty does not give the minimal norm solution. # XXX: We could have naively expected LBFGS to find the minimal norm # solution by adding a very small penalty. Even that fails for a reason we - # do not properly + # do not properly understand. else: # When `fit_intercept=False`, LBFGS naturally converges to the minimum norm # solution on this problem. # XXX: Do we have any theoretical guarantees why this should be the case? assert model.intercept_ == pytest.approx(intercept, rel=5e-6) - assert_allclose(model.coef_, coef, rtol=1e-5) + assert_allclose(model.coef_, coef, rtol=5e-5) @pytest.mark.parametrize("solver", SOLVERS) @@ -507,9 +507,9 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase # For minimum norm solution, we would have # assert model.intercept_ == pytest.approx(model.coef_[-1]) else: - rtol = 6e-5 if solver == "lbfgs" else 1e-6 - assert model_intercept == pytest.approx(intercept) - assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) + rtol = 5e-5 if solver == "newton-qr-cholesky" else 5e-6 + assert model_intercept == pytest.approx(intercept, rel=rtol) + assert_allclose(model_coef, np.r_[coef, coef], rtol=1e-4) @pytest.mark.parametrize("solver", SOLVERS) @@ -552,8 +552,8 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase warnings.filterwarnings("ignore", category=ConvergenceWarning) model.fit(X, y) - rtol = 5e-5 if solver == "lbfgs" else 1e-6 if n_samples > n_features: + rtol = 5e-5 if solver == "lbfgs" else 1e-6 assert model.intercept_ == pytest.approx(intercept) assert_allclose(model.coef_, coef, rtol=rtol) else: @@ -576,7 +576,8 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase # equal. 
assert norm_model > (1 + 1e-12) * norm_solution else: - assert model.intercept_ == pytest.approx(intercept) + rtol = 1e-5 if solver == "newton-cholesky" else 1e-4 + assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) From 85a1c523a75c70428e7d3dd403f5666b816234f9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 20 Jun 2022 14:39:13 +0200 Subject: [PATCH 59/97] TST fix test_linalg_warning_with_newton_solver --- sklearn/linear_model/_glm/tests/test_glm.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index fad33c1b68ca8..7b23223ca5a2c 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -1077,10 +1077,6 @@ def test_linalg_warning_with_newton_solver(global_random_seed): with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_collinear, y) - msg = "Newton solver did not converge after.*iterations." - with pytest.warns(ConvergenceWarning, match=msg): - PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_collinear, y) - # Increasing the regularization slightly should make the problem go away: reg = PoissonRegressor(solver="newton-cholesky", alpha=1e-12).fit(X_collinear, y) From 0a557caa93619aa838dad973675c8e7b797c397a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 20 Jun 2022 16:43:07 +0200 Subject: [PATCH 60/97] CLN address some review comments --- sklearn/linear_model/_glm/glm.py | 67 +++++++++++++++++--------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 9973585799f01..084cfd9247a9c 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -46,7 +46,7 @@ class NewtonSolver(ABC): g = X.T @ loss.gradient + l2_reg_strength * coef H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity - Backtracking line seach updates coef = coef_old + t * coef_newton for some t in + Backtracking line search updates coef = coef_old + t * coef_newton for some t in (0, 1]. This is a base class, actual implementations (child classes) may deviate from the @@ -80,7 +80,7 @@ class NewtonSolver(ABC): The loss to be minimized. l2_reg_strength : float, default=0.0 - L2 regularization strength + L2 regularization strength. tol : float, default=1e-4 The optimization problem is solved when each of the following condition is @@ -639,12 +639,12 @@ class QRCholeskyNewtonSolver(BaseCholeskyNewtonSolver): This is the same as an LQ decomposition of X. We introduce the new variable t as, see [1]: - (coef, intercept) = (Q @ t, intercept) + (coef, intercept) = (Q @ z, intercept) - By using X @ coef = R' @ t and ||coef||_2 = ||t||_2, we can just replace X - by R', solve for t instead of coef, and finally get coef = Q @ t. - Note that t has less elements than coef if n_features > n_samples: - len(t) = k = min(n_samples, n_features) <= n_features = len(coef). + By using X @ coef = R' @ z and ||coef||_2 = ||z||_2, we can just replace X + by R', solve for z instead of coef, and finally get coef = Q @ z. + Note that z has less elements than coef if n_features > n_samples: + len(z) = k = min(n_samples, n_features) <= n_features = len(coef). [1] Hastie, T.J., & Tibshirani, R. (2003). Expression Arrays and the p n Problem. 
https://web.stanford.edu/~hastie/Papers/pgtn.pdf @@ -654,7 +654,9 @@ def setup(self, X, y, sample_weight): n_samples, n_features = X.shape # TODO: setting pivoting=True could improve stability # QR of X' - self.Q, self.R = scipy.linalg.qr(X.T, mode="economic", pivoting=False) + self.Q, self.R = scipy.linalg.qr( + X.T, mode="economic", pivoting=False, check_finite=False + ) # use k = min(n_features, n_samples) instead of n_features k = self.R.T.shape[1] n_dof = k @@ -662,7 +664,7 @@ def setup(self, X, y, sample_weight): n_dof += 1 # store original coef self.coef_original = self.coef - # set self.coef = t (coef_original = Q @ t) + # set self.coef = z (coef_original = Q @ z) self.coef = np.zeros_like(self.coef, shape=n_dof) if np.sum(np.abs(self.coef_original)) > 0: self.coef[:k] = self.Q.T @ self.coef_original[:n_features] @@ -750,12 +752,12 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Calls scipy's L-BFGS-B optimizer. 'newton-cholesky' - Uses Newton-Raphson steps (equals iterated reweighted least squares) with - an inner cholesky based solver. + Uses Newton-Raphson steps (equivalent to iterated reweighted least squares) + with an inner cholesky based solver. 'newton-qr-cholesky' - Same as 'newton-cholesky' but uses a qr decomposition of X.T. This solver - is better for n_features >> n_samples than 'newton-cholesky'. + Same as 'newton-cholesky' but uses a QR decomposition of X.T. This solver + is better for `n_features >> n_samples` than 'newton-cholesky'. max_iter : int, default=100 The maximal number of iterations for the solver. @@ -984,10 +986,10 @@ def fit(self, X, y, sample_weight=None): "maxls": 50, # default is 20 "iprint": self.verbose - 1, "gtol": self.tol, - # The constant 64 was found empirically to pass the test suite. The - # point is that ftol is very small, but a bit larger than machine - # precision. - "ftol": 64 * np.finfo(float).eps, # lbfgs is float64 land. + # The constant 64 was found empirically to pass the test suite. + # The point is that ftol is very small, but a bit larger than + # machine precision for float64, which is the dtype used by lbfgs. + "ftol": 64 * np.finfo(float).eps, }, args=(X, y, sample_weight, l2_reg_strength, n_threads), ) @@ -1019,6 +1021,7 @@ def fit(self, X, y, sample_weight=None): n_threads=n_threads, ) coef = sol.solve(X, y, sample_weight) + self.n_iter_ = sol.iteration if self.fit_intercept: self.intercept_ = coef[-1] @@ -1208,19 +1211,19 @@ class PoissonRegressor(_GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-qr-cholesky'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' Calls scipy's L-BFGS-B optimizer. 'newton-cholesky' - Uses Newton-Raphson steps (equals iterated reweighted least squares) with - an inner cholesky based solver. + Uses Newton-Raphson steps (equivalent to iterated reweighted least squares) + with an inner cholesky based solver. 'newton-qr-cholesky' - Same as 'newton-cholesky' but uses a qr decomposition of X.T. This solver - is better for n_features >> n_samples than 'newton-cholesky'. + Same as 'newton-cholesky' but uses a QR decomposition of X.T. This solver + is better for `n_features >> n_samples` than 'newton-cholesky'. max_iter : int, default=100 The maximal number of iterations for the solver. 
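A quick numerical check of the variable substitution used by QRCholeskyNewtonSolver above (an illustrative sketch only, with arbitrary shapes and random data; it is not part of the patch): with the economic QR of X', one has X @ (Q @ z) = R' @ z and ||Q @ z||_2 = ||z||_2, so the fit can equivalently be carried out in the smaller z-space.

    import numpy as np
    import scipy.linalg

    rng = np.random.default_rng(0)
    n_samples, n_features = 5, 8  # wide problem, n_features > n_samples
    X = rng.normal(size=(n_samples, n_features))

    # economic QR of X' as in QRCholeskyNewtonSolver.setup: X.T = Q @ R
    Q, R = scipy.linalg.qr(X.T, mode="economic")
    k = min(n_samples, n_features)  # number of free parameters in z

    z = rng.normal(size=k)
    coef = Q @ z  # coef lives in the row space of X

    # X @ coef equals R' @ z and the L2 norms agree, so fitting with design R'
    # in z-space is equivalent to fitting with X in coef-space.
    np.testing.assert_allclose(X @ coef, R.T @ z)
    np.testing.assert_allclose(np.linalg.norm(coef), np.linalg.norm(z))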
@@ -1333,19 +1336,19 @@ class GammaRegressor(_GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-qr-cholesky'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' Calls scipy's L-BFGS-B optimizer. 'newton-cholesky' - Uses Newton-Raphson steps (equals iterated reweighted least squares) with - an inner cholesky based solver. + Uses Newton-Raphson steps (equivalent to iterated reweighted least squares) + with an inner cholesky based solver. 'newton-qr-cholesky' - Same as 'newton-cholesky' but uses a qr decomposition of X.T. This solver - is better for n_features >> n_samples than 'newton-cholesky'. + Same as 'newton-cholesky' but uses a QR decomposition of X.T. This solver + is better for `n_features >> n_samples` than 'newton-cholesky'. max_iter : int, default=100 The maximal number of iterations for the solver. @@ -1489,19 +1492,19 @@ class TweedieRegressor(_GeneralizedLinearRegressor): - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian distributions - solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-qr-cholesky'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' Calls scipy's L-BFGS-B optimizer. 'newton-cholesky' - Uses Newton-Raphson steps (equals iterated reweighted least squares) with - an inner cholesky based solver. + Uses Newton-Raphson steps (equivalent to iterated reweighted least squares) + with an inner cholesky based solver. 'newton-qr-cholesky' - Same as 'newton-cholesky' but uses a qr decomposition of X.T. This solver - is better for n_features >> n_samples than 'newton-cholesky'. + Same as 'newton-cholesky' but uses a QR decomposition of X.T. This solver + is better for `n_features >> n_samples` than 'newton-cholesky'. max_iter : int, default=100 The maximal number of iterations for the solver. From be2fe6dc400e402ba64439a81740eeef4a3586fc Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 29 Jun 2022 18:42:59 +0200 Subject: [PATCH 61/97] Improve tests related to convergence warning on collinear data --- sklearn/linear_model/_glm/tests/test_glm.py | 59 +++++++++++++++++---- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 7b23223ca5a2c..fc495d30f487d 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -1059,28 +1059,65 @@ def test_linalg_warning_with_newton_solver(global_random_seed): rng = np.random.RandomState(global_random_seed) X_orig = rng.normal(size=(10, 3)) X_collinear = np.hstack([X_orig] * 10) # collinear design - y = rng.normal(size=X_orig.shape[0]) - y[y < 0] = 0.0 + y = rng.poisson( + np.exp(X_orig @ np.ones(X_orig.shape[1])), size=X_orig.shape[0] + ).astype(np.float64) + + # Let's consider the deviance of constant baseline on this problem: + baseline_pred = np.full_like(y, y.astype(np.float64).mean()) + constant_model_deviance = mean_poisson_deviance(y, baseline_pred) # No warning raised on well-conditioned design, even without regularization. 
+ tol = 1e-10 + with warnings.catch_warnings(): + warnings.simplefilter("error") + reg = PoissonRegressor(solver="newton-cholesky", alpha=0.0, tol=tol).fit( + X_orig, y + ) + original_newton_deviance = mean_poisson_deviance(y, reg.predict(X_orig)) + + # We check that the model could successfully overfit information in X_orig + # to improve upon the constant baseline (when evaluated on the traing set). + assert original_newton_deviance < constant_model_deviance - 1e-3 + + # LBFGS is robust to collinear design because its approximation of the + # Hessian is Symmeric Positive Definite by construction. Let's record its + # solution with warnings.catch_warnings(): warnings.simplefilter("error") - reg = PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_orig, y) - reference_deviance = mean_poisson_deviance(y, reg.predict(X_orig)) + reg = PoissonRegressor(solver="lbfgs", alpha=0.0, tol=tol).fit(X_collinear, y) + collinear_lbfgs_deviance = mean_poisson_deviance(y, reg.predict(X_collinear)) + print() + print(f"{original_newton_deviance - collinear_lbfgs_deviance=}") + + # The LBFGS solution on the collinear is expected to reach a comparable + # solution. + rtol, atol = 1e-4, 1e-8 + assert collinear_lbfgs_deviance == pytest.approx( + original_newton_deviance, rel=rtol, abs=atol + ) # Fitting on collinear data without regularization should raise an - # informative warning: + # informative warning and fallback to the LBFGS solver msg = ( "The inner solver of CholeskyNewtonSolver stumbled upon a" " singular or very ill-conditioned hessian matrix" ) with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): - PoissonRegressor(solver="newton-cholesky", alpha=0.0).fit(X_collinear, y) + reg = PoissonRegressor(solver="newton-cholesky", alpha=0.0, tol=tol).fit( + X_collinear, y + ) + collinear_newton_deviance = mean_poisson_deviance(y, reg.predict(X_collinear)) + + assert collinear_newton_deviance == pytest.approx( + original_newton_deviance, rel=rtol, abs=atol + ) # Increasing the regularization slightly should make the problem go away: - reg = PoissonRegressor(solver="newton-cholesky", alpha=1e-12).fit(X_collinear, y) + with warnings.catch_warnings(): + warnings.simplefilter("error", scipy.linalg.LinAlgWarning) + PoissonRegressor(solver="newton-cholesky", alpha=1e-10).fit(X_collinear, y) - # Since we use a small penalty, the deviance of the predictions should still - # be almost the same. - this_deviance = mean_poisson_deviance(y, reg.predict(X_collinear)) - assert this_deviance == pytest.approx(reference_deviance) + # While for most random seed the deviance of this model is very close to + # that of the unpenalized model, it is unfortunately not always the case so + # we do not check such an assertion here. 
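For intuition on why the collinear design in the test above triggers the LinAlgWarning and the fallback to LBFGS: stacking copies of the same columns leaves the Gram matrix, and hence the unpenalized Hessian X' diag(W) X, exactly rank deficient. A standalone sketch with the same construction as the test (not part of the patch itself):

    import numpy as np

    rng = np.random.RandomState(0)
    X_orig = rng.normal(size=(10, 3))
    X_collinear = np.hstack([X_orig] * 10)  # same construction as in the test

    # The rank stays at 3 although the Gram matrix of the collinear design is
    # 30x30, so its Cholesky factorization must fail or be extremely
    # ill-conditioned.
    print(np.linalg.matrix_rank(X_orig.T @ X_orig))            # 3
    print(np.linalg.matrix_rank(X_collinear.T @ X_collinear))  # still 3
    print(np.linalg.cond(X_collinear.T @ X_collinear))         # astronomically large

This is the situation the warning message points at, and why even a tiny L2 penalty (alpha=1e-10 in the test) restores a well-posed problem.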
From 0906f9412202a4c90789d6795f1a5631968f6807 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 30 Jun 2022 11:38:35 +0200 Subject: [PATCH 62/97] overfit -> fit --- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index fc495d30f487d..0150486ad42d9 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -1076,7 +1076,7 @@ def test_linalg_warning_with_newton_solver(global_random_seed): ) original_newton_deviance = mean_poisson_deviance(y, reg.predict(X_orig)) - # We check that the model could successfully overfit information in X_orig + # We check that the model could successfully fit information in X_orig # to improve upon the constant baseline (when evaluated on the traing set). assert original_newton_deviance < constant_model_deviance - 1e-3 From 0aa83acebde765b290210f561baa2fbf4ca199a0 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 30 Jun 2022 11:39:13 +0200 Subject: [PATCH 63/97] Typo in comment --- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 0150486ad42d9..b96f02a5360f1 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -1118,6 +1118,6 @@ def test_linalg_warning_with_newton_solver(global_random_seed): warnings.simplefilter("error", scipy.linalg.LinAlgWarning) PoissonRegressor(solver="newton-cholesky", alpha=1e-10).fit(X_collinear, y) - # While for most random seed the deviance of this model is very close to + # While for most random seeds the deviance of this model is very close to # that of the unpenalized model, it is unfortunately not always the case so # we do not check such an assertion here. From 325c849dcd29f05b8c252210f1101c1344b6e80e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 30 Jun 2022 19:42:31 +0200 Subject: [PATCH 64/97] Apply suggestions from code review --- sklearn/linear_model/_glm/glm.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 3384042e92490..c1605ab435339 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -1259,8 +1259,6 @@ class PoissonRegressor(_GeneralizedLinearRegressor): array([10.676..., 21.875...]) """ - _parameter_constraints = {**_GeneralizedLinearRegressor._parameter_constraints} - def __init__( self, *, @@ -1387,8 +1385,6 @@ class GammaRegressor(_GeneralizedLinearRegressor): array([19.483..., 35.795...]) """ - _parameter_constraints = {**_GeneralizedLinearRegressor._parameter_constraints} - def __init__( self, *, From d4206d68247ca1b439657be31ce9d86250a53d8a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 1 Jul 2022 15:01:41 +0200 Subject: [PATCH 65/97] ENH fallback_lbfgs_solve - Do not use lbfgs steps, fall back complete to lbfgs --- sklearn/linear_model/_glm/glm.py | 89 +++++++++++++------------------- 1 file changed, 36 insertions(+), 53 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index c49504e4df1ac..8f5c1b30d09be 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -125,7 +125,7 @@ class NewtonSolver(ABC): iteration : int Number of Newton steps, i.e. 
calls to inner_solve - use_lbfgs_step : bool + use_fallback_lbfgs_solve : bool An inner solver can set this to True to resort to LBFGS for one iteration. gradient_times_newton : float @@ -191,55 +191,33 @@ def inner_solve(self, X, y, sample_weight): - gradient_times_newton """ - def lbfgs_step(self, X, y, sample_weight): - """Fallback for inner solver. + def fallback_lbfgs_solve(self, X, y, sample_weight): + """Fallback solver in case of emergency. - This is like inner_solve and line_search together. - It uses 4 lbfgs steps such that it takes advantage of updates of the - quasi-hessian, but not more steps in the hope that the normal inner solver can - take over again. + If a solver detects convergence problems, it may fall back to this methods in + the hope to exit with success instead of raising an error. - As in line_search sets: - - self.coef_old + Sets: - self.coef - - self.loss_value_old - - self.loss_value - - self.gradient_old - - self.gradient - - self.raw_prediction - As in inner_solver sets: - - self.coef_newton + - self.converged """ - self.coef_old = self.coef - self.loss_value_old = self.loss_value - self.gradient_old = self.gradient - opt_res = scipy.optimize.minimize( self.linear_loss.loss_gradient, self.coef, method="L-BFGS-B", jac=True, options={ - "maxiter": 4, - "maxls": 40, # default is 20 - "iprint": self.verbose - 2, + "maxiter": self.max_iter, + "maxls": 50, # default is 20 + "iprint": self.verbose - 1, "gtol": self.tol, - "ftol": 64 * np.finfo(np.float64).eps, # lbfgs is float64 land. + "ftol": 64 * np.finfo(np.float64).eps, }, args=(X, y, sample_weight, self.l2_reg_strength, self.n_threads), ) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res) self.coef = opt_res.x - self.coef_newton = self.coef - self.coef_old - _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X) - self.loss_value, self.gradient = self.linear_loss.loss_gradient( - coef=self.coef, - X=X, - y=y, - sample_weight=sample_weight, - l2_reg_strength=self.l2_reg_strength, - n_threads=self.n_threads, - raw_prediction=self.raw_prediction, - ) + self.converged = opt_res.status == 0 def line_search(self, X, y, sample_weight): """Backtracking line search. @@ -467,7 +445,7 @@ def solve(self, X, y, sample_weight): if self.verbose: print(f"Newton iter={self.iteration}") - self.use_lbfgs_step = False # Fallback for inner_solve. + self.use_fallback_lbfgs_solve = False # Fallback solver. # 1. Update hessian and gradient self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight) @@ -481,15 +459,14 @@ def solve(self, X, y, sample_weight): # Calculate Newton step/direction # This usually sets self.coef_newton. self.inner_solve(X=X, y=y, sample_weight=sample_weight) - if self.use_lbfgs_step: - self.lbfgs_step(X=X, y=y, sample_weight=sample_weight) + if self.use_fallback_lbfgs_solve: + break # 3. Backtracking line search # This usually sets self.coef_old, self.coef, self.loss_value_old # self.loss_value, self.gradient_old, self.gradient, # self.raw_prediction. - if not self.use_lbfgs_step: - self.line_search(X=X, y=y, sample_weight=sample_weight) + self.line_search(X=X, y=y, sample_weight=sample_weight) # 4. Check convergence # Sets self.converged. 
@@ -499,11 +476,17 @@ def solve(self, X, y, sample_weight): self.iteration += 1 if not self.converged: - warnings.warn( - "Newton solver did not converge after" - f" {self.iteration - 1} iterations.", - ConvergenceWarning, - ) + if self.use_fallback_lbfgs_solve: + # Note: The fallback solver circumvents check_convergence and relies on + # the convergence checks of lbfgs instead. Enough warnings have been + # raised on the way. + self.fallback_lbfgs_solve(X=X, y=y, sample_weight=sample_weight) + else: + warnings.warn( + "Newton solver did not converge after" + f" {self.iteration - 1} iterations.", + ConvergenceWarning, + ) self.iteration -= 1 self.finalize(X=X, y=y, sample_weight=sample_weight) @@ -539,9 +522,9 @@ def inner_solve(self, X, y, sample_weight): if self.verbose: print( " The inner solver detected a pointwise hessian with many " - "negative values and resorts to a few lbfgs steps." + "negative values and resorts to lbfgs instead." ) - self.use_lbfgs_step = True + self.use_fallback_lbfgs_solve = True return try: @@ -555,9 +538,9 @@ def inner_solve(self, X, y, sample_weight): if self.verbose: print( " The inner solver found a Newton step that is not a " - "descent direction and resorts to a few lbfgs steps." + "descent direction and resorts to lbfgs instead." ) - self.use_lbfgs_step = True + self.use_fallback_lbfgs_solve = True return except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: if self.count_singular == 0: @@ -590,9 +573,9 @@ def inner_solve(self, X, y, sample_weight): if self.verbose: print( " The inner solver stumbled upon an singular or ill-conditioned " - "hessian matrix and resorts to a few lbfgs steps." + "hessian matrix and resorts to lbfgs instead." ) - self.use_lbfgs_step = True + self.use_fallback_lbfgs_solve = True return @@ -688,9 +671,9 @@ def update_gradient_hessian(self, X, y, sample_weight): raw_prediction=self.raw_prediction, # this was updated in line_search ) - def lbfgs_step(self, X, y, sample_weight): + def fallback_lbfgs_solve(self, X, y, sample_weight): # Use R' instead of X - super().lbfgs_step(X=self.R.T, y=y, sample_weight=sample_weight) + super().fallback_lbfgs_solve(X=self.R.T, y=y, sample_weight=sample_weight) def line_search(self, X, y, sample_weight): # Use R' instead of X From 5e6aa9974a862db813e1300d5cf0b47a6aed23a5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 1 Jul 2022 15:03:00 +0200 Subject: [PATCH 66/97] ENH adapt rtol --- sklearn/linear_model/_glm/tests/test_glm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 75e8edfa67ad4..3fe18b630a82b 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -381,11 +381,10 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): else: # As it is an underdetermined problem, prediction = y. The following shows that # we get a solution, i.e. a (non-unique) minimum of the objective function ... 
- rtol = 1e-6 + rtol = 5e-5 if solver == "newton-cholesky": rtol = 5e-4 elif solver == "newton-qr-cholesky": - rtol = 5e-5 if isinstance(model, TweedieRegressor) and model.power == 1.5: pytest.xfail("newton-qr-cholesky fails on TweedieRegressor(power=1.5)") assert_allclose(model.predict(X), y, rtol=rtol) From 15192f1bac7dfcfe6a6709024c204c628d9c7bdd Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 1 Jul 2022 16:00:59 +0200 Subject: [PATCH 67/97] Improve test_linalg_warning_with_newton_solver --- sklearn/linear_model/_glm/tests/test_glm.py | 57 ++++++++++++--------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c9c9ff9b62cf1..dc254be0d2413 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -950,69 +950,76 @@ def test_family_deprecation(est, family): assert est.family.power == family.power -def test_linalg_warning_with_newton_solver(global_random_seed): +@pytest.mark.parametrize("newton_solver", ["newton-cholesky", "newton-qr-cholesky"]) +def test_linalg_warning_with_newton_solver(newton_solver, global_random_seed): rng = np.random.RandomState(global_random_seed) - X_orig = rng.normal(size=(10, 3)) + X_orig = rng.normal(size=(20, 3)) X_collinear = np.hstack([X_orig] * 10) # collinear design y = rng.poisson( np.exp(X_orig @ np.ones(X_orig.shape[1])), size=X_orig.shape[0] ).astype(np.float64) - # Let's consider the deviance of constant baseline on this problem: + # Let's consider the deviance of constant baseline on this problem. baseline_pred = np.full_like(y, y.astype(np.float64).mean()) constant_model_deviance = mean_poisson_deviance(y, baseline_pred) + assert constant_model_deviance > 1.0 # No warning raised on well-conditioned design, even without regularization. tol = 1e-10 with warnings.catch_warnings(): warnings.simplefilter("error") - reg = PoissonRegressor(solver="newton-cholesky", alpha=0.0, tol=tol).fit( - X_orig, y - ) + reg = PoissonRegressor(solver=newton_solver, alpha=0.0, tol=tol).fit(X_orig, y) original_newton_deviance = mean_poisson_deviance(y, reg.predict(X_orig)) - # We check that the model could successfully fit information in X_orig - # to improve upon the constant baseline (when evaluated on the traing set). - assert original_newton_deviance < constant_model_deviance - 1e-3 + # On this dataset, we should have enough data points in the original data + # to not make it possible to get a near zero deviance (for the any of the + # admissible random seeds). This will make it easier to interpret meaning + # of rtol in the subsequent assertions: + assert original_newton_deviance > 0.2 + + # We check that the model could successfully fit information in X_orig to + # improve upon the constant baseline by a large margin (when evaluated on + # the traing set). + assert constant_model_deviance - original_newton_deviance > 0.1 - # LBFGS is robust to collinear design because its approximation of the + # LBFGS is robust to a collinear design because its approximation of the # Hessian is Symmeric Positive Definite by construction. 
Let's record its # solution with warnings.catch_warnings(): warnings.simplefilter("error") reg = PoissonRegressor(solver="lbfgs", alpha=0.0, tol=tol).fit(X_collinear, y) collinear_lbfgs_deviance = mean_poisson_deviance(y, reg.predict(X_collinear)) - print() - print(f"{original_newton_deviance - collinear_lbfgs_deviance=}") # The LBFGS solution on the collinear is expected to reach a comparable - # solution. - rtol, atol = 1e-4, 1e-8 - assert collinear_lbfgs_deviance == pytest.approx( - original_newton_deviance, rel=rtol, abs=atol - ) + # solution to the Newton solution on the original data. + rtol = 1e-6 + assert collinear_lbfgs_deviance == pytest.approx(original_newton_deviance, rel=rtol) # Fitting on collinear data without regularization should raise an # informative warning and fallback to the LBFGS solver msg = ( - "The inner solver of CholeskyNewtonSolver stumbled upon a" + "The inner solver of .*NewtonSolver stumbled upon a" " singular or very ill-conditioned hessian matrix" ) with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): - reg = PoissonRegressor(solver="newton-cholesky", alpha=0.0, tol=tol).fit( + reg = PoissonRegressor(solver=newton_solver, alpha=0.0, tol=tol).fit( X_collinear, y ) collinear_newton_deviance = mean_poisson_deviance(y, reg.predict(X_collinear)) - assert collinear_newton_deviance == pytest.approx( - original_newton_deviance, rel=rtol, abs=atol + original_newton_deviance, rel=rtol ) # Increasing the regularization slightly should make the problem go away: with warnings.catch_warnings(): warnings.simplefilter("error", scipy.linalg.LinAlgWarning) - PoissonRegressor(solver="newton-cholesky", alpha=1e-10).fit(X_collinear, y) + reg = PoissonRegressor(solver=newton_solver, alpha=1e-10).fit(X_collinear, y) - # While for most random seeds the deviance of this model is very close to - # that of the unpenalized model, it is unfortunately not always the case so - # we do not check such an assertion here. + # The slightly penalized model on the collinear data should be close enough + # to the unpenalized model on the original data. + penalized_collinear_newton_deviance = mean_poisson_deviance( + y, reg.predict(X_collinear) + ) + assert penalized_collinear_newton_deviance == pytest.approx( + original_newton_deviance, rel=rtol + ) From 621ffd83c98ec77f9acb8bced7f6885824502a9b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 1 Jul 2022 16:06:04 +0200 Subject: [PATCH 68/97] Better comments --- sklearn/linear_model/_glm/tests/test_glm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index dc254be0d2413..4db0289bb14e8 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -995,8 +995,9 @@ def test_linalg_warning_with_newton_solver(newton_solver, global_random_seed): rtol = 1e-6 assert collinear_lbfgs_deviance == pytest.approx(original_newton_deviance, rel=rtol) - # Fitting on collinear data without regularization should raise an - # informative warning and fallback to the LBFGS solver + # Fitting a Newton solver on the collinear version of the training data + # without regularization should raise an informative warning and fallback + # to the LBFGS solver. 
msg = ( "The inner solver of .*NewtonSolver stumbled upon a" " singular or very ill-conditioned hessian matrix" @@ -1005,6 +1006,7 @@ def test_linalg_warning_with_newton_solver(newton_solver, global_random_seed): reg = PoissonRegressor(solver=newton_solver, alpha=0.0, tol=tol).fit( X_collinear, y ) + # As a result we should still automatically converge to a good solution. collinear_newton_deviance = mean_poisson_deviance(y, reg.predict(X_collinear)) assert collinear_newton_deviance == pytest.approx( original_newton_deviance, rel=rtol From 6413f0742d3609237bd6cd5d0eda05cfa421abd9 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 1 Jul 2022 16:38:10 +0200 Subject: [PATCH 69/97] Fixed Hessian casing and improved warning messages --- sklearn/linear_model/_glm/glm.py | 44 ++++++++++----------- sklearn/linear_model/_glm/tests/test_glm.py | 16 +++++--- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 8f5c1b30d09be..92f7af3ee9020 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -38,11 +38,11 @@ class NewtonSolver(ABC): This class implements Newton/2nd-order optimization routines for GLMs. Each Newton iteration aims at finding the Newton step which is done by the inner solver. With - hessian H, gradient g and coefficients coef, one step solves: + Hessian H, gradient g and coefficients coef, one step solves: H @ coef_newton = -g - For our GLM / LinearModelLoss, we have gradient g and hessian H: + For our GLM / LinearModelLoss, we have gradient g and Hessian H: g = X.T @ loss.gradient + l2_reg_strength * coef H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity @@ -93,7 +93,7 @@ class NewtonSolver(ABC): Maximum number of Newton steps allowed. n_threads : int, default=1 - Number of OpenMP threads to use for the computation of the hessian and gradient + Number of OpenMP threads to use for the computation of the Hessian and gradient of the loss function. Attributes @@ -180,7 +180,7 @@ def setup(self, X, y, sample_weight): @abstractmethod def update_gradient_hessian(self, X, y, sample_weight): - """Update gradient and hessian.""" + """Update gradient and Hessian.""" @abstractmethod def inner_solve(self, X, y, sample_weight): @@ -447,7 +447,7 @@ def solve(self, X, y, sample_weight): self.use_fallback_lbfgs_solve = False # Fallback solver. - # 1. Update hessian and gradient + # 1. Update Hessian and gradient self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight) # TODO: @@ -507,21 +507,20 @@ def setup(self, X, y, sample_weight): def inner_solve(self, X, y, sample_weight): if self.hessian_warning: - if self.count_bad_hessian == 0: + if self.count_hessian_warning == 0: # We only need to throw this warning once. warnings.warn( - f"The inner solver of {self.__class__.__name__} detected a " - " pointwise hessian with many negative values at iteration " - f"#{self.iteration}. It will now try a lbfgs step." - " Note that this warning is only raised once, the problem may," - " however, occur in several or all iterations. Set verbose >= 1" - " to get more information.\n", + f"The inner solver of {self.__class__.__name__} detected a" + " pointwise Hessian with many negative values at iteration" + f" #{self.iteration}. Switching from exact Newton steps to" + " Quasi-Newton steps using LBFGS until convegence." 
+ " Set verbose >= 1 to get more information.\n", ConvergenceWarning, ) self.count_hessian_warning += 1 if self.verbose: print( - " The inner solver detected a pointwise hessian with many " + " The inner solver detected a pointwise Hessian with many " "negative values and resorts to lbfgs instead." ) self.use_fallback_lbfgs_solve = True @@ -538,7 +537,7 @@ def inner_solve(self, X, y, sample_weight): if self.verbose: print( " The inner solver found a Newton step that is not a " - "descent direction and resorts to lbfgs instead." + "descent direction and resorts to LBFGS steps instead." ) self.use_fallback_lbfgs_solve = True return @@ -547,14 +546,13 @@ def inner_solve(self, X, y, sample_weight): # We only need to throw this warning once. warnings.warn( f"The inner solver of {self.__class__.__name__} stumbled upon a" - " singular or very ill-conditioned hessian matrix at iteration " - f"#{self.iteration}. It will now try a simple gradient step." - " Note that this warning is only raised once, the problem may," - " however, occur in several or all iterations. Set verbose >= 1" - " to get more information.\n" + " singular or very ill-conditioned Hessian matrix at iteration" + f" #{self.iteration}. Switching from exact Newton steps to" + " Quasi-Newton steps using LBFGS until convegence." + " Set verbose >= 1 to get more information.\n" "Your options are to use another solver or to avoid such situation" - " in the first place. Possible remedies are removing collinear" - " features of X or increasing the penalization strengths.\n" + " in the first place. Possible remedies are removing collinear" + " features of X or increasing the penalization strength.\n" "The original Linear Algebra message was:\n" + str(e), scipy.linalg.LinAlgWarning, @@ -567,13 +565,13 @@ def inner_solve(self, X, y, sample_weight): # This might be the most probable cause. # # There are many possible ways to deal with this situation. Most of them - # add, explicit or implicit, a matrix to the hessian to make it positive + # add, explicit or implicit, a matrix to the Hessian to make it positive # definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed. # Instead, we resort to a few lbfgs steps. if self.verbose: print( " The inner solver stumbled upon an singular or ill-conditioned " - "hessian matrix and resorts to lbfgs instead." + "Hessian matrix and resorts to LBFGS instead." ) self.use_fallback_lbfgs_solve = True return diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 4db0289bb14e8..b0d42582dda25 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -953,12 +953,16 @@ def test_family_deprecation(est, family): @pytest.mark.parametrize("newton_solver", ["newton-cholesky", "newton-qr-cholesky"]) def test_linalg_warning_with_newton_solver(newton_solver, global_random_seed): rng = np.random.RandomState(global_random_seed) + # Use at least 20 samples to reduce the likelihood to get a degenerate + # dataset for any global_random_seed. X_orig = rng.normal(size=(20, 3)) - X_collinear = np.hstack([X_orig] * 10) # collinear design y = rng.poisson( np.exp(X_orig @ np.ones(X_orig.shape[1])), size=X_orig.shape[0] ).astype(np.float64) + # Collinear variation of the same input features. + X_collinear = np.hstack([X_orig] * 10) + # Let's consider the deviance of constant baseline on this problem. 
baseline_pred = np.full_like(y, y.astype(np.float64).mean()) constant_model_deviance = mean_poisson_deviance(y, baseline_pred) @@ -971,10 +975,10 @@ def test_linalg_warning_with_newton_solver(newton_solver, global_random_seed): reg = PoissonRegressor(solver=newton_solver, alpha=0.0, tol=tol).fit(X_orig, y) original_newton_deviance = mean_poisson_deviance(y, reg.predict(X_orig)) - # On this dataset, we should have enough data points in the original data - # to not make it possible to get a near zero deviance (for the any of the - # admissible random seeds). This will make it easier to interpret meaning - # of rtol in the subsequent assertions: + # On this dataset, we should have enough data points to not make it + # possible to get a near zero deviance (for the any of the admissible + # random seeds). This will make it easier to interpret meaning of rtol in + # the subsequent assertions: assert original_newton_deviance > 0.2 # We check that the model could successfully fit information in X_orig to @@ -1000,7 +1004,7 @@ def test_linalg_warning_with_newton_solver(newton_solver, global_random_seed): # to the LBFGS solver. msg = ( "The inner solver of .*NewtonSolver stumbled upon a" - " singular or very ill-conditioned hessian matrix" + " singular or very ill-conditioned Hessian matrix" ) with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): reg = PoissonRegressor(solver=newton_solver, alpha=0.0, tol=tol).fit( From bfe3c38013978aab8c107cc304ca691ec4b67ef9 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 1 Jul 2022 16:38:44 +0200 Subject: [PATCH 70/97] [all random seeds] test_linalg_warning_with_newton_solver From fa9e8856db6de122ad67130ff7b6b0128488db9f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 1 Jul 2022 18:14:11 +0200 Subject: [PATCH 71/97] Ignore ConvergenceWarnings for now if convergence is good --- sklearn/linear_model/_glm/tests/test_glm.py | 46 ++++++++++++++------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index b0d42582dda25..2c166b975735e 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -288,7 +288,14 @@ def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): else: coef = coef_without_intercept intercept = 0 - model.fit(X, y) + + with warnings.catch_warnings(): + # XXX: Investigate if the ConvergenceWarning that can appear in some + # cases should be considered a bug or not. In the mean time we don't + # fail when the assertions below pass irrespective of the presence of + # the warning. + warnings.simplefilter("ignore", ConvergenceWarning) + model.fit(X, y) rtol = 2e-4 if solver == "lbfgs" else 5e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) @@ -365,10 +372,15 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): intercept = 0 with warnings.catch_warnings(): - if n_samples < n_features: - # TODO: implement a fallback mechanism to LBFGS avoid bad convergence. + if solver.startswith("newton") and n_samples < n_features: + # The newton solvers should warn and automatically fallback to LBFGS + # in this case. The model should still converge. warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) - warnings.filterwarnings("ignore", category=ConvergenceWarning) + # XXX: Investigate if the ConvergenceWarning that can appear in some + # cases should be considered a bug or not. 
In the mean time we don't + # fail when the assertions below pass irrespective of the presence of + # the warning. + warnings.filterwarnings("ignore", category=ConvergenceWarning) model.fit(X, y) # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails @@ -454,13 +466,15 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase assert np.linalg.matrix_rank(X) <= min(n_samples, n_features) with warnings.catch_warnings(): - if ( - solver == "lbfgs" and fit_intercept and n_samples < n_features - ) or solver in ["newton-cholesky", "newton-qr-cholesky"]: - # XXX: Investigate if the lack of convergence in this case should be - # considered a bug or not. + if solver.startswith("newton"): + # The newton solvers should warn and automatically fallback to LBFGS + # in this case. The model should still converge. warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) - warnings.filterwarnings("ignore", category=ConvergenceWarning) + # XXX: Investigate if the ConvergenceWarning that can appear in some + # cases should be considered a bug or not. In the mean time we don't + # fail when the assertions below pass irrespective of the presence of + # the warning. + warnings.filterwarnings("ignore", category=ConvergenceWarning) model.fit(X, y) if fit_intercept and n_samples < n_features: @@ -538,11 +552,15 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase y = np.r_[y, y] with warnings.catch_warnings(): - if n_samples < n_features: - # XXX: Implement a fallback mechanism to avoid lack of convergence - # in this case. + if solver.startswith("newton") and n_samples < n_features: + # The newton solvers should warn and automatically fallback to LBFGS + # in this case. The model should still converge. warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning) - warnings.filterwarnings("ignore", category=ConvergenceWarning) + # XXX: Investigate if the ConvergenceWarning that can appear in some + # cases should be considered a bug or not. In the mean time we don't + # fail when the assertions below pass irrespective of the presence of + # the warning. + warnings.filterwarnings("ignore", category=ConvergenceWarning) model.fit(X, y) if n_samples > n_features: From 7318a4fc85a27e6c8d28c18b22a93debffbe1ca1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 2 Jul 2022 09:53:05 +0200 Subject: [PATCH 72/97] CLN remove counting of warnings --- sklearn/linear_model/_glm/glm.py | 58 +++++++++++++------------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 92f7af3ee9020..140ddd3730480 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -71,7 +71,7 @@ class NewtonSolver(ABC): ---------- coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,), \ default=None - Start coefficients of a linear model. + Start/Initial coefficients of a linear model. If shape (n_classes * n_dof,), the classes of one feature are contiguous, i.e. one reconstructs the 2d-array via coef.reshape((n_classes, -1), order="F"). 
@@ -483,8 +483,8 @@ def solve(self, X, y, sample_weight): self.fallback_lbfgs_solve(X=X, y=y, sample_weight=sample_weight) else: warnings.warn( - "Newton solver did not converge after" - f" {self.iteration - 1} iterations.", + f"Newton solver did not converge after {self.iteration - 1} " + "iterations.", ConvergenceWarning, ) @@ -502,22 +502,15 @@ class BaseCholeskyNewtonSolver(NewtonSolver): def setup(self, X, y, sample_weight): super().setup(X=X, y=y, sample_weight=sample_weight) - self.count_singular = 0 - self.count_hessian_warning = 0 def inner_solve(self, X, y, sample_weight): if self.hessian_warning: - if self.count_hessian_warning == 0: - # We only need to throw this warning once. - warnings.warn( - f"The inner solver of {self.__class__.__name__} detected a" - " pointwise Hessian with many negative values at iteration" - f" #{self.iteration}. Switching from exact Newton steps to" - " Quasi-Newton steps using LBFGS until convegence." - " Set verbose >= 1 to get more information.\n", - ConvergenceWarning, - ) - self.count_hessian_warning += 1 + warnings.warn( + f"The inner solver of {self.__class__.__name__} detected a " + "pointwise hessian with many negative values at iteration " + f"#{self.iteration}. It will now resort to lbfgs instead.", + ConvergenceWarning, + ) if self.verbose: print( " The inner solver detected a pointwise Hessian with many " @@ -542,22 +535,17 @@ def inner_solve(self, X, y, sample_weight): self.use_fallback_lbfgs_solve = True return except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: - if self.count_singular == 0: - # We only need to throw this warning once. - warnings.warn( - f"The inner solver of {self.__class__.__name__} stumbled upon a" - " singular or very ill-conditioned Hessian matrix at iteration" - f" #{self.iteration}. Switching from exact Newton steps to" - " Quasi-Newton steps using LBFGS until convegence." - " Set verbose >= 1 to get more information.\n" - "Your options are to use another solver or to avoid such situation" - " in the first place. Possible remedies are removing collinear" - " features of X or increasing the penalization strength.\n" - "The original Linear Algebra message was:\n" - + str(e), - scipy.linalg.LinAlgWarning, - ) - self.count_singular += 1 + warnings.warn( + f"The inner solver of {self.__class__.__name__} stumbled upon a " + "singular or very ill-conditioned hessian matrix at iteration " + f"#{self.iteration}. It will now resort to lbfgs instead.\n" + "Further options are to use another solver or to avoid such situation " + "in the first place. Possible remedies are removing collinearfeatures " + "of X or increasing the penalization strengths.\n" + "The original Linear Algebra message was:\n" + + str(e), + scipy.linalg.LinAlgWarning, + ) # Possible causes: # 1. hess_pointwise is negative. But this is already taken care in # LinearModelLoss.gradient_hessian. @@ -565,9 +553,9 @@ def inner_solve(self, X, y, sample_weight): # This might be the most probable cause. # # There are many possible ways to deal with this situation. Most of them - # add, explicit or implicit, a matrix to the Hessian to make it positive - # definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed. - # Instead, we resort to a few lbfgs steps. + # add, explicitly or implicitly, a matrix to the hessian to make it + # positive definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed. + # Instead, we resort to lbfgs. 
if self.verbose: print( " The inner solver stumbled upon an singular or ill-conditioned " From 34e297e7b5c7c00536721535cea65b7c28ffb94e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 2 Jul 2022 10:09:04 +0200 Subject: [PATCH 73/97] ENH fall back to lbfgs if line search did not converge --- sklearn/linear_model/_glm/glm.py | 10 ++++++++-- sklearn/linear_model/_glm/tests/test_glm.py | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 140ddd3730480..6dceaf6860fe2 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -358,9 +358,13 @@ def line_search(self, X, y, sample_weight): warnings.warn( f"Line search of Newton solver {self.__class__.__name__} at iteration " f"#{self.iteration} did no converge after 21 line search refinement " - "iterations.", + "iterations. It will now resort to lbfgs instead.", ConvergenceWarning, ) + if self.verbose: + print(" Lines search did not converge and resorts to lbfgs instead.") + self.use_fallback_lbfgs_solve = True + return self.raw_prediction = raw @@ -467,6 +471,8 @@ def solve(self, X, y, sample_weight): # self.loss_value, self.gradient_old, self.gradient, # self.raw_prediction. self.line_search(X=X, y=y, sample_weight=sample_weight) + if self.use_fallback_lbfgs_solve: + break # 4. Check convergence # Sets self.converged. @@ -537,7 +543,7 @@ def inner_solve(self, X, y, sample_weight): except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: warnings.warn( f"The inner solver of {self.__class__.__name__} stumbled upon a " - "singular or very ill-conditioned hessian matrix at iteration " + "singular or very ill-conditioned Hessian matrix at iteration " f"#{self.iteration}. It will now resort to lbfgs instead.\n" "Further options are to use another solver or to avoid such situation " "in the first place. Possible remedies are removing collinearfeatures " diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 2c166b975735e..c1d086d73d994 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -1021,8 +1021,8 @@ def test_linalg_warning_with_newton_solver(newton_solver, global_random_seed): # without regularization should raise an informative warning and fallback # to the LBFGS solver. msg = ( - "The inner solver of .*NewtonSolver stumbled upon a" - " singular or very ill-conditioned Hessian matrix" + "The inner solver of .*NewtonSolver stumbled upon a singular or very " + "ill-conditioned Hessian matrix" ) with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): reg = PoissonRegressor(solver=newton_solver, alpha=0.0, tol=tol).fit( From d8c98a2a73f93a28c88f02c22adf229e3a610710 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 2 Jul 2022 10:27:14 +0200 Subject: [PATCH 74/97] DOC better comment on performance bottleneck --- sklearn/linear_model/_linear_loss.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index 710bc6e4f71dc..16200f493125b 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -456,8 +456,9 @@ def gradient_hessian( # Exit early without computing the hessian. return grad, hess, hessian_warning - # TODO: This "sandwich product", X' diag(W) X, can be greatly improved by - # a dedicated Cython routine. 
+ # TODO: This "sandwich product", X' diag(W) X, is the main computational + # bottleneck for solvers. A dedicated Cython routine might improve it + # exploiting the symmetry (as opposed to, e.g., BLAS gemm). if sparse.issparse(X): hess[:n_features, :n_features] = ( X.T @@ -467,9 +468,8 @@ def gradient_hessian( @ X ).toarray() else: - # np.einsum may use less memory but the following is by far faster. - # This matrix multiplication (gemm) is most often the most time - # consuming step for solvers. + # np.einsum may use less memory but the following, using BLAS matrix + # multiplication (gemm), is by far faster. WX = hess_pointwise[:, None] * X hess[:n_features, :n_features] = np.dot(X.T, WX) # flattened view on the array From c0ec17d4a9b0a4d6505a3d7ffbf184b62e597a4c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 5 Jul 2022 10:51:27 +0200 Subject: [PATCH 75/97] Update GLM related examples to use the new solver --- ...plot_poisson_regression_non_normal_loss.py | 8 +- ...lot_tweedie_regression_insurance_claims.py | 88 +++++++++++++++---- 2 files changed, 78 insertions(+), 18 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 5ef8f56980dea..46f5c23578b55 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -110,7 +110,11 @@ linear_model_preprocessor = ColumnTransformer( [ ("passthrough_numeric", "passthrough", ["BonusMalus"]), - ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "binned_numeric", + KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0), + ["VehAge", "DrivAge"], + ), ("log_scaled_numeric", log_scale_transformer, ["Density"]), ( "onehot_categorical", @@ -247,7 +251,7 @@ def score_estimator(estimator, df_test): poisson_glm = Pipeline( [ ("preprocessor", linear_model_preprocessor), - ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)), + ("regressor", PoissonRegressor(alpha=1e-12, solver="newton-cholesky")), ] ) poisson_glm.fit( diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 3d86903fcdeff..2381d70ba498a 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -56,12 +56,12 @@ from sklearn.metrics import mean_squared_error -def load_mtpl2(n_samples=100000): +def load_mtpl2(n_samples=None): """Fetch the French Motor Third-Party Liability Claims dataset. Parameters ---------- - n_samples: int, default=100000 + n_samples: int, default=None number of samples to select (for faster run time). Full dataset has 678013 samples. """ @@ -215,7 +215,7 @@ def score_estimator( from sklearn.compose import ColumnTransformer -df = load_mtpl2(n_samples=60000) +df = load_mtpl2() # Note: filter out claims with zero amount, as the severity model # requires strictly positive target values. 
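To make the updated comment in _linear_loss.py above concrete: the Hessian block is the weighted Gram ("sandwich") product X' diag(W) X, and the BLAS gemm formulation it refers to is numerically equivalent to the einsum one. A rough standalone sketch with random data (not the actual gradient_hessian method):

    import numpy as np

    rng = np.random.default_rng(0)
    n_samples, n_features = 1000, 20
    X = rng.normal(size=(n_samples, n_features))
    hess_pointwise = rng.uniform(0.1, 1.0, size=n_samples)  # diag(W), positive

    # gemm-based sandwich product, as in the dense branch of gradient_hessian
    WX = hess_pointwise[:, None] * X
    hess_gemm = X.T @ WX

    # einsum formulation: may use less memory but is typically slower
    hess_einsum = np.einsum("ij,i,ik->jk", X, hess_pointwise, X)

    np.testing.assert_allclose(hess_gemm, hess_einsum)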
@@ -233,7 +233,11 @@ def score_estimator( column_trans = ColumnTransformer( [ - ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "binned_numeric", + KBinsDiscretizer(n_bins=10, subsample=int(2e5), random_state=0), + ["VehAge", "DrivAge"], + ), ( "onehot_categorical", OneHotEncoder(), @@ -276,10 +280,25 @@ def score_estimator( df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) +# %% +# +# Let us keep in mind that despite the seemingly large number of data points in +# this dataset, the number of evaluation points where the claim amount is +# non-zero is comparatively quite small: +len(df_test) + +# %% +len(df_test[df_test["ClaimAmount"] > 0]) + +# %% +# +# As a consequence we can expect some significant variability in our +# evaluations upon random resampling for the train test split. +# # The parameters of the model are estimated by minimizing the Poisson deviance -# on the training set via a quasi-Newton solver: l-BFGS. Some of the features -# are collinear, we use a weak penalization to avoid numerical issues. -glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400) +# on the training set via a Newton solver. Some of the features are collinear, +# we use a weak penalization to avoid numerical issues. +glm_freq = PoissonRegressor(alpha=1e-4, solver="newton-cholesky") glm_freq.fit(X_train, df_train["Frequency"], sample_weight=df_train["Exposure"]) scores = score_estimator( @@ -295,6 +314,12 @@ def score_estimator( print(scores) # %% +# +# Note that the score measured on the test set is surprisingly better than a +# training set. This might be specific to this random split. Proper +# cross-validation is needed to assess how (un)stable our evaluation is under +# resampling. +# # We can visually compare observed and predicted values, aggregated by the # drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance # bonus/malus (``BonusMalus``). @@ -374,7 +399,7 @@ def score_estimator( mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 -glm_sev = GammaRegressor(alpha=10.0, max_iter=10000) +glm_sev = GammaRegressor(alpha=10.0, solver="newton-cholesky") glm_sev.fit( X_train[mask_train.values], @@ -395,13 +420,41 @@ def score_estimator( print(scores) # %% -# Here, the scores for the test data call for caution as they are -# significantly worse than for the training data indicating an overfit despite -# the strong regularization. # -# Note that the resulting model is the average claim amount per claim. As -# such, it is conditional on having at least one claim, and cannot be used to -# predict the average claim amount per policy in general. +# Those metrics are not necessarily easy to interpret. 
It can be useful to +# constrast those values to a model that does not use the input features and +# only predict the average claim amount in the same setting: + +from sklearn.dummy import DummyRegressor + +dummy_sev = DummyRegressor(strategy="mean") +dummy_sev.fit( + X_train[mask_train.values], + df_train.loc[mask_train, "AvgClaimAmount"], + sample_weight=df_train.loc[mask_train, "ClaimNb"], +) + +scores = score_estimator( + dummy_sev, + X_train[mask_train.values], + X_test[mask_test.values], + df_train[mask_train], + df_test[mask_test], + target="AvgClaimAmount", + weights="ClaimNb", +) +print("Evaluation of a mean predictor on target AvgClaimAmount") +print(scores) + +# %% +# +# We can conlude the claim amount is very challenging to predict, still the +# Gamma regressor is able to leverage some information from the input features +# to slighly improve upon the mean baseline in terms of D². +# +# Note that the resulting model is the average claim amount per claim. As such, +# it is conditional on having at least one claim, and cannot be used to predict +# the average claim amount per policy in general. print( "Mean AvgClaim Amount per policy: %.2f " @@ -415,7 +468,10 @@ def score_estimator( "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" % glm_sev.predict(X_train).mean() ) - +print( + "Predicted Mean AvgClaim Amount (dummy) | NbClaim > 0: %.2f" + % dummy_sev.predict(X_train).mean() +) # %% # We can visually compare observed and predicted values, aggregated for @@ -481,7 +537,7 @@ def score_estimator( from sklearn.linear_model import TweedieRegressor -glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, max_iter=10000) +glm_pure_premium = TweedieRegressor(power=1.9, alpha=0.1, solver="newton-cholesky") glm_pure_premium.fit( X_train, df_train["PurePremium"], sample_weight=df_train["Exposure"] ) From 0d698d0a2ae64f25d27997d193ccf80708e4cfbc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 16 Sep 2022 00:02:15 +0200 Subject: [PATCH 76/97] CLN address reviewer comments --- sklearn/linear_model/_glm/glm.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 6dceaf6860fe2..ee5e6ed590040 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -69,8 +69,7 @@ class NewtonSolver(ABC): Parameters ---------- - coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,), \ - default=None + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) Start/Initial coefficients of a linear model. If shape (n_classes * n_dof,), the classes of one feature are contiguous, i.e. one reconstructs the 2d-array via @@ -136,7 +135,7 @@ class NewtonSolver(ABC): def __init__( self, *, - coef=None, + coef, linear_loss=LinearModelLoss(base_loss=HalfSquaredError, fit_intercept=True), l2_reg_strength=0.0, tol=1e-4, @@ -161,13 +160,7 @@ def setup(self, X, y, sample_weight): - self.raw_prediction - self.loss_value """ - if self.coef is None: - self.coef = self.linear_loss.init_zero_coef(X) - self.raw_prediction = np.zeros_like(y) - else: - _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw( - self.coef, X - ) + _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X) self.loss_value = self.linear_loss.loss( coef=self.coef, X=X, @@ -546,8 +539,8 @@ def inner_solve(self, X, y, sample_weight): "singular or very ill-conditioned Hessian matrix at iteration " f"#{self.iteration}. 
It will now resort to lbfgs instead.\n" "Further options are to use another solver or to avoid such situation " - "in the first place. Possible remedies are removing collinearfeatures " - "of X or increasing the penalization strengths.\n" + "in the first place. Possible remedies are removing collinear features" + " of X or increasing the penalization strengths.\n" "The original Linear Algebra message was:\n" + str(e), scipy.linalg.LinAlgWarning, From beeb77412fb46f76eec28d1e7be609e360879b37 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 5 Oct 2022 20:59:15 +0200 Subject: [PATCH 77/97] EXA improve some wordings --- ...lot_tweedie_regression_insurance_claims.py | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 2381d70ba498a..5f98e716afa78 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -284,7 +284,7 @@ def score_estimator( # # Let us keep in mind that despite the seemingly large number of data points in # this dataset, the number of evaluation points where the claim amount is -# non-zero is comparatively quite small: +# non-zero is quite small: len(df_test) # %% @@ -292,12 +292,13 @@ def score_estimator( # %% # -# As a consequence we can expect some significant variability in our -# evaluations upon random resampling for the train test split. +# As a consequence, we expect a significant variability in our +# evaluation upon random resampling of the train test split. # # The parameters of the model are estimated by minimizing the Poisson deviance -# on the training set via a Newton solver. Some of the features are collinear, -# we use a weak penalization to avoid numerical issues. +# on the training set via a Newton solver. Some of the features are collinear +# (e.g. because we did not drop any categorical level in the `OneHotEncoder`), +# we use a weak L2 penalization to avoid numerical issues. glm_freq = PoissonRegressor(alpha=1e-4, solver="newton-cholesky") glm_freq.fit(X_train, df_train["Frequency"], sample_weight=df_train["Exposure"]) @@ -315,10 +316,10 @@ def score_estimator( # %% # -# Note that the score measured on the test set is surprisingly better than a -# training set. This might be specific to this random split. Proper -# cross-validation is needed to assess how (un)stable our evaluation is under -# resampling. +# Note that the score measured on the test set is surprisingly better than on +# the training set. This might be specific to this random train-test split. +# Proper cross-validation could help us to assess the sampling variability of +# these results. # # We can visually compare observed and predicted values, aggregated by the # drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance @@ -421,9 +422,10 @@ def score_estimator( # %% # -# Those metrics are not necessarily easy to interpret. It can be useful to -# constrast those values to a model that does not use the input features and -# only predict the average claim amount in the same setting: +# Those metric values are not necessarily easy to interpret. It can be +# insightful to compare them with a model that does not use any input +# features and always predicts a constant value, i.e. 
the average claim +# amount, in the same setting: from sklearn.dummy import DummyRegressor @@ -448,13 +450,14 @@ def score_estimator( # %% # -# We can conlude the claim amount is very challenging to predict, still the +# We conlude that the claim amount is very challenging to predict. Still, the # Gamma regressor is able to leverage some information from the input features # to slighly improve upon the mean baseline in terms of D². # # Note that the resulting model is the average claim amount per claim. As such, # it is conditional on having at least one claim, and cannot be used to predict -# the average claim amount per policy in general. +# the average claim amount per policy. For this, it needs to be combined with +# a claims frequency model. print( "Mean AvgClaim Amount per policy: %.2f " From 7c46dd896cfbd31a4ce8ab908fdfe8e15020847b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 8 Oct 2022 17:25:56 +0200 Subject: [PATCH 78/97] CLN do not pop "solver in parameter constraints --- sklearn/linear_model/_glm/glm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 1e85111ef3917..f6eca4325ecfb 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -1235,7 +1235,6 @@ class PoissonRegressor(_GeneralizedLinearRegressor): _parameter_constraints: dict = { **_GeneralizedLinearRegressor._parameter_constraints } - _parameter_constraints.pop("solver") def __init__( self, @@ -1366,7 +1365,6 @@ class GammaRegressor(_GeneralizedLinearRegressor): _parameter_constraints: dict = { **_GeneralizedLinearRegressor._parameter_constraints } - _parameter_constraints.pop("solver") def __init__( self, From 41e7c42330268ba27ddf7fd76430515dc3d1f12e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 Oct 2022 16:35:27 +0200 Subject: [PATCH 79/97] CLN fix typos --- sklearn/linear_model/_glm/glm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f6eca4325ecfb..50551ec8c5125 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -355,7 +355,7 @@ def line_search(self, X, y, sample_weight): ConvergenceWarning, ) if self.verbose: - print(" Lines search did not converge and resorts to lbfgs instead.") + print(" Line search did not converge and resorts to lbfgs instead.") self.use_fallback_lbfgs_solve = True return @@ -605,8 +605,8 @@ class QRCholeskyNewtonSolver(BaseCholeskyNewtonSolver): X' = QR with Q'Q = identity(k), k = min(n_samples, n_features) - This is the same as an LQ decomposition of X. We introduce the new variable t as, - see [1]: + This is the same as an LQ decomposition of X. We introduce the new variable z, see + [1], as: (coef, intercept) = (Q @ z, intercept) From 9097536519d19b4d2317a941900d9ffef5f073a2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 Oct 2022 16:56:57 +0200 Subject: [PATCH 80/97] DOC fix docstring --- sklearn/linear_model/_glm/glm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 50551ec8c5125..0441241fa8a50 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -74,7 +74,6 @@ class NewtonSolver(ABC): If shape (n_classes * n_dof,), the classes of one feature are contiguous, i.e. one reconstructs the 2d-array via coef.reshape((n_classes, -1), order="F"). - If None, they are initialized with zero. 
linear_loss : LinearModelLoss The loss to be minimized. From a1731248d5699783c2e8292d749030156c6de537 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 11 Oct 2022 20:40:39 +0200 Subject: [PATCH 81/97] CLN remove solver newton-qr-cholesky --- doc/whats_new/v1.2.rst | 7 +- sklearn/linear_model/_glm/glm.py | 187 ++++---------------- sklearn/linear_model/_glm/tests/test_glm.py | 19 +- 3 files changed, 48 insertions(+), 165 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index de9fd92f04973..b8ea862ea2197 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -320,9 +320,10 @@ Changelog - |Enhancement| :class:`linear_model.GammaRegressor`, :class:`linear_model.PoissonRegressor` and :class:`linear_model.TweedieRegressor` got - a `solver` parameter with the two new solvers `solver="newton-cholesky"` and - `solver="newton-qr-cholesky"`. Those are 2nd order (Newton) optimisation routines - that may reach higher precision in less time than the already available `"lbfgs"`. + a new solver `solver="newton-cholesky"`. This is a 2nd order (Newton) optimisation + routines that uses a Cholesky decomposition of the hessian matrix. It may reach + higher precision in less time than the already available `"lbfgs"`, much depending + on the training data. :pr:`23314` by :user:`Christian Lorentzen `. - |Enhancement| :class:`linear_model.GammaRegressor`, diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0441241fa8a50..5458ba932f3d0 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -491,7 +491,7 @@ def solve(self, X, y, sample_weight): return self.coef -class BaseCholeskyNewtonSolver(NewtonSolver): +class CholeskyNewtonSolver(NewtonSolver): """Cholesky based Newton solver. Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear @@ -500,6 +500,24 @@ class BaseCholeskyNewtonSolver(NewtonSolver): def setup(self, X, y, sample_weight): super().setup(X=X, y=y, sample_weight=sample_weight) + n_dof = X.shape[1] + if self.linear_loss.fit_intercept: + n_dof += 1 + self.gradient = np.empty_like(self.coef) + self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof)) + + def update_gradient_hessian(self, X, y, sample_weight): + _, _, self.hessian_warning = self.linear_loss.gradient_hessian( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + gradient_out=self.gradient, + hessian_out=self.hessian, + raw_prediction=self.raw_prediction, # this was updated in line_search + ) def inner_solve(self, X, y, sample_weight): if self.hessian_warning: @@ -563,119 +581,6 @@ def inner_solve(self, X, y, sample_weight): return -class CholeskyNewtonSolver(BaseCholeskyNewtonSolver): - """Cholesky based Newton solver. - - Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear - solver. 
- """ - - def setup(self, X, y, sample_weight): - super().setup(X=X, y=y, sample_weight=sample_weight) - - n_dof = X.shape[1] - if self.linear_loss.fit_intercept: - n_dof += 1 - self.gradient = np.empty_like(self.coef) - self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof)) - - def update_gradient_hessian(self, X, y, sample_weight): - _, _, self.hessian_warning = self.linear_loss.gradient_hessian( - coef=self.coef, - X=X, - y=y, - sample_weight=sample_weight, - l2_reg_strength=self.l2_reg_strength, - n_threads=self.n_threads, - gradient_out=self.gradient, - hessian_out=self.hessian, - raw_prediction=self.raw_prediction, # this was updated in line_search - ) - - -class QRCholeskyNewtonSolver(BaseCholeskyNewtonSolver): - """QR and Cholesky based Newton solver. - - This is a good solver for n_features >> n_samples, see [1]. - - This solver uses the structure of the problem, i.e. the fact that coef enters the - loss function only as X @ coef and ||coef||_2, and starts with an economic QR - decomposition of X': - - X' = QR with Q'Q = identity(k), k = min(n_samples, n_features) - - This is the same as an LQ decomposition of X. We introduce the new variable z, see - [1], as: - - (coef, intercept) = (Q @ z, intercept) - - By using X @ coef = R' @ z and ||coef||_2 = ||z||_2, we can just replace X - by R', solve for z instead of coef, and finally get coef = Q @ z. - Note that z has less elements than coef if n_features > n_samples: - len(z) = k = min(n_samples, n_features) <= n_features = len(coef). - - [1] Hastie, T.J., & Tibshirani, R. (2003). Expression Arrays and the p n Problem. - https://web.stanford.edu/~hastie/Papers/pgtn.pdf - """ - - def setup(self, X, y, sample_weight): - n_samples, n_features = X.shape - # TODO: setting pivoting=True could improve stability - # QR of X' - self.Q, self.R = scipy.linalg.qr( - X.T, mode="economic", pivoting=False, check_finite=False - ) - # use k = min(n_features, n_samples) instead of n_features - k = self.R.T.shape[1] - n_dof = k - if self.linear_loss.fit_intercept: - n_dof += 1 - # store original coef - self.coef_original = self.coef - # set self.coef = z (coef_original = Q @ z) - self.coef = np.zeros_like(self.coef, shape=n_dof) - if np.sum(np.abs(self.coef_original)) > 0: - self.coef[:k] = self.Q.T @ self.coef_original[:n_features] - self.gradient = np.empty_like(self.coef) - self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof)) - - super().setup(X=self.R.T, y=y, sample_weight=sample_weight) - - def update_gradient_hessian(self, X, y, sample_weight): - # Use R' instead of X - _, _, self.hessian_warning = self.linear_loss.gradient_hessian( - coef=self.coef, - X=self.R.T, - y=y, - sample_weight=sample_weight, - l2_reg_strength=self.l2_reg_strength, - n_threads=self.n_threads, - gradient_out=self.gradient, - hessian_out=self.hessian, - raw_prediction=self.raw_prediction, # this was updated in line_search - ) - - def fallback_lbfgs_solve(self, X, y, sample_weight): - # Use R' instead of X - super().fallback_lbfgs_solve(X=self.R.T, y=y, sample_weight=sample_weight) - - def line_search(self, X, y, sample_weight): - # Use R' instead of X - super().line_search(X=self.R.T, y=y, sample_weight=sample_weight) - - def check_convergence(self, X, y, sample_weight): - # Use R' instead of X - super().check_convergence(X=self.R.T, y=y, sample_weight=sample_weight) - - def finalize(self, X, y, sample_weight): - n_features = X.shape[1] - w, intercept = self.linear_loss.weight_intercept(self.coef) - self.coef_original[:n_features] = self.Q @ w - if 
self.linear_loss.fit_intercept: - self.coef_original[-1] = intercept - self.coef = self.coef_original - - class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): """Regression via a penalized Generalized Linear Model (GLM). @@ -713,19 +618,16 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - solver : {'lbfgs', 'newton-cholesky', 'newton-qr-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' Calls scipy's L-BFGS-B optimizer. 'newton-cholesky' - Uses Newton-Raphson steps (equivalent to iterated reweighted least squares) - with an inner cholesky based solver. - - 'newton-qr-cholesky' - Same as 'newton-cholesky' but uses a QR decomposition of X.T. This solver - is better for `n_features >> n_samples` than 'newton-cholesky'. + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is suited for n_samples >> n_features. max_iter : int, default=100 The maximal number of iterations for the solver. @@ -786,7 +688,7 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): "alpha": [Interval(Real, 0.0, None, closed="left")], "fit_intercept": ["boolean"], "solver": [ - StrOptions({"lbfgs", "newton-cholesky", "newton-qr-cholesky"}), + StrOptions({"lbfgs", "newton-cholesky"}), Hidden(type), ], "max_iter": [Interval(Integral, 1, None, closed="left")], @@ -930,12 +832,8 @@ def fit(self, X, y, sample_weight=None): ) self.n_iter_ = _check_optimize_result("lbfgs", opt_res) coef = opt_res.x - elif self.solver in ["newton-cholesky", "newton-qr-cholesky"]: - sol_dict = { - "newton-cholesky": CholeskyNewtonSolver, - "newton-qr-cholesky": QRCholeskyNewtonSolver, - } - sol = sol_dict[self.solver]( + elif self.solver == "newton-cholesky": + sol = CholeskyNewtonSolver( coef=coef, linear_loss=linear_loss, l2_reg_strength=l2_reg_strength, @@ -1153,19 +1051,16 @@ class PoissonRegressor(_GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - solver : {'lbfgs', 'newton-cholesky', 'newton-qr-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' Calls scipy's L-BFGS-B optimizer. 'newton-cholesky' - Uses Newton-Raphson steps (equivalent to iterated reweighted least squares) - with an inner cholesky based solver. - - 'newton-qr-cholesky' - Same as 'newton-cholesky' but uses a QR decomposition of X.T. This solver - is better for `n_features >> n_samples` than 'newton-cholesky'. + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is suited for n_samples >> n_features. max_iter : int, default=100 The maximal number of iterations for the solver. @@ -1282,19 +1177,16 @@ class GammaRegressor(_GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - solver : {'lbfgs', 'newton-cholesky', 'newton-qr-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' Calls scipy's L-BFGS-B optimizer. 
'newton-cholesky' - Uses Newton-Raphson steps (equivalent to iterated reweighted least squares) - with an inner cholesky based solver. - - 'newton-qr-cholesky' - Same as 'newton-cholesky' but uses a QR decomposition of X.T. This solver - is better for `n_features >> n_samples` than 'newton-cholesky'. + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is suited for n_samples >> n_features. max_iter : int, default=100 The maximal number of iterations for the solver. @@ -1442,19 +1334,16 @@ class TweedieRegressor(_GeneralizedLinearRegressor): - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian distributions - solver : {'lbfgs', 'newton-cholesky', 'newton-qr-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' Calls scipy's L-BFGS-B optimizer. 'newton-cholesky' - Uses Newton-Raphson steps (equivalent to iterated reweighted least squares) - with an inner cholesky based solver. - - 'newton-qr-cholesky' - Same as 'newton-cholesky' but uses a QR decomposition of X.T. This solver - is better for `n_features >> n_samples` than 'newton-cholesky'. + Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to + iterated reweighted least squares) with an inner Cholesky based solver. + This solver is suited for n_samples >> n_features. max_iter : int, default=100 The maximal number of iterations for the solver. diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c1d086d73d994..c76d06c2a57c4 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -32,7 +32,7 @@ from sklearn.model_selection import train_test_split -SOLVERS = ["lbfgs", "newton-cholesky", "newton-qr-cholesky"] +SOLVERS = ["lbfgs", "newton-cholesky"] class BinomialRegressor(_GeneralizedLinearRegressor): @@ -238,7 +238,7 @@ def test_glm_regression(solver, fit_intercept, glm_dataset): intercept = 0 with warnings.catch_warnings(): - if solver in ["newton-cholesky", "newton-qr-cholesky"]: + if solver == "newton-cholesky": warnings.filterwarnings( action="ignore", message=".*pointwise hessian to have many non-positive values.*", @@ -396,9 +396,6 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): rtol = 5e-5 if solver == "newton-cholesky": rtol = 5e-4 - elif solver == "newton-qr-cholesky": - if isinstance(model, TweedieRegressor) and model.power == 1.5: - pytest.xfail("newton-qr-cholesky fails on TweedieRegressor(power=1.5)") assert_allclose(model.predict(X), y, rtol=rtol) norm_solution = np.linalg.norm(np.r_[intercept, coef]) @@ -490,10 +487,6 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase if n_samples > n_features: assert model_intercept == pytest.approx(intercept) rtol = 1e-4 - if solver == "newton-qr-cholesky": - rtol = 5e-4 - if isinstance(model, TweedieRegressor) and model.power == 1.5: - pytest.xfail("newton-qr-cholesky fails on TweedieRegressor(power=1.5)") assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol) else: # As it is an underdetermined problem, prediction = y. 
The following shows that @@ -512,7 +505,7 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase # For minimum norm solution, we would have # assert model.intercept_ == pytest.approx(model.coef_[-1]) else: - rtol = 5e-5 if solver == "newton-qr-cholesky" else 5e-6 + rtol = 5e-6 assert model_intercept == pytest.approx(intercept, rel=rtol) assert_allclose(model_coef, np.r_[coef, coef], rtol=1e-4) @@ -857,7 +850,7 @@ def test_normal_ridge_comparison( assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=2e-4) -@pytest.mark.parametrize("solver", ["lbfgs", "newton-cholesky", "newton-qr-cholesky"]) +@pytest.mark.parametrize("solver", ["lbfgs", "newton-cholesky"]) def test_poisson_glmnet(solver): """Compare Poisson regression with L2 regularization and LogLink to glmnet""" # library("glmnet") @@ -968,8 +961,8 @@ def test_family_deprecation(est, family): assert est.family.power == family.power -@pytest.mark.parametrize("newton_solver", ["newton-cholesky", "newton-qr-cholesky"]) -def test_linalg_warning_with_newton_solver(newton_solver, global_random_seed): +def test_linalg_warning_with_newton_solver(global_random_seed): + newton_solver = "newton-cholesky" rng = np.random.RandomState(global_random_seed) # Use at least 20 samples to reduce the likelihood to get a degenerate # dataset for any global_random_seed. From 049a2fc6660744e23e29c84f66003e44716f32e9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 11 Oct 2022 20:49:21 +0200 Subject: [PATCH 82/97] DOC update PR number in whatsnew --- doc/whats_new/v1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index b8ea862ea2197..3b659fc857fe7 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -324,7 +324,7 @@ Changelog routines that uses a Cholesky decomposition of the hessian matrix. It may reach higher precision in less time than the already available `"lbfgs"`, much depending on the training data. - :pr:`23314` by :user:`Christian Lorentzen `. + :pr:`24637` by :user:`Christian Lorentzen `. - |Enhancement| :class:`linear_model.GammaRegressor`, :class:`linear_model.PoissonRegressor` and :class:`linear_model.TweedieRegressor` From f225453dc1d1119dccd0010abac54cf26f39f652 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 14 Oct 2022 18:22:36 +0200 Subject: [PATCH 83/97] CLN address review comments --- doc/whats_new/v1.2.rst | 2 +- sklearn/linear_model/_glm/tests/test_glm.py | 9 ++++----- sklearn/linear_model/tests/test_linear_loss.py | 9 +++++++++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index f745311843923..612fcb0b65cd8 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -348,7 +348,7 @@ Changelog - |Enhancement| :class:`linear_model.GammaRegressor`, :class:`linear_model.PoissonRegressor` and :class:`linear_model.TweedieRegressor` got a new solver `solver="newton-cholesky"`. This is a 2nd order (Newton) optimisation - routines that uses a Cholesky decomposition of the hessian matrix. It may reach + routine that uses a Cholesky decomposition of the hessian matrix. It may reach higher precision in less time than the already available `"lbfgs"`, much depending on the training data. :pr:`24637` by :user:`Christian Lorentzen `. 
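
A minimal usage sketch of the new solver on the kind of problem described above
(many samples, one-hot encoded categorical features with some rare levels). The
data below is synthetic and purely illustrative, and
`OneHotEncoder(sparse_output=False)` assumes scikit-learn >= 1.2:

    import numpy as np
    from sklearn.linear_model import PoissonRegressor
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import OneHotEncoder

    rng = np.random.RandomState(0)
    n_samples = 10_000
    # One categorical feature with two rare levels.
    X = rng.choice(
        ["a", "b", "c", "rare_1", "rare_2"],
        p=[0.5, 0.3, 0.18, 0.01, 0.01],
        size=n_samples,
    ).reshape(-1, 1)
    # Poisson counts whose rate depends (weakly) on the category.
    y = rng.poisson(lam=np.where(X.ravel() == "a", 0.5, 1.5))

    model = make_pipeline(
        OneHotEncoder(sparse_output=False),
        PoissonRegressor(solver="newton-cholesky", alpha=1e-4),
    )
    model.fit(X, y)
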
diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 09b9463595821..c2f11d145d0cd 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -505,8 +505,7 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase # For minimum norm solution, we would have # assert model.intercept_ == pytest.approx(model.coef_[-1]) else: - rtol = 5e-6 - assert model_intercept == pytest.approx(intercept, rel=rtol) + assert model_intercept == pytest.approx(intercept, rel=5e-6) assert_allclose(model_coef, np.r_[coef, coef], rtol=1e-4) @@ -961,7 +960,7 @@ def test_family_deprecation(est, family): def test_linalg_warning_with_newton_solver(global_random_seed): newton_solver = "newton-cholesky" rng = np.random.RandomState(global_random_seed) - # Use at least 20 samples to reduce the likelihood to get a degenerate + # Use at least 20 samples to reduce the likelihood of getting a degenerate # dataset for any global_random_seed. X_orig = rng.normal(size=(20, 3)) y = rng.poisson( @@ -971,8 +970,8 @@ def test_linalg_warning_with_newton_solver(global_random_seed): # Collinear variation of the same input features. X_collinear = np.hstack([X_orig] * 10) - # Let's consider the deviance of constant baseline on this problem. - baseline_pred = np.full_like(y, y.astype(np.float64).mean()) + # Let's consider the deviance of a constant baseline on this problem. + baseline_pred = np.full_like(y, y.mean()) constant_model_deviance = mean_poisson_deviance(y, baseline_pred) assert constant_model_deviance > 1.0 diff --git a/sklearn/linear_model/tests/test_linear_loss.py b/sklearn/linear_model/tests/test_linear_loss.py index c48680a282611..27e363e0097fc 100644 --- a/sklearn/linear_model/tests/test_linear_loss.py +++ b/sklearn/linear_model/tests/test_linear_loss.py @@ -109,6 +109,15 @@ def test_loss_grad_hess_are_the_same( g4, h4, _ = loss.gradient_hessian( coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength ) + else: + with pytest.raises(NotImplementedError): + loss.gradient_hessian( + coef, + X, + y, + sample_weight=sample_weight, + l2_reg_strength=l2_reg_strength, + ) assert_allclose(l1, l2) assert_allclose(g1, g2) From 28b3820c2b93114cd9f7e2ae83b0c365b5b3484d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 14 Oct 2022 18:56:18 +0200 Subject: [PATCH 84/97] CLN remove unnecessary catch_warnings --- sklearn/linear_model/_glm/tests/test_glm.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c2f11d145d0cd..7970041baa3b3 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -237,14 +237,7 @@ def test_glm_regression(solver, fit_intercept, glm_dataset): coef = coef_without_intercept intercept = 0 - with warnings.catch_warnings(): - if solver == "newton-cholesky": - warnings.filterwarnings( - action="ignore", - message=".*pointwise hessian to have many non-positive values.*", - category=ConvergenceWarning, - ) - model.fit(X, y) + model.fit(X, y) rtol = 5e-5 if solver == "lbfgs" else 1e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) From 46841bdf611c1ebcdc6a3435ed77b2882ad0b8df Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 Oct 2022 18:48:28 +0200 Subject: [PATCH 85/97] CLN address some review comments --- .../plot_tweedie_regression_insurance_claims.py | 9 +++++---- 
sklearn/linear_model/_glm/glm.py | 16 ++++++++++++---- sklearn/linear_model/_linear_loss.py | 2 +- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 5f98e716afa78..10a862127dc65 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -422,7 +422,7 @@ def score_estimator( # %% # -# Those metric values are not necessarily easy to interpret. It can be +# Those values of the metrics are not necessarily easy to interpret. It can be # insightful to compare them with a model that does not use any input # features and always predicts a constant value, i.e. the average claim # amount, in the same setting: @@ -450,9 +450,10 @@ def score_estimator( # %% # -# We conlude that the claim amount is very challenging to predict. Still, the -# Gamma regressor is able to leverage some information from the input features -# to slighly improve upon the mean baseline in terms of D². +# We conclude that the claim amount is very challenging to predict. Still, the +# :class:`~sklearn.linear.GammaRegressor` is able to leverage some information +# from the input features to slighly improve upon the mean baseline in terms +# of D². # # Note that the resulting model is the average claim amount per claim. As such, # it is conditional on having at least one claim, and cannot be used to predict diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 3220dd567830f..9a19f18bba4de 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -70,7 +70,7 @@ class NewtonSolver(ABC): Parameters ---------- coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) - Start/Initial coefficients of a linear model. + Initial coefficients of a linear model. If shape (n_classes * n_dof,), the classes of one feature are contiguous, i.e. one reconstructs the 2d-array via coef.reshape((n_classes, -1), order="F"). @@ -114,8 +114,7 @@ class NewtonSolver(ABC): loss_value_old : float Value of objective function of previous itertion. - raw_prediction : ndarray of shape (n_samples,) or \ - (n_samples, n_classes) + raw_prediction : ndarray of shape (n_samples,) or (n_samples, n_classes) converged : bool Indicator for convergence of the solver. @@ -124,7 +123,8 @@ class NewtonSolver(ABC): Number of Newton steps, i.e. calls to inner_solve use_fallback_lbfgs_solve : bool - An inner solver can set this to True to resort to LBFGS for one iteration. + If set to True, the solver will resort to call LBFGS to finish the optimisation + procedure in case of convergence issues. gradient_times_newton : float gradient @ coef_newton, set in inner_solve and used by line_search. If the @@ -629,6 +629,8 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): iterated reweighted least squares) with an inner Cholesky based solver. This solver is suited for n_samples >> n_features. + .. versionadded:: 1.2 + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. @@ -1066,6 +1068,8 @@ class PoissonRegressor(_GeneralizedLinearRegressor): iterated reweighted least squares) with an inner Cholesky based solver. This solver is suited for n_samples >> n_features. + .. versionadded:: 1.2 + max_iter : int, default=100 The maximal number of iterations for the solver. 
Values must be in the range `[1, inf)`. @@ -1192,6 +1196,8 @@ class GammaRegressor(_GeneralizedLinearRegressor): iterated reweighted least squares) with an inner Cholesky based solver. This solver is suited for n_samples >> n_features. + .. versionadded:: 1.2 + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. @@ -1349,6 +1355,8 @@ class TweedieRegressor(_GeneralizedLinearRegressor): iterated reweighted least squares) with an inner Cholesky based solver. This solver is suited for n_samples >> n_features. + .. versionadded:: 1.2 + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index 16200f493125b..463c5fc440d01 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -433,7 +433,7 @@ def gradient_hessian( # For non-canonical link functions and far away from the optimum, the pointwise # hessian can be negative. We take care that 75% ot the hessian entries are # positive. - hessian_warning = np.sum(hess_pointwise <= 0) > len(hess_pointwise) * 0.25 + hessian_warning = np.mean(hess_pointwise <= 0) > 0.25 hess_pointwise = np.abs(hess_pointwise) if not self.base_loss.is_multiclass: From 02c4245b901b0e278a03b234d2fac68f759d25c2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 23 Oct 2022 14:09:29 +0200 Subject: [PATCH 86/97] DOC more precise whatsnew --- doc/whats_new/v1.2.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 612fcb0b65cd8..545f1fbc4a618 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -348,9 +348,11 @@ Changelog - |Enhancement| :class:`linear_model.GammaRegressor`, :class:`linear_model.PoissonRegressor` and :class:`linear_model.TweedieRegressor` got a new solver `solver="newton-cholesky"`. This is a 2nd order (Newton) optimisation - routine that uses a Cholesky decomposition of the hessian matrix. It may reach - higher precision in less time than the already available `"lbfgs"`, much depending - on the training data. + routine that uses a Cholesky decomposition of the hessian matrix. + When `n_samples >> n_features`, the `"newton-cholesky"` solver has been observed to + converge both faster and to a higher precision solution than the `"lbfgs"` solver on + problems with one-hot encoded categorical variables with some rare categorical + levels. :pr:`24637` by :user:`Christian Lorentzen `. - |Enhancement| :class:`linear_model.GammaRegressor`, From f841e5414949c7d53455a490c8027828d1f128e2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 23 Oct 2022 14:15:49 +0200 Subject: [PATCH 87/97] CLN use init_zero_coef --- sklearn/linear_model/_glm/glm.py | 4 ++-- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 9a19f18bba4de..5c63d5e7ceb97 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -491,7 +491,7 @@ def solve(self, X, y, sample_weight): return self.coef -class CholeskyNewtonSolver(NewtonSolver): +class NewtonCholeskySolver(NewtonSolver): """Cholesky based Newton solver. 
Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear @@ -835,7 +835,7 @@ def fit(self, X, y, sample_weight=None): self.n_iter_ = _check_optimize_result("lbfgs", opt_res) coef = opt_res.x elif self.solver == "newton-cholesky": - sol = CholeskyNewtonSolver( + sol = NewtonCholeskySolver( coef=coef, linear_loss=linear_loss, l2_reg_strength=l2_reg_strength, diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 7970041baa3b3..d3bf61d76de49 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -1003,7 +1003,7 @@ def test_linalg_warning_with_newton_solver(global_random_seed): # without regularization should raise an informative warning and fallback # to the LBFGS solver. msg = ( - "The inner solver of .*NewtonSolver stumbled upon a singular or very " + "The inner solver of .*Newton.*Solver stumbled upon a singular or very " "ill-conditioned Hessian matrix" ) with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): From e285f05c9385a7c6b40da78b74f23c3776e7a825 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 23 Oct 2022 16:26:22 +0200 Subject: [PATCH 88/97] CLN use and test init_zero_coef --- sklearn/linear_model/_glm/glm.py | 4 +-- sklearn/linear_model/_linear_loss.py | 15 ++++++++-- .../linear_model/tests/test_linear_loss.py | 28 +++++++++++++++++-- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 5c63d5e7ceb97..eb1f156986943 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -799,13 +799,11 @@ def fit(self, X, y, sample_weight=None): coef = self.coef_ coef = coef.astype(loss_dtype, copy=False) else: + coef = linear_loss.init_zero_coef(X, dtype=loss_dtype) if self.fit_intercept: - coef = np.zeros(n_features + 1, dtype=loss_dtype) coef[-1] = linear_loss.base_loss.link.link( np.average(y, weights=sample_weight) ) - else: - coef = np.zeros(n_features, dtype=loss_dtype) l2_reg_strength = self.alpha n_threads = _openmp_effective_n_threads() diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index 463c5fc440d01..b83ecb1d34460 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -67,13 +67,21 @@ def __init__(self, base_loss, fit_intercept): self.base_loss = base_loss self.fit_intercept = fit_intercept - def init_zero_coef(self, X): + def init_zero_coef(self, X, dtype=None): """Allocate coef of correct shape with zeros. Parameters: ----------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. + dtype : data-type, default=None + Overrides the data type of coef. With dtype=None, coef will have the same + dtype as X. + + Returns + ------- + coef : ndarray of shape (n_dof,) or (n_classes, n_dof) + Coefficients of a linear model. """ n_features = X.shape[1] n_classes = self.base_loss.n_classes @@ -82,9 +90,10 @@ def init_zero_coef(self, X): else: n_dof = n_features if self.base_loss.is_multiclass: - self.coef = np.zeros_like(X, shape=(n_classes, n_dof)) + coef = np.zeros_like(X, shape=(n_classes, n_dof), dtype=dtype, order="F") else: - self.coef = np.zeros_like(X, shape=n_dof) + coef = np.zeros_like(X, shape=n_dof, dtype=dtype) + return coef def weight_intercept(self, coef): """Helper function to get coefficients and intercept. 
diff --git a/sklearn/linear_model/tests/test_linear_loss.py b/sklearn/linear_model/tests/test_linear_loss.py index 27e363e0097fc..574cd8c69ffce 100644 --- a/sklearn/linear_model/tests/test_linear_loss.py +++ b/sklearn/linear_model/tests/test_linear_loss.py @@ -35,10 +35,10 @@ def random_X_y_coef( n_features=n_features, random_state=rng, ) + coef = linear_model_loss.init_zero_coef(X) if linear_model_loss.base_loss.is_multiclass: n_classes = linear_model_loss.base_loss.n_classes - coef = np.empty((n_classes, n_dof)) coef.flat[:] = rng.uniform( low=coef_bound[0], high=coef_bound[1], @@ -60,7 +60,6 @@ def choice_vectorized(items, p): y = choice_vectorized(np.arange(n_classes), p=proba).astype(np.float64) else: - coef = np.empty((n_dof,)) coef.flat[:] = rng.uniform( low=coef_bound[0], high=coef_bound[1], @@ -77,6 +76,31 @@ def choice_vectorized(items, p): return X, y, coef +@pytest.mark.parametrize("base_loss", LOSSES) +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("n_features", [0, 1, 10]) +@pytest.mark.parametrize("dtype", [None, np.float32, np.float64, np.int64]) +def test_init_zero_coef(base_loss, fit_intercept, n_features, dtype): + """Test that init_zero_coef initializes coef correctly.""" + loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept) + rng = np.random.RandomState(42) + X = rng.normal(size=(5, n_features)) + coef = loss.init_zero_coef(X, dtype=dtype) + if loss.base_loss.is_multiclass: + n_classes = loss.base_loss.n_classes + assert coef.shape == (n_classes, n_features + fit_intercept) + assert coef.flags["F_CONTIGUOUS"] + else: + assert coef.shape == (n_features + fit_intercept,) + + if dtype is None: + assert coef.dtype == X.dtype + else: + assert coef.dtype == dtype + + assert np.sum(np.abs(coef)) == 0 + + @pytest.mark.parametrize("base_loss", LOSSES) @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize("sample_weight", [None, "range"]) From 55e57df8a17776fc0f88463e32c367c87b034a10 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 23 Oct 2022 16:41:52 +0200 Subject: [PATCH 89/97] CLN address some review comments --- sklearn/linear_model/_linear_loss.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index b83ecb1d34460..e881c43ba4988 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -161,7 +161,7 @@ def weight_intercept_raw(self, coef, X): if not self.base_loss.is_multiclass: raw_prediction = X @ weights + intercept else: - # weights has shape to (n_classes, n_dof) + # weights has shape (n_classes, n_dof) raw_prediction = X @ weights.T + intercept # ndarray, likely C-contiguous return weights, intercept, raw_prediction @@ -481,8 +481,10 @@ def gradient_hessian( # multiplication (gemm), is by far faster. WX = hess_pointwise[:, None] * X hess[:n_features, :n_features] = np.dot(X.T, WX) - # flattened view on the array + if l2_reg_strength > 0: + # The L2 penalty enters the Hessian on the diagonal only. To add those + # terms, we use a flattened view on the array. hess.reshape(-1)[ : (n_features * n_dof) : (n_dof + 1) ] += l2_reg_strength @@ -492,6 +494,8 @@ def gradient_hessian( # hess = (X, 1)' @ diag(h) @ (X, 1) # = (X' @ diag(h) @ X, X' @ h) # ( h @ X, sum(h)) + # The left upper part has already been filled, it remains to compute + # the last row and the last column. 
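+            # For illustration only: the unpenalized part of this matrix could
+            # equivalently be built as Xi.T @ (hess_pointwise[:, None] * Xi) with
+            # Xi = np.hstack([X, np.ones((n_samples, 1))]), at the cost of
+            # materializing the extra column of ones.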
Xh = X.T @ hess_pointwise hess[:-1, -1] = Xh hess[-1, :-1] = Xh From 1d158cbef49197cd48de73fc6e8eadd5b7257906 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 23 Oct 2022 16:50:52 +0200 Subject: [PATCH 90/97] CLN mark NewtonSolver as private by leading underscore --- sklearn/linear_model/_glm/glm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index eb1f156986943..5b9992584d8e2 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -33,7 +33,7 @@ from .._linear_loss import LinearModelLoss -class NewtonSolver(ABC): +class _NewtonSolver(ABC): """Newton solver for GLMs. This class implements Newton/2nd-order optimization routines for GLMs. Each Newton @@ -54,7 +54,7 @@ class NewtonSolver(ABC): above pattern and use structure specific tricks. Usage pattern: - - initialize solver: sol = NewtonSolver(...) + - initialize solver: sol = _NewtonSolver(...) - solve the problem: sol.solve(X, y, sample_weight) References @@ -491,7 +491,7 @@ def solve(self, X, y, sample_weight): return self.coef -class NewtonCholeskySolver(NewtonSolver): +class _NewtonCholeskySolver(_NewtonSolver): """Cholesky based Newton solver. Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear @@ -683,7 +683,7 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): we have `y_pred = exp(X @ coeff + intercept)`. """ - # We allow for NewtonSolver classes for the "solver" parameter but do not + # We allow for _NewtonSolver classes for the "solver" parameter but do not # make them public in the docstrings. This facilitates testing and # benchmarking. _parameter_constraints: dict = { @@ -833,7 +833,7 @@ def fit(self, X, y, sample_weight=None): self.n_iter_ = _check_optimize_result("lbfgs", opt_res) coef = opt_res.x elif self.solver == "newton-cholesky": - sol = NewtonCholeskySolver( + sol = _NewtonCholeskySolver( coef=coef, linear_loss=linear_loss, l2_reg_strength=l2_reg_strength, @@ -844,7 +844,7 @@ def fit(self, X, y, sample_weight=None): ) coef = sol.solve(X, y, sample_weight) self.n_iter_ = sol.iteration - elif issubclass(self.solver, NewtonSolver): + elif issubclass(self.solver, _NewtonSolver): sol = self.solver( coef=coef, linear_loss=linear_loss, From a30c71f0af8fe01676cef056ea6ea254d243de2c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 23 Oct 2022 19:10:11 +0200 Subject: [PATCH 91/97] CLN exact comments for inner_solve --- sklearn/linear_model/_glm/glm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 5b9992584d8e2..08ff6c729fe68 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -180,7 +180,7 @@ def inner_solve(self, X, y, sample_weight): Sets: - self.coef_newton - - gradient_times_newton + - self.gradient_times_newton """ def fallback_lbfgs_solve(self, X, y, sample_weight): @@ -453,7 +453,7 @@ def solve(self, X, y, sample_weight): # 2. Inner solver # Calculate Newton step/direction - # This usually sets self.coef_newton. + # This usually sets self.coef_newton and self.gradient_times_newton. 
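+            # For the Cholesky based solver, coef_newton solves
+            #     hessian @ coef_newton = -gradient,
+            # and gradient_times_newton = gradient @ coef_newton is negative for a
+            # descent direction; line_search uses it in the Armijo condition.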
self.inner_solve(X=X, y=y, sample_weight=sample_weight) if self.use_fallback_lbfgs_solve: break From 298ce607cf29475808611f2ffd71d84a101354f4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 23 Oct 2022 19:10:53 +0200 Subject: [PATCH 92/97] TST add test_newton_solver_verbosity --- sklearn/linear_model/_glm/tests/test_glm.py | 80 ++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index d3bf61d76de49..de4e47d65f0b0 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -14,7 +14,7 @@ from scipy.optimize import minimize, root from sklearn.base import clone -from sklearn._loss import HalfBinomialLoss +from sklearn._loss import HalfBinomialLoss, HalfPoissonLoss, HalfTweedieLoss from sklearn._loss.glm_distribution import TweedieDistribution from sklearn._loss.link import IdentityLink, LogLink @@ -26,6 +26,7 @@ TweedieRegressor, ) from sklearn.linear_model._glm import _GeneralizedLinearRegressor +from sklearn.linear_model._glm.glm import _NewtonCholeskySolver from sklearn.linear_model._linear_loss import LinearModelLoss from sklearn.exceptions import ConvergenceWarning from sklearn.metrics import d2_tweedie_score, mean_poisson_deviance @@ -1029,3 +1030,80 @@ def test_linalg_warning_with_newton_solver(global_random_seed): assert penalized_collinear_newton_deviance == pytest.approx( original_newton_deviance, rel=rtol ) + + +@pytest.mark.parametrize("verbose", [0, 1, 2]) +def test_newton_solver_verbosity(capsys, verbose): + """Test the std output of verbose newton solvers.""" + y = np.array([1, 2], dtype=float) + X = np.array([[1.0, 0], [0, 1]], dtype=float) + linear_loss = LinearModelLoss(base_loss=HalfPoissonLoss(), fit_intercept=False) + sol = _NewtonCholeskySolver( + coef=linear_loss.init_zero_coef(X), + linear_loss=linear_loss, + l2_reg_strength=0, + verbose=verbose, + ) + sol.solve(X, y, None) # returns array([0., 0.69314758]) + captured = capsys.readouterr() + + if verbose == 0: + assert captured.out == "" + else: + msg = [ + "Newton iter=1", + "Check Convergence", + "1. max |gradient|", + "2. Newton decrement", + "Solver did converge at loss = ", + ] + for m in msg: + assert m in captured.out + + if verbose >= 2: + msg = ["Backtracking Line Search"] + for m in msg: + assert m in captured.out + + # Set the Newton solver to a state with a completely wrong Newton step. + sol = _NewtonCholeskySolver( + coef=linear_loss.init_zero_coef(X), + linear_loss=linear_loss, + l2_reg_strength=0, + verbose=verbose, + ) + sol.setup(X=X, y=y, sample_weight=None) + sol.iteration = 1 + sol.update_gradient_hessian(X=X, y=y, sample_weight=None) + sol.coef_newton = np.array([1.0, 0]) + sol.gradient_times_newton = sol.gradient @ sol.coef_newton + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + sol.line_search(X=X, y=y, sample_weight=None) + captured = capsys.readouterr() + if verbose >= 1: + assert ( + "Line search did not converge and resorts to lbfgs instead." in captured.out + ) + + # Test for a case with negative hessian. We badly initialize coef for a Tweedie + # loss with non-canonical link, e.g. Inverse Gaussian deviance with a log link. 
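+    # (With a non-canonical link, the pointwise hessian is not guaranteed to be
+    # positive far away from the optimum; gradient_hessian flags this when more
+    # than 25% of its entries are non-positive, which makes the solver fall back
+    # to lbfgs as asserted below.)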
+ linear_loss = LinearModelLoss( + base_loss=HalfTweedieLoss(power=3), fit_intercept=False + ) + sol = _NewtonCholeskySolver( + coef=linear_loss.init_zero_coef(X) + 1, + linear_loss=linear_loss, + l2_reg_strength=0, + verbose=verbose, + ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + sol.solve(X, y, None) + captured = capsys.readouterr() + if verbose >= 1: + assert ( + "The inner solver detected a pointwise Hessian with many negative values" + " and resorts to lbfgs instead." + in captured.out + ) From 00f7465ed4b6f14197651324b1080f541fa5d3c0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 23 Oct 2022 20:55:22 +0200 Subject: [PATCH 93/97] TST extend test_newton_solver_verbosity --- sklearn/linear_model/_glm/tests/test_glm.py | 32 ++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index de4e47d65f0b0..7b5cdde51161a 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -1061,7 +1061,7 @@ def test_newton_solver_verbosity(capsys, verbose): assert m in captured.out if verbose >= 2: - msg = ["Backtracking Line Search"] + msg = ["Backtracking Line Search", "line search iteration="] for m in msg: assert m in captured.out @@ -1086,6 +1086,36 @@ def test_newton_solver_verbosity(capsys, verbose): "Line search did not converge and resorts to lbfgs instead." in captured.out ) + # Set the Newton solver to a state with bad Newton step such that the loss + # improvement in line search is tiny. + sol = _NewtonCholeskySolver( + coef=np.array([1e-12, 0.69314758]), + linear_loss=linear_loss, + l2_reg_strength=0, + verbose=verbose, + ) + sol.setup(X=X, y=y, sample_weight=None) + sol.iteration = 1 + sol.update_gradient_hessian(X=X, y=y, sample_weight=None) + sol.coef_newton = np.array([1e-6, 0]) + sol.gradient_times_newton = sol.gradient @ sol.coef_newton + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ConvergenceWarning) + sol.line_search(X=X, y=y, sample_weight=None) + captured = capsys.readouterr() + if verbose >= 2: + msg = [ + "line search iteration=", + "check loss improvement <= armijo term:", + "check loss |improvement| <= eps * |loss_old|:", + "check sum(|gradient|) < sum(|gradient_old|):", + "check |sum(|gradient|) - sum(|gradient_old|)| <= eps *" + " sum(|gradient_old|):", + "check if previously sum(|gradient", + ] + for m in msg: + assert m in captured.out + # Test for a case with negative hessian. We badly initialize coef for a Tweedie # loss with non-canonical link, e.g. Inverse Gaussian deviance with a log link. linear_loss = LinearModelLoss( From 308fd886d757698663d4b1965ff9f74155046f03 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 24 Oct 2022 10:52:25 +0200 Subject: [PATCH 94/97] TST logic in test_glm_regression_unpenalized --- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 7b5cdde51161a..df2d384e77743 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -397,7 +397,7 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): if solver == "newton-cholesky": # XXX: This solver shows random behaviour. Sometimes it finds solutions # with norm_model <= norm_solution! So we check conditionally. 
- if not (norm_model > (1 + 1e-12) * norm_solution): + if norm_model < (1 + 1e-12) * norm_solution: assert model.intercept_ == pytest.approx(intercept) assert_allclose(model.coef_, coef, rtol=rtol) elif solver == "lbfgs" and fit_intercept: From ebf930bcfcfef43d4754652a1c27bdfed42ba206 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 24 Oct 2022 12:19:51 +0200 Subject: [PATCH 95/97] TST use count_nonzero --- sklearn/linear_model/tests/test_linear_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_linear_loss.py b/sklearn/linear_model/tests/test_linear_loss.py index 574cd8c69ffce..0c0053a103098 100644 --- a/sklearn/linear_model/tests/test_linear_loss.py +++ b/sklearn/linear_model/tests/test_linear_loss.py @@ -98,7 +98,7 @@ def test_init_zero_coef(base_loss, fit_intercept, n_features, dtype): else: assert coef.dtype == dtype - assert np.sum(np.abs(coef)) == 0 + assert np.count_nonzero(coef) == 0 @pytest.mark.parametrize("base_loss", LOSSES) From d304ce9912e87cf981684809e94c92223254b1ee Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 24 Oct 2022 18:35:23 +0200 Subject: [PATCH 96/97] CLN remove super rare line search checks --- sklearn/linear_model/_glm/glm.py | 48 --------------------- sklearn/linear_model/_glm/tests/test_glm.py | 3 -- 2 files changed, 51 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 08ff6c729fe68..47a86527666bb 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -241,8 +241,6 @@ def line_search(self, X, y, sample_weight): # np.sum(np.abs(self.gradient_old)) sum_abs_grad_old = -1 - sum_abs_grad_previous = -1 # Used to track sum|gradients| of i-1 - has_improved_sum_abs_grad_previous = False is_verbose = self.verbose >= 2 if is_verbose: @@ -298,52 +296,6 @@ def line_search(self, X, y, sample_weight): ) if check: break - # 2.2 Deal with relative gradient differences around machine precision. - tiny_grad = sum_abs_grad_old * eps - abs_grad_improvement = np.abs(sum_abs_grad - sum_abs_grad_old) - check = abs_grad_improvement <= tiny_grad - if is_verbose: - print( - " check |sum(|gradient|) - sum(|gradient_old|)| <= eps * " - "sum(|gradient_old|):" - f" {abs_grad_improvement} <= {tiny_grad} {check}" - ) - if check: - break - # 2.3 This is really the last resort. - # Check that sum(|gradient_{i-1}|) < sum(|gradient_{i-2}|) - # = has_improved_sum_abs_grad_previous - # If now sum(|gradient_{i}|) >= sum(|gradient_{i-1}|), this iteration - # made things worse and we should have stopped at i-1. 
- check = ( - has_improved_sum_abs_grad_previous - and sum_abs_grad >= sum_abs_grad_previous - ) - if is_verbose: - print( - " check if previously " - f"sum(|gradient_{i-1}|) < sum(|gradient_{i-2}|) but now " - f"sum(|gradient_{i}|) >= sum(|gradient_{i-1}|) {check}" - ) - if check: - t /= beta # we go back to i-1 - self.coef = self.coef_old + t * self.coef_newton - raw = self.raw_prediction + t * raw_prediction_newton - self.loss_value, self.gradient = self.linear_loss.loss_gradient( - coef=self.coef, - X=X, - y=y, - sample_weight=sample_weight, - l2_reg_strength=self.l2_reg_strength, - n_threads=self.n_threads, - raw_prediction=raw, - ) - break - # Calculate for the next iteration - has_improved_sum_abs_grad_previous = ( - sum_abs_grad < sum_abs_grad_previous - ) - sum_abs_grad_previous = sum_abs_grad t *= beta else: diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index df2d384e77743..4390ee7620e9d 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -1109,9 +1109,6 @@ def test_newton_solver_verbosity(capsys, verbose): "check loss improvement <= armijo term:", "check loss |improvement| <= eps * |loss_old|:", "check sum(|gradient|) < sum(|gradient_old|):", - "check |sum(|gradient|) - sum(|gradient_old|)| <= eps *" - " sum(|gradient_old|):", - "check if previously sum(|gradient", ] for m in msg: assert m in captured.out From f002eb78a2fdb36eee4ab8ec77df0e40a2790623 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 24 Oct 2022 18:56:58 +0200 Subject: [PATCH 97/97] MNT move Newton solver to new file _newton_solver.py --- sklearn/linear_model/_glm/_newton_solver.py | 518 ++++++++++++++++++++ sklearn/linear_model/_glm/glm.py | 516 +------------------ sklearn/linear_model/_glm/tests/test_glm.py | 10 +- 3 files changed, 529 insertions(+), 515 deletions(-) create mode 100644 sklearn/linear_model/_glm/_newton_solver.py diff --git a/sklearn/linear_model/_glm/_newton_solver.py b/sklearn/linear_model/_glm/_newton_solver.py new file mode 100644 index 0000000000000..d624d1399b1b9 --- /dev/null +++ b/sklearn/linear_model/_glm/_newton_solver.py @@ -0,0 +1,518 @@ +""" +Newton solver for Generalized Linear Models +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +import warnings +from abc import ABC, abstractmethod + +import numpy as np +import scipy.linalg +import scipy.optimize + +from ..._loss.loss import HalfSquaredError +from ...exceptions import ConvergenceWarning +from ...utils.optimize import _check_optimize_result +from .._linear_loss import LinearModelLoss + + +class NewtonSolver(ABC): + """Newton solver for GLMs. + + This class implements Newton/2nd-order optimization routines for GLMs. Each Newton + iteration aims at finding the Newton step which is done by the inner solver. With + Hessian H, gradient g and coefficients coef, one step solves: + + H @ coef_newton = -g + + For our GLM / LinearModelLoss, we have gradient g and Hessian H: + + g = X.T @ loss.gradient + l2_reg_strength * coef + H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity + + Backtracking line search updates coef = coef_old + t * coef_newton for some t in + (0, 1]. + + This is a base class, actual implementations (child classes) may deviate from the + above pattern and use structure specific tricks. + + Usage pattern: + - initialize solver: sol = NewtonSolver(...) + - solve the problem: sol.solve(X, y, sample_weight) + + References + ---------- + - Jorge Nocedal, Stephen J. Wright. 
(2006) "Numerical Optimization" + 2nd edition + https://doi.org/10.1007/978-0-387-40065-5 + + - Stephen P. Boyd, Lieven Vandenberghe. (2004) "Convex Optimization." + Cambridge University Press, 2004. + https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf + + Parameters + ---------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Initial coefficients of a linear model. + If shape (n_classes * n_dof,), the classes of one feature are contiguous, + i.e. one reconstructs the 2d-array via + coef.reshape((n_classes, -1), order="F"). + + linear_loss : LinearModelLoss + The loss to be minimized. + + l2_reg_strength : float, default=0.0 + L2 regularization strength. + + tol : float, default=1e-4 + The optimization problem is solved when each of the following condition is + fulfilled: + 1. maximum |gradient| <= tol + 2. Newton decrement d: 1/2 * d^2 <= tol + + max_iter : int, default=100 + Maximum number of Newton steps allowed. + + n_threads : int, default=1 + Number of OpenMP threads to use for the computation of the Hessian and gradient + of the loss function. + + Attributes + ---------- + coef_old : ndarray of shape coef.shape + Coefficient of previous iteration. + + coef_newton : ndarray of shape coef.shape + Newton step. + + gradient : ndarray of shape coef.shape + Gradient of the loss wrt. the coefficients. + + gradient_old : ndarray of shape coef.shape + Gradient of previous iteration. + + loss_value : float + Value of objective function = loss + penalty. + + loss_value_old : float + Value of objective function of previous itertion. + + raw_prediction : ndarray of shape (n_samples,) or (n_samples, n_classes) + + converged : bool + Indicator for convergence of the solver. + + iteration : int + Number of Newton steps, i.e. calls to inner_solve + + use_fallback_lbfgs_solve : bool + If set to True, the solver will resort to call LBFGS to finish the optimisation + procedure in case of convergence issues. + + gradient_times_newton : float + gradient @ coef_newton, set in inner_solve and used by line_search. If the + Newton step is a descent direction, this is negative. + """ + + def __init__( + self, + *, + coef, + linear_loss=LinearModelLoss(base_loss=HalfSquaredError(), fit_intercept=True), + l2_reg_strength=0.0, + tol=1e-4, + max_iter=100, + n_threads=1, + verbose=0, + ): + self.coef = coef + self.linear_loss = linear_loss + self.l2_reg_strength = l2_reg_strength + self.tol = tol + self.max_iter = max_iter + self.n_threads = n_threads + self.verbose = verbose + + def setup(self, X, y, sample_weight): + """Precomputations + + If None, initializes: + - self.coef + Sets: + - self.raw_prediction + - self.loss_value + """ + _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X) + self.loss_value = self.linear_loss.loss( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + raw_prediction=self.raw_prediction, + ) + + @abstractmethod + def update_gradient_hessian(self, X, y, sample_weight): + """Update gradient and Hessian.""" + + @abstractmethod + def inner_solve(self, X, y, sample_weight): + """Compute Newton step. + + Sets: + - self.coef_newton + - self.gradient_times_newton + """ + + def fallback_lbfgs_solve(self, X, y, sample_weight): + """Fallback solver in case of emergency. + + If a solver detects convergence problems, it may fall back to this methods in + the hope to exit with success instead of raising an error. 
+ + Sets: + - self.coef + - self.converged + """ + opt_res = scipy.optimize.minimize( + self.linear_loss.loss_gradient, + self.coef, + method="L-BFGS-B", + jac=True, + options={ + "maxiter": self.max_iter, + "maxls": 50, # default is 20 + "iprint": self.verbose - 1, + "gtol": self.tol, + "ftol": 64 * np.finfo(np.float64).eps, + }, + args=(X, y, sample_weight, self.l2_reg_strength, self.n_threads), + ) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res) + self.coef = opt_res.x + self.converged = opt_res.status == 0 + + def line_search(self, X, y, sample_weight): + """Backtracking line search. + + Sets: + - self.coef_old + - self.coef + - self.loss_value_old + - self.loss_value + - self.gradient_old + - self.gradient + - self.raw_prediction + """ + # line search parameters + beta, sigma = 0.5, 0.00048828125 # 1/2, 1/2**11 + eps = 16 * np.finfo(self.loss_value.dtype).eps + t = 1 # step size + + # gradient_times_newton = self.gradient @ self.coef_newton + # was computed in inner_solve. + armijo_term = sigma * self.gradient_times_newton + _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw( + self.coef_newton, X + ) + + self.coef_old = self.coef + self.loss_value_old = self.loss_value + self.gradient_old = self.gradient + + # np.sum(np.abs(self.gradient_old)) + sum_abs_grad_old = -1 + + is_verbose = self.verbose >= 2 + if is_verbose: + print(" Backtracking Line Search") + print(f" eps=10 * finfo.eps={eps}") + + for i in range(21): # until and including t = beta**20 ~ 1e-6 + self.coef = self.coef_old + t * self.coef_newton + raw = self.raw_prediction + t * raw_prediction_newton + self.loss_value, self.gradient = self.linear_loss.loss_gradient( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + raw_prediction=raw, + ) + # Note: If coef_newton is too large, loss_gradient may produce inf values, + # potentially accompanied by a RuntimeWarning. + # This case will be captured by the Armijo condition. + + # 1. Check Armijo / sufficient decrease condition. + # The smaller (more negative) the better. + loss_improvement = self.loss_value - self.loss_value_old + check = loss_improvement <= t * armijo_term + if is_verbose: + print( + f" line search iteration={i+1}, step size={t}\n" + f" check loss improvement <= armijo term: {loss_improvement} " + f"<= {t * armijo_term} {check}" + ) + if check: + break + # 2. Deal with relative loss differences around machine precision. + tiny_loss = np.abs(self.loss_value_old * eps) + check = np.abs(loss_improvement) <= tiny_loss + if is_verbose: + print( + " check loss |improvement| <= eps * |loss_old|:" + f" {np.abs(loss_improvement)} <= {tiny_loss} {check}" + ) + if check: + if sum_abs_grad_old < 0: + sum_abs_grad_old = scipy.linalg.norm(self.gradient_old, ord=1) + # 2.1 Check sum of absolute gradients as alternative condition. + sum_abs_grad = scipy.linalg.norm(self.gradient, ord=1) + check = sum_abs_grad < sum_abs_grad_old + if is_verbose: + print( + " check sum(|gradient|) < sum(|gradient_old|): " + f"{sum_abs_grad} < {sum_abs_grad_old} {check}" + ) + if check: + break + + t *= beta + else: + warnings.warn( + f"Line search of Newton solver {self.__class__.__name__} at iteration " + f"#{self.iteration} did no converge after 21 line search refinement " + "iterations. 
It will now resort to lbfgs instead.", + ConvergenceWarning, + ) + if self.verbose: + print(" Line search did not converge and resorts to lbfgs instead.") + self.use_fallback_lbfgs_solve = True + return + + self.raw_prediction = raw + + def check_convergence(self, X, y, sample_weight): + """Check for convergence. + + Sets self.converged. + """ + if self.verbose: + print(" Check Convergence") + # Note: Checking maximum relative change of coefficient <= tol is a bad + # convergence criterion because even a large step could have brought us close + # to the true minimum. + # coef_step = self.coef - self.coef_old + # check = np.max(np.abs(coef_step) / np.maximum(1, np.abs(self.coef_old))) + + # 1. Criterion: maximum |gradient| <= tol + # The gradient was already updated in line_search() + check = np.max(np.abs(self.gradient)) + if self.verbose: + print(f" 1. max |gradient| {check} <= {self.tol}") + if check > self.tol: + return + + # 2. Criterion: For Newton decrement d, check 1/2 * d^2 <= tol + # d = sqrt(grad @ hessian^-1 @ grad) + # = sqrt(coef_newton @ hessian @ coef_newton) + # See Boyd, Vanderberghe (2009) "Convex Optimization" Chapter 9.5.1. + d2 = self.coef_newton @ self.hessian @ self.coef_newton + if self.verbose: + print(f" 2. Newton decrement {0.5 * d2} <= {self.tol}") + if 0.5 * d2 > self.tol: + return + + if self.verbose: + loss_value = self.linear_loss.loss( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + ) + print(f" Solver did converge at loss = {loss_value}.") + self.converged = True + + def finalize(self, X, y, sample_weight): + """Finalize the solvers results. + + Some solvers may need this, others not. + """ + pass + + def solve(self, X, y, sample_weight): + """Solve the optimization problem. + + This is the main routine. + + Order of calls: + self.setup() + while iteration: + self.update_gradient_hessian() + self.inner_solve() + self.line_search() + self.check_convergence() + self.finalize() + + Returns + ------- + coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) + Solution of the optimization problem. + """ + # setup usually: + # - initializes self.coef if needed + # - initializes and calculates self.raw_predictions, self.loss_value + self.setup(X=X, y=y, sample_weight=sample_weight) + + self.iteration = 1 + self.converged = False + + while self.iteration <= self.max_iter and not self.converged: + if self.verbose: + print(f"Newton iter={self.iteration}") + + self.use_fallback_lbfgs_solve = False # Fallback solver. + + # 1. Update Hessian and gradient + self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight) + + # TODO: + # if iteration == 1: + # We might stop early, e.g. we already are close to the optimum, + # usually detected by zero gradients at this stage. + + # 2. Inner solver + # Calculate Newton step/direction + # This usually sets self.coef_newton and self.gradient_times_newton. + self.inner_solve(X=X, y=y, sample_weight=sample_weight) + if self.use_fallback_lbfgs_solve: + break + + # 3. Backtracking line search + # This usually sets self.coef_old, self.coef, self.loss_value_old + # self.loss_value, self.gradient_old, self.gradient, + # self.raw_prediction. + self.line_search(X=X, y=y, sample_weight=sample_weight) + if self.use_fallback_lbfgs_solve: + break + + # 4. Check convergence + # Sets self.converged. + self.check_convergence(X=X, y=y, sample_weight=sample_weight) + + # 5. 
Next iteration + self.iteration += 1 + + if not self.converged: + if self.use_fallback_lbfgs_solve: + # Note: The fallback solver circumvents check_convergence and relies on + # the convergence checks of lbfgs instead. Enough warnings have been + # raised on the way. + self.fallback_lbfgs_solve(X=X, y=y, sample_weight=sample_weight) + else: + warnings.warn( + f"Newton solver did not converge after {self.iteration - 1} " + "iterations.", + ConvergenceWarning, + ) + + self.iteration -= 1 + self.finalize(X=X, y=y, sample_weight=sample_weight) + return self.coef + + +class NewtonCholeskySolver(NewtonSolver): + """Cholesky based Newton solver. + + Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear + solver. + """ + + def setup(self, X, y, sample_weight): + super().setup(X=X, y=y, sample_weight=sample_weight) + n_dof = X.shape[1] + if self.linear_loss.fit_intercept: + n_dof += 1 + self.gradient = np.empty_like(self.coef) + self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof)) + + def update_gradient_hessian(self, X, y, sample_weight): + _, _, self.hessian_warning = self.linear_loss.gradient_hessian( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + gradient_out=self.gradient, + hessian_out=self.hessian, + raw_prediction=self.raw_prediction, # this was updated in line_search + ) + + def inner_solve(self, X, y, sample_weight): + if self.hessian_warning: + warnings.warn( + f"The inner solver of {self.__class__.__name__} detected a " + "pointwise hessian with many negative values at iteration " + f"#{self.iteration}. It will now resort to lbfgs instead.", + ConvergenceWarning, + ) + if self.verbose: + print( + " The inner solver detected a pointwise Hessian with many " + "negative values and resorts to lbfgs instead." + ) + self.use_fallback_lbfgs_solve = True + return + + try: + with warnings.catch_warnings(): + warnings.simplefilter("error", scipy.linalg.LinAlgWarning) + self.coef_newton = scipy.linalg.solve( + self.hessian, -self.gradient, check_finite=False, assume_a="sym" + ) + self.gradient_times_newton = self.gradient @ self.coef_newton + if self.gradient_times_newton > 0: + if self.verbose: + print( + " The inner solver found a Newton step that is not a " + "descent direction and resorts to LBFGS steps instead." + ) + self.use_fallback_lbfgs_solve = True + return + except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: + warnings.warn( + f"The inner solver of {self.__class__.__name__} stumbled upon a " + "singular or very ill-conditioned Hessian matrix at iteration " + f"#{self.iteration}. It will now resort to lbfgs instead.\n" + "Further options are to use another solver or to avoid such situation " + "in the first place. Possible remedies are removing collinear features" + " of X or increasing the penalization strengths.\n" + "The original Linear Algebra message was:\n" + + str(e), + scipy.linalg.LinAlgWarning, + ) + # Possible causes: + # 1. hess_pointwise is negative. But this is already taken care in + # LinearModelLoss.gradient_hessian. + # 2. X is singular or ill-conditioned + # This might be the most probable cause. + # + # There are many possible ways to deal with this situation. Most of them + # add, explicitly or implicitly, a matrix to the hessian to make it + # positive definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed. + # Instead, we resort to lbfgs. 
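The try/except around scipy.linalg.solve above escalates scipy.linalg.LinAlgWarning to an exception so that both an exactly singular and a merely ill-conditioned Hessian end up on the lbfgs fallback path. A self-contained sketch of that mechanism with a deliberately rank-deficient Hessian built from two collinear features (the variable names are illustrative and no LinearModelLoss is involved):

    import warnings

    import numpy as np
    import scipy.linalg

    # Two collinear features make the unpenalized Hessian X.T @ X singular,
    # one of the causes listed in the comment above.
    X = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
    gradient = np.array([1.0, -1.0])
    hessian = X.T @ X

    try:
        with warnings.catch_warnings():
            # Promote the ill-conditioned-matrix warning to an error, as the
            # inner solver does, so that LinAlgError and LinAlgWarning are
            # handled by the same except clause.
            warnings.simplefilter("error", scipy.linalg.LinAlgWarning)
            coef_newton = scipy.linalg.solve(
                hessian, -gradient, check_finite=False, assume_a="sym"
            )
    except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning):
        # This is the point where NewtonCholeskySolver sets
        # use_fallback_lbfgs_solve = True and returns.
        coef_newton = None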
+ if self.verbose: + print( + " The inner solver stumbled upon an singular or ill-conditioned " + "Hessian matrix and resorts to LBFGS instead." + ) + self.use_fallback_lbfgs_solve = True + return diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 47a86527666bb..6dd02a387e0f3 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -6,15 +6,12 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -from abc import ABC, abstractmethod -import warnings from numbers import Integral, Real import numpy as np -import scipy.linalg import scipy.optimize -import scipy.sparse +from ._newton_solver import NewtonCholeskySolver, NewtonSolver from ..._loss.glm_distribution import TweedieDistribution from ..._loss.loss import ( HalfGammaLoss, @@ -24,515 +21,14 @@ HalfTweedieLossIdentity, ) from ...base import BaseEstimator, RegressorMixin -from ...exceptions import ConvergenceWarning from ...utils import check_array, deprecated -from ...utils.validation import check_is_fitted, _check_sample_weight -from ...utils._param_validation import Interval, StrOptions, Hidden from ...utils._openmp_helpers import _openmp_effective_n_threads +from ...utils._param_validation import Hidden, Interval, StrOptions from ...utils.optimize import _check_optimize_result +from ...utils.validation import _check_sample_weight, check_is_fitted from .._linear_loss import LinearModelLoss -class _NewtonSolver(ABC): - """Newton solver for GLMs. - - This class implements Newton/2nd-order optimization routines for GLMs. Each Newton - iteration aims at finding the Newton step which is done by the inner solver. With - Hessian H, gradient g and coefficients coef, one step solves: - - H @ coef_newton = -g - - For our GLM / LinearModelLoss, we have gradient g and Hessian H: - - g = X.T @ loss.gradient + l2_reg_strength * coef - H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity - - Backtracking line search updates coef = coef_old + t * coef_newton for some t in - (0, 1]. - - This is a base class, actual implementations (child classes) may deviate from the - above pattern and use structure specific tricks. - - Usage pattern: - - initialize solver: sol = _NewtonSolver(...) - - solve the problem: sol.solve(X, y, sample_weight) - - References - ---------- - - Jorge Nocedal, Stephen J. Wright. (2006) "Numerical Optimization" - 2nd edition - https://doi.org/10.1007/978-0-387-40065-5 - - - Stephen P. Boyd, Lieven Vandenberghe. (2004) "Convex Optimization." - Cambridge University Press, 2004. - https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf - - Parameters - ---------- - coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) - Initial coefficients of a linear model. - If shape (n_classes * n_dof,), the classes of one feature are contiguous, - i.e. one reconstructs the 2d-array via - coef.reshape((n_classes, -1), order="F"). - - linear_loss : LinearModelLoss - The loss to be minimized. - - l2_reg_strength : float, default=0.0 - L2 regularization strength. - - tol : float, default=1e-4 - The optimization problem is solved when each of the following condition is - fulfilled: - 1. maximum |gradient| <= tol - 2. Newton decrement d: 1/2 * d^2 <= tol - - max_iter : int, default=100 - Maximum number of Newton steps allowed. - - n_threads : int, default=1 - Number of OpenMP threads to use for the computation of the Hessian and gradient - of the loss function. 
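For reference, the backtracking scheme implemented in NewtonSolver.line_search of the new _newton_solver.py shrinks the step size t by beta until the Armijo condition loss(coef + t * coef_newton) - loss(coef) <= t * sigma * gradient @ coef_newton holds. A generic standalone sketch of that loop on a plain quadratic, with illustrative stand-ins for the loss and its gradient:

    import numpy as np

    def loss(w):
        return 0.5 * np.sum(w ** 2)

    def grad(w):
        return w

    w = np.array([3.0, -2.0])
    d = -grad(w)                      # any descent direction, e.g. a Newton step
    beta, sigma = 0.5, 1.0 / 2**11    # same constants as in line_search
    armijo_term = sigma * (grad(w) @ d)

    t = 1.0
    for _ in range(21):               # up to and including t = beta**20
        improvement = loss(w + t * d) - loss(w)
        if improvement <= t * armijo_term:
            break                     # sufficient decrease reached
        t *= beta
    w = w + t * d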
- - Attributes - ---------- - coef_old : ndarray of shape coef.shape - Coefficient of previous iteration. - - coef_newton : ndarray of shape coef.shape - Newton step. - - gradient : ndarray of shape coef.shape - Gradient of the loss wrt. the coefficients. - - gradient_old : ndarray of shape coef.shape - Gradient of previous iteration. - - loss_value : float - Value of objective function = loss + penalty. - - loss_value_old : float - Value of objective function of previous itertion. - - raw_prediction : ndarray of shape (n_samples,) or (n_samples, n_classes) - - converged : bool - Indicator for convergence of the solver. - - iteration : int - Number of Newton steps, i.e. calls to inner_solve - - use_fallback_lbfgs_solve : bool - If set to True, the solver will resort to call LBFGS to finish the optimisation - procedure in case of convergence issues. - - gradient_times_newton : float - gradient @ coef_newton, set in inner_solve and used by line_search. If the - Newton step is a descent direction, this is negative. - """ - - def __init__( - self, - *, - coef, - linear_loss=LinearModelLoss(base_loss=HalfSquaredError, fit_intercept=True), - l2_reg_strength=0.0, - tol=1e-4, - max_iter=100, - n_threads=1, - verbose=0, - ): - self.coef = coef - self.linear_loss = linear_loss - self.l2_reg_strength = l2_reg_strength - self.tol = tol - self.max_iter = max_iter - self.n_threads = n_threads - self.verbose = verbose - - def setup(self, X, y, sample_weight): - """Precomputations - - If None, initializes: - - self.coef - Sets: - - self.raw_prediction - - self.loss_value - """ - _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X) - self.loss_value = self.linear_loss.loss( - coef=self.coef, - X=X, - y=y, - sample_weight=sample_weight, - l2_reg_strength=self.l2_reg_strength, - n_threads=self.n_threads, - raw_prediction=self.raw_prediction, - ) - - @abstractmethod - def update_gradient_hessian(self, X, y, sample_weight): - """Update gradient and Hessian.""" - - @abstractmethod - def inner_solve(self, X, y, sample_weight): - """Compute Newton step. - - Sets: - - self.coef_newton - - self.gradient_times_newton - """ - - def fallback_lbfgs_solve(self, X, y, sample_weight): - """Fallback solver in case of emergency. - - If a solver detects convergence problems, it may fall back to this methods in - the hope to exit with success instead of raising an error. - - Sets: - - self.coef - - self.converged - """ - opt_res = scipy.optimize.minimize( - self.linear_loss.loss_gradient, - self.coef, - method="L-BFGS-B", - jac=True, - options={ - "maxiter": self.max_iter, - "maxls": 50, # default is 20 - "iprint": self.verbose - 1, - "gtol": self.tol, - "ftol": 64 * np.finfo(np.float64).eps, - }, - args=(X, y, sample_weight, self.l2_reg_strength, self.n_threads), - ) - self.n_iter_ = _check_optimize_result("lbfgs", opt_res) - self.coef = opt_res.x - self.converged = opt_res.status == 0 - - def line_search(self, X, y, sample_weight): - """Backtracking line search. - - Sets: - - self.coef_old - - self.coef - - self.loss_value_old - - self.loss_value - - self.gradient_old - - self.gradient - - self.raw_prediction - """ - # line search parameters - beta, sigma = 0.5, 0.00048828125 # 1/2, 1/2**11 - eps = 16 * np.finfo(self.loss_value.dtype).eps - t = 1 # step size - - # gradient_times_newton = self.gradient @ self.coef_newton - # was computed in inner_solve. 
- armijo_term = sigma * self.gradient_times_newton - _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw( - self.coef_newton, X - ) - - self.coef_old = self.coef - self.loss_value_old = self.loss_value - self.gradient_old = self.gradient - - # np.sum(np.abs(self.gradient_old)) - sum_abs_grad_old = -1 - - is_verbose = self.verbose >= 2 - if is_verbose: - print(" Backtracking Line Search") - print(f" eps=10 * finfo.eps={eps}") - - for i in range(21): # until and including t = beta**20 ~ 1e-6 - self.coef = self.coef_old + t * self.coef_newton - raw = self.raw_prediction + t * raw_prediction_newton - self.loss_value, self.gradient = self.linear_loss.loss_gradient( - coef=self.coef, - X=X, - y=y, - sample_weight=sample_weight, - l2_reg_strength=self.l2_reg_strength, - n_threads=self.n_threads, - raw_prediction=raw, - ) - # Note: If coef_newton is too large, loss_gradient may produce inf values, - # potentially accompanied by a RuntimeWarning. - # This case will be captured by the Armijo condition. - - # 1. Check Armijo / sufficient decrease condition. - # The smaller (more negative) the better. - loss_improvement = self.loss_value - self.loss_value_old - check = loss_improvement <= t * armijo_term - if is_verbose: - print( - f" line search iteration={i+1}, step size={t}\n" - f" check loss improvement <= armijo term: {loss_improvement} " - f"<= {t * armijo_term} {check}" - ) - if check: - break - # 2. Deal with relative loss differences around machine precision. - tiny_loss = np.abs(self.loss_value_old * eps) - check = np.abs(loss_improvement) <= tiny_loss - if is_verbose: - print( - " check loss |improvement| <= eps * |loss_old|:" - f" {np.abs(loss_improvement)} <= {tiny_loss} {check}" - ) - if check: - if sum_abs_grad_old < 0: - sum_abs_grad_old = scipy.linalg.norm(self.gradient_old, ord=1) - # 2.1 Check sum of absolute gradients as alternative condition. - sum_abs_grad = scipy.linalg.norm(self.gradient, ord=1) - check = sum_abs_grad < sum_abs_grad_old - if is_verbose: - print( - " check sum(|gradient|) < sum(|gradient_old|): " - f"{sum_abs_grad} < {sum_abs_grad_old} {check}" - ) - if check: - break - - t *= beta - else: - warnings.warn( - f"Line search of Newton solver {self.__class__.__name__} at iteration " - f"#{self.iteration} did no converge after 21 line search refinement " - "iterations. It will now resort to lbfgs instead.", - ConvergenceWarning, - ) - if self.verbose: - print(" Line search did not converge and resorts to lbfgs instead.") - self.use_fallback_lbfgs_solve = True - return - - self.raw_prediction = raw - - def check_convergence(self, X, y, sample_weight): - """Check for convergence. - - Sets self.converged. - """ - if self.verbose: - print(" Check Convergence") - # Note: Checking maximum relative change of coefficient <= tol is a bad - # convergence criterion because even a large step could have brought us close - # to the true minimum. - # coef_step = self.coef - self.coef_old - # check = np.max(np.abs(coef_step) / np.maximum(1, np.abs(self.coef_old))) - - # 1. Criterion: maximum |gradient| <= tol - # The gradient was already updated in line_search() - check = np.max(np.abs(self.gradient)) - if self.verbose: - print(f" 1. max |gradient| {check} <= {self.tol}") - if check > self.tol: - return - - # 2. Criterion: For Newton decrement d, check 1/2 * d^2 <= tol - # d = sqrt(grad @ hessian^-1 @ grad) - # = sqrt(coef_newton @ hessian @ coef_newton) - # See Boyd, Vanderberghe (2009) "Convex Optimization" Chapter 9.5.1. 
- d2 = self.coef_newton @ self.hessian @ self.coef_newton - if self.verbose: - print(f" 2. Newton decrement {0.5 * d2} <= {self.tol}") - if 0.5 * d2 > self.tol: - return - - if self.verbose: - loss_value = self.linear_loss.loss( - coef=self.coef, - X=X, - y=y, - sample_weight=sample_weight, - l2_reg_strength=self.l2_reg_strength, - n_threads=self.n_threads, - ) - print(f" Solver did converge at loss = {loss_value}.") - self.converged = True - - def finalize(self, X, y, sample_weight): - """Finalize the solvers results. - - Some solvers may need this, others not. - """ - pass - - def solve(self, X, y, sample_weight): - """Solve the optimization problem. - - This is the main routine. - - Order of calls: - self.setup() - while iteration: - self.update_gradient_hessian() - self.inner_solve() - self.line_search() - self.check_convergence() - self.finalize() - - Returns - ------- - coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) - Solution of the optimization problem. - """ - # setup usually: - # - initializes self.coef if needed - # - initializes and calculates self.raw_predictions, self.loss_value - self.setup(X=X, y=y, sample_weight=sample_weight) - - self.iteration = 1 - self.converged = False - - while self.iteration <= self.max_iter and not self.converged: - if self.verbose: - print(f"Newton iter={self.iteration}") - - self.use_fallback_lbfgs_solve = False # Fallback solver. - - # 1. Update Hessian and gradient - self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight) - - # TODO: - # if iteration == 1: - # We might stop early, e.g. we already are close to the optimum, - # usually detected by zero gradients at this stage. - - # 2. Inner solver - # Calculate Newton step/direction - # This usually sets self.coef_newton and self.gradient_times_newton. - self.inner_solve(X=X, y=y, sample_weight=sample_weight) - if self.use_fallback_lbfgs_solve: - break - - # 3. Backtracking line search - # This usually sets self.coef_old, self.coef, self.loss_value_old - # self.loss_value, self.gradient_old, self.gradient, - # self.raw_prediction. - self.line_search(X=X, y=y, sample_weight=sample_weight) - if self.use_fallback_lbfgs_solve: - break - - # 4. Check convergence - # Sets self.converged. - self.check_convergence(X=X, y=y, sample_weight=sample_weight) - - # 5. Next iteration - self.iteration += 1 - - if not self.converged: - if self.use_fallback_lbfgs_solve: - # Note: The fallback solver circumvents check_convergence and relies on - # the convergence checks of lbfgs instead. Enough warnings have been - # raised on the way. - self.fallback_lbfgs_solve(X=X, y=y, sample_weight=sample_weight) - else: - warnings.warn( - f"Newton solver did not converge after {self.iteration - 1} " - "iterations.", - ConvergenceWarning, - ) - - self.iteration -= 1 - self.finalize(X=X, y=y, sample_weight=sample_weight) - return self.coef - - -class _NewtonCholeskySolver(_NewtonSolver): - """Cholesky based Newton solver. - - Inner solver for finding the Newton step H w_newton = -g uses Cholesky based linear - solver. 
- """ - - def setup(self, X, y, sample_weight): - super().setup(X=X, y=y, sample_weight=sample_weight) - n_dof = X.shape[1] - if self.linear_loss.fit_intercept: - n_dof += 1 - self.gradient = np.empty_like(self.coef) - self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof)) - - def update_gradient_hessian(self, X, y, sample_weight): - _, _, self.hessian_warning = self.linear_loss.gradient_hessian( - coef=self.coef, - X=X, - y=y, - sample_weight=sample_weight, - l2_reg_strength=self.l2_reg_strength, - n_threads=self.n_threads, - gradient_out=self.gradient, - hessian_out=self.hessian, - raw_prediction=self.raw_prediction, # this was updated in line_search - ) - - def inner_solve(self, X, y, sample_weight): - if self.hessian_warning: - warnings.warn( - f"The inner solver of {self.__class__.__name__} detected a " - "pointwise hessian with many negative values at iteration " - f"#{self.iteration}. It will now resort to lbfgs instead.", - ConvergenceWarning, - ) - if self.verbose: - print( - " The inner solver detected a pointwise Hessian with many " - "negative values and resorts to lbfgs instead." - ) - self.use_fallback_lbfgs_solve = True - return - - try: - with warnings.catch_warnings(): - warnings.simplefilter("error", scipy.linalg.LinAlgWarning) - self.coef_newton = scipy.linalg.solve( - self.hessian, -self.gradient, check_finite=False, assume_a="sym" - ) - self.gradient_times_newton = self.gradient @ self.coef_newton - if self.gradient_times_newton > 0: - if self.verbose: - print( - " The inner solver found a Newton step that is not a " - "descent direction and resorts to LBFGS steps instead." - ) - self.use_fallback_lbfgs_solve = True - return - except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e: - warnings.warn( - f"The inner solver of {self.__class__.__name__} stumbled upon a " - "singular or very ill-conditioned Hessian matrix at iteration " - f"#{self.iteration}. It will now resort to lbfgs instead.\n" - "Further options are to use another solver or to avoid such situation " - "in the first place. Possible remedies are removing collinear features" - " of X or increasing the penalization strengths.\n" - "The original Linear Algebra message was:\n" - + str(e), - scipy.linalg.LinAlgWarning, - ) - # Possible causes: - # 1. hess_pointwise is negative. But this is already taken care in - # LinearModelLoss.gradient_hessian. - # 2. X is singular or ill-conditioned - # This might be the most probable cause. - # - # There are many possible ways to deal with this situation. Most of them - # add, explicitly or implicitly, a matrix to the hessian to make it - # positive definite, confer to Chapter 3.4 of Nocedal & Wright 2nd ed. - # Instead, we resort to lbfgs. - if self.verbose: - print( - " The inner solver stumbled upon an singular or ill-conditioned " - "Hessian matrix and resorts to LBFGS instead." - ) - self.use_fallback_lbfgs_solve = True - return - - class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): """Regression via a penalized Generalized Linear Model (GLM). @@ -635,7 +131,7 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): we have `y_pred = exp(X @ coeff + intercept)`. """ - # We allow for _NewtonSolver classes for the "solver" parameter but do not + # We allow for NewtonSolver classes for the "solver" parameter but do not # make them public in the docstrings. This facilitates testing and # benchmarking. 
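As the comment above notes, `solver` may also be a NewtonSolver subclass; fit() dispatches on issubclass(self.solver, NewtonSolver), as shown in the hunk below. A hedged sketch of that testing/benchmarking hook, assuming the private _GeneralizedLinearRegressor with its default squared error loss and assuming the hidden entry in _parameter_constraints lets the class pass parameter validation (the subclass name is made up for the example):

    import numpy as np

    from sklearn.linear_model._glm import _GeneralizedLinearRegressor
    from sklearn.linear_model._glm._newton_solver import NewtonCholeskySolver

    class MyCholeskySolver(NewtonCholeskySolver):
        # A do-nothing subclass stands in for an experimental solver variant.
        pass

    X = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
    y = np.array([1.0, 2.0, 3.0])

    # Passing the class (not an instance) exercises the
    # issubclass(self.solver, NewtonSolver) branch of fit().
    reg = _GeneralizedLinearRegressor(solver=MyCholeskySolver).fit(X, y)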
_parameter_constraints: dict = { @@ -785,7 +281,7 @@ def fit(self, X, y, sample_weight=None): self.n_iter_ = _check_optimize_result("lbfgs", opt_res) coef = opt_res.x elif self.solver == "newton-cholesky": - sol = _NewtonCholeskySolver( + sol = NewtonCholeskySolver( coef=coef, linear_loss=linear_loss, l2_reg_strength=l2_reg_strength, @@ -796,7 +292,7 @@ def fit(self, X, y, sample_weight=None): ) coef = sol.solve(X, y, sample_weight) self.n_iter_ = sol.iteration - elif issubclass(self.solver, _NewtonSolver): + elif issubclass(self.solver, NewtonSolver): sol = self.solver( coef=coef, linear_loss=linear_loss, diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 4390ee7620e9d..694626d7dba4a 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -26,7 +26,7 @@ TweedieRegressor, ) from sklearn.linear_model._glm import _GeneralizedLinearRegressor -from sklearn.linear_model._glm.glm import _NewtonCholeskySolver +from sklearn.linear_model._glm._newton_solver import NewtonCholeskySolver from sklearn.linear_model._linear_loss import LinearModelLoss from sklearn.exceptions import ConvergenceWarning from sklearn.metrics import d2_tweedie_score, mean_poisson_deviance @@ -1038,7 +1038,7 @@ def test_newton_solver_verbosity(capsys, verbose): y = np.array([1, 2], dtype=float) X = np.array([[1.0, 0], [0, 1]], dtype=float) linear_loss = LinearModelLoss(base_loss=HalfPoissonLoss(), fit_intercept=False) - sol = _NewtonCholeskySolver( + sol = NewtonCholeskySolver( coef=linear_loss.init_zero_coef(X), linear_loss=linear_loss, l2_reg_strength=0, @@ -1066,7 +1066,7 @@ def test_newton_solver_verbosity(capsys, verbose): assert m in captured.out # Set the Newton solver to a state with a completely wrong Newton step. - sol = _NewtonCholeskySolver( + sol = NewtonCholeskySolver( coef=linear_loss.init_zero_coef(X), linear_loss=linear_loss, l2_reg_strength=0, @@ -1088,7 +1088,7 @@ def test_newton_solver_verbosity(capsys, verbose): # Set the Newton solver to a state with bad Newton step such that the loss # improvement in line search is tiny. - sol = _NewtonCholeskySolver( + sol = NewtonCholeskySolver( coef=np.array([1e-12, 0.69314758]), linear_loss=linear_loss, l2_reg_strength=0, @@ -1118,7 +1118,7 @@ def test_newton_solver_verbosity(capsys, verbose): linear_loss = LinearModelLoss( base_loss=HalfTweedieLoss(power=3), fit_intercept=False ) - sol = _NewtonCholeskySolver( + sol = NewtonCholeskySolver( coef=linear_loss.init_zero_coef(X) + 1, linear_loss=linear_loss, l2_reg_strength=0,
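The verbosity test above drives NewtonCholeskySolver directly rather than through an estimator. The same construction doubles as a minimal end-to-end example of the `sol = NewtonSolver(...); sol.solve(X, y, sample_weight)` usage pattern from the class docstring; passing sample_weight=None is assumed to be accepted, as in the tests.

    import numpy as np

    from sklearn._loss.loss import HalfPoissonLoss
    from sklearn.linear_model._glm._newton_solver import NewtonCholeskySolver
    from sklearn.linear_model._linear_loss import LinearModelLoss

    # Same tiny Poisson problem as in test_newton_solver_verbosity.
    X = np.array([[1.0, 0.0], [0.0, 1.0]])
    y = np.array([1.0, 2.0])
    linear_loss = LinearModelLoss(base_loss=HalfPoissonLoss(), fit_intercept=False)

    sol = NewtonCholeskySolver(
        coef=linear_loss.init_zero_coef(X),
        linear_loss=linear_loss,
        l2_reg_strength=0.0,
        verbose=1,
    )
    coef = sol.solve(X, y, None)
    # With a log link and identity features, exp(coef) should be close to y.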