diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 01f0384af5c1d..c6da2843564b1 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -264,6 +264,17 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Enhancement| :class:`linear_model.LogisticRegression`, + :class:`linear_model.LogisticRegressionCV`, :class:`linear_model.GammaRegressor`, + :class:`linear_model.PoissonRegressor` and :class:`linear_model.TweedieRegressor` got + a new solver `solver="newton-lsmr"`. This is a 2nd order (Newton) optimisation + routine that uses the iterative LSMR algorithm: to find the Newton direction in each + step, the 2nd order equation is cast as a least squares problem and solved + iteratively, therefore called iteratively reweighted least squares (IRLS), via LSMR. + Due to using LSMR, only matrix-vector multiplications are used and sparse matrices + are supported as well. Especially for multiclass problems it might be worth a try. + :pr:`25462` by :user:`Christian Lorentzen `. + - |Fix| :class:`linear_model.ElasticNet`, :class:`linear_model.ElasticNetCV`, :class:`linear_model.Lasso` and :class:`linear_model.LassoCV` now explicitly don't accept large sparse data formats. :pr:`27576` by :user:`Stefanie Senger diff --git a/sklearn/linear_model/_glm/_newton_solver.py b/sklearn/linear_model/_glm/_newton_solver.py index 0b6adbe44e686..00470dd31d168 100644 --- a/sklearn/linear_model/_glm/_newton_solver.py +++ b/sklearn/linear_model/_glm/_newton_solver.py @@ -15,7 +15,7 @@ from ..._loss.loss import HalfSquaredError from ...exceptions import ConvergenceWarning from ...utils.optimize import _check_optimize_result -from .._linear_loss import LinearModelLoss +from .._linear_loss import LinearModelLoss, Multinomial_LDL_Decomposition class NewtonSolver(ABC): @@ -25,12 +25,26 @@ class NewtonSolver(ABC): iteration aims at finding the Newton step which is done by the inner solver. With Hessian H, gradient g and coefficients coef, one step solves: - H @ coef_newton = -g + H @ coef_newton = -G - For our GLM / LinearModelLoss, we have gradient g and Hessian H: + For our GLM / LinearModelLoss, we have gradient G and Hessian H: - g = X.T @ loss.gradient + l2_reg_strength * coef - H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity + G = X.T @ g + l2_reg_strength * P @ coef_old + H = X.T @ diag(h) @ X + l2_reg_strength * P + g = loss.gradient = pointwise gradient + h = loss.hessian = pointwise hessian + P = penalty matrix in 1/2 w @ P @ w, + for a pure L2 penalty without intercept it equals the identity matrix. + + stemming from the 2nd order Taylor series of the objective: + + loss(coef_old) + g @ X @ coef_newton + + 1/2 * coef_newton @ X.T @ diag(h) @ X @ coef_newton + + 1/2 * l2_reg_strength * coef @ P @ coef + = loss(coef_old) + 1/2 * l2_reg_strength * coef_old @ P @ coef_old + + G @ coef_newton + 1/2 * coef_newton @ H @ coef_newton + + where we have coef = coef_old + coef_newton. Backtracking line search updates coef = coef_old + t * coef_newton for some t in (0, 1]. @@ -138,8 +152,6 @@ def __init__( def setup(self, X, y, sample_weight): """Precomputations - If None, initializes: - - self.coef Sets: - self.raw_prediction - self.loss_value @@ -178,9 +190,10 @@ def fallback_lbfgs_solve(self, X, y, sample_weight): - self.coef - self.converged """ + # Note that LinearModelLoss expects coef.ravel(order"F") for multiclass case. 
opt_res = scipy.optimize.minimize( self.linear_loss.loss_gradient, - self.coef, + self.coef.ravel(order="F"), method="L-BFGS-B", jac=True, options={ @@ -194,6 +207,11 @@ def fallback_lbfgs_solve(self, X, y, sample_weight): ) self.n_iter_ = _check_optimize_result("lbfgs", opt_res) self.coef = opt_res.x + if self.linear_loss.base_loss.is_multiclass: + # No test case was found (yet) where NewtonLSMRSolver ends up in this code + # path. Note that NewtonCholeskySolver does not support muliclass problems. + n_classes = self.linear_loss.base_loss.n_classes + self.coef = self.coef.reshape((n_classes, -1), order="F") self.converged = opt_res.status == 0 def line_search(self, X, y, sample_weight): @@ -299,6 +317,10 @@ def line_search(self, X, y, sample_weight): self.raw_prediction = raw + def compute_d2(self, X, sample_weight): + """Compute square of Newton decrement.""" + return self.coef_newton @ self.hessian @ self.coef_newton + def check_convergence(self, X, y, sample_weight): """Check for convergence. @@ -306,6 +328,16 @@ def check_convergence(self, X, y, sample_weight): """ if self.verbose: print(" Check Convergence") + loss_value = self.linear_loss.loss( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + ) + print(f" loss = {loss_value}.") + # Note: Checking maximum relative change of coefficient <= tol is a bad # convergence criterion because even a large step could have brought us close # to the true minimum. @@ -324,7 +356,7 @@ def check_convergence(self, X, y, sample_weight): # d = sqrt(grad @ hessian^-1 @ grad) # = sqrt(coef_newton @ hessian @ coef_newton) # See Boyd, Vanderberghe (2009) "Convex Optimization" Chapter 9.5.1. - d2 = self.coef_newton @ self.hessian @ self.coef_newton + d2 = self.compute_d2(X, sample_weight=sample_weight) if self.verbose: print(f" 2. Newton decrement {0.5 * d2} <= {self.tol}") if 0.5 * d2 > self.tol: @@ -522,3 +554,555 @@ def inner_solve(self, X, y, sample_weight): ) self.use_fallback_lbfgs_solve = True return + + +class NewtonLSMRSolver(NewtonSolver): + """LSMR based inexact Newton solver. + + The inner solver uses LSMR [1] after the Newton update is cast into the iteratively + reweighted least squares (IRLS) formulation. This means + + H @ coef_newton = -G + + with + + G = X.T @ g + l2_reg_strength * P @ coef + H = X.T @ diag(h) @ X + l2_reg_strength * P + g = loss.gradient = pointwise gradient + h = loss.hessian = pointwise hessian + P = penalty matrix in 1/2 w @ P @ w, + for a pure L2 penalty without intercept it equals the identity matrix. + + is cast as a least squares problem + + min ||A @ coef_newton - b||_2^2 + + with + + A = [ sqrt(h) * X] + [sqrt(l2_reg_strength) * sqrt(P)] + b = [ - g / sqrt(h)] + [- sqrt(l2_reg_strength) * sqrt(P) @ self.coef] + + Notes: + - A is a square root of H: H = A'A + - A and b form G: G = -A'b + - The normal equation of this least squares problem, A'A coef_newton = A'b, is + again H @ coef_newton = -G. + + After the Newton iteration, and neglecting line search, the new coefficient is + self.coef += coef_newton. + + Note that this solver can naturally deal with sparse X. + + References + ---------- + .. [1] :arxiv:`Fong & Saunders "LSMR: An iterative algorithm for sparse + least-squares problems" <1006.0758>` + See also + https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.lsmr.html + + """ # noqa: E501 + + def setup(self, X, y, sample_weight): + """Setup. 
+ + Additionally to super().setup(), also sets: + - self.g = pointwise gradient + - self.h = pointwise hessian + - self.gradient + - self.sqrt_P = sqrt(l2_reg_strength) * sqrt(P) + - self.A_norm + - self.r_norm + - self.atol + - self.lsmr_iter + - self.sw_sum + - self.sqrt_sw_sum + """ + super().setup(X=X, y=y, sample_weight=sample_weight) + ( + self.g, + self.h, + ) = self.linear_loss.base_loss.init_gradient_and_hessian( + n_samples=X.shape[0], dtype=X.dtype + ) + n_samples, n_features = X.shape + # For a general (symmetric) penalty matrix P, we can use any square root of it, + # e.g. the Cholesky decomposition. For the simple L2-penalty, we can use the + # identity matrix and handle the intercept separately. + # We use a 1-d array of shape (n_features,) instead of a 2-d diagonal array to + # save memory. We also simply omit the zero at the end for the intercept. + # For the multiclass case, we use the same 1-d array of shape (n_features,) and + # do not store n_classes copies of it. + self.sqrt_P = np.full( + shape=n_features, fill_value=np.sqrt(self.l2_reg_strength), dtype=X.dtype + ) + + # In the inner_solve with LSMR, we set atol ~ 1 / (||A|| * ||r||) with the + # Frobenius norm ||A||, see below. We track this in every iteration with + # self.A_norm and self.r_norm. For the first call of inner_solve, we need an + # initial estimation of ||A||. We assume h = 1 and get + # ||A||^2 = ||X||^2 + sqrt(l2_reg_strength) ||sqrt(P)||^2 + # if scipy.sparse.issparse(X): + # self.A_norm = scipy.sparse.linalg.norm(X) ** 2 + # else: + # self.A_norm = scipy.linalg.norm(X) ** 2 + # self.A_norm += scipy.linalg.norm(self.sqrt_P) ** 2 + # self.A_norm = np.sqrt(self.A_norm) + # + # Another, even simpler choice is ||A|| = 1 which likely understimates + # ||A|| because ||X|| > 1 is very likely for typical (e.g. standardized) X. + # Underestimating ||A|| means overestimating atol ~ 1 / ||A||, which means the + # first iteration stops earlier. This is usually a good thing! + # Note: For n_featues > n_samples and no regularization, it seems that a higher + # starting A_norm is better which forces more initial LSMR iterations. We could + # even condider to make it dependent on log(l2_reg_strength). + self.A_norm = max(1, n_features / n_samples) + if n_features > n_samples and self.l2_reg_strength < 1e-7: + # We are in or close to an unpenalized underparametrized setting, where a + # large first LSMR iteration is beneficial. + # The multiplicative term is 1 <= term <= 50, log(1e-22) ~ -50 + self.A_norm *= 1 - np.log((self.l2_reg_strength + 1e-22) * 1e7) + self.r_norm = 1 + self.atol = 0 + self.lsmr_iter = 0 # number of total LSMR iterations + self.sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + self.sqrt_sw_sum = np.sqrt(self.sw_sum) + + def update_gradient_hessian(self, X, y, sample_weight): + """Update gradient and hessian. + + Update pointwise gradient and hessian, self.g and self.h, + as well as the full gradient, self.gradient. + """ + n_samples, n_features = X.shape + if not self.linear_loss.base_loss.is_multiclass: + self.linear_loss.base_loss.gradient_hessian( + y_true=y, + raw_prediction=self.raw_prediction, # this was updated in line_search + sample_weight=sample_weight, + gradient_out=self.g, + hessian_out=self.h, + n_threads=self.n_threads, + ) + self.g /= self.sw_sum + self.h /= self.sw_sum + + # See LinearModelLoss.gradient_hessian + # For non-canonical link functions and far away from the optimum, the + # pointwise hessian can be negative. 
We take care that 75% ot the hessian + # entries are positive. + self.hessian_warning = np.mean(self.h <= 0) > 0.25 + self.h = np.abs(self.h) + + # This duplicates a bit of code from LinearModelLoss.gradient. + weights, _ = self.linear_loss.weight_intercept(self.coef) + self.gradient = np.empty_like(self.coef, dtype=self.coef.dtype) + self.gradient[:n_features] = X.T @ self.g + self.l2_reg_strength * weights + if self.linear_loss.fit_intercept: + self.gradient[-1] = self.g.sum() + else: + # Here we may safely assume HalfMultinomialLoss. + # We use self.h to store the predicted probabilities instead of the hessian + self.linear_loss.base_loss.gradient_proba( + y_true=y, + raw_prediction=self.raw_prediction, # this was updated in line_search + sample_weight=sample_weight, + gradient_out=self.g, + proba_out=self.h, + n_threads=self.n_threads, + ) + self.g /= self.sw_sum + # Note: Multinomial hessian is always non-negative. + + # This duplicates a bit of code from LinearModelLoss.gradient. + n_classes = self.linear_loss.base_loss.n_classes + n_dof = n_features + int(self.linear_loss.fit_intercept) + weights, _ = self.linear_loss.weight_intercept(self.coef) + self.gradient = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") + # self.g.shape = (n_samples, n_classes) + self.gradient[:, :n_features] = ( + self.g.T @ X + self.l2_reg_strength * weights + ) + if self.linear_loss.fit_intercept: + self.gradient[:, -1] = self.g.sum(axis=0) + + def compute_A_b(self, X, y, sample_weight): + """Compute A and b for IRLS formulation. + + Returns + ------- + A : LinearOperator of shape (n_samples + n_features, n_dof) or \ + (n_samples + n_features) * n_classes, n_dof * n_classes) + For the multiclass case, `A @ x` expects `x = coef.ravel(order="C")` for + `coef.shape = (n_classes, n_dof)`, and `A.T @ x` expects + `x = b.ravel(order="F")` for `b.shape=(n_samples + n_features, n_classes)`. + + b : ndarray of shape (n_samples + n_features) or \ + ((n_samples + n_features) * n_classes) + """ + n_samples, n_features = X.shape + + if not self.linear_loss.base_loss.is_multiclass: + n_classes = 1 + sqrt_h = np.sqrt(self.h) + + b = np.r_[-self.g / sqrt_h, -self.sqrt_P * self.coef[:n_features]] + + if self.linear_loss.fit_intercept: + n_dof = n_features + 1 + + def matvec(x): + # A @ x with intercept + # We assume self.sqrt_P to be 1-d array of shape (n_features,), + # representing a diagonal matrix. + return np.r_[sqrt_h * (X @ x[:-1] + x[-1]), self.sqrt_P * x[:-1]] + + def rmatvec(x): + # A.T @ x with intercept + return np.r_[ + X.T @ (sqrt_h * x[:n_samples]) + self.sqrt_P * x[n_samples:], + sqrt_h @ x[:n_samples], + ] + + else: + n_dof = n_features + + def matvec(x): + # A @ x without intercept + return np.r_[sqrt_h * (X @ x), self.sqrt_P * x] + + def rmatvec(x): + # A.T @ x without intercept + return X.T @ (sqrt_h * x[:n_samples]) + self.sqrt_P * x[n_samples:] + + else: + # Here we may safely assume HalfMultinomialLoss. + n_classes = self.linear_loss.base_loss.n_classes + p = self.h # probability array of shape (n_samples, n_classes) + # We need a square root of X' h X with pointwise hessian h. Note that h is + # a priori a 4-dimensional matrix (n_samples ** 2, n_classes ** 2) and is + # diagonal in n_samples ** 2, i.e. effectively a 3-dimensional matrix. + # To accomplish this, we need the Cholesky or LDL' decomposition of h + # h = diag(p) - p' p + # or with indices i and j for classes + # h_ij = p_i * delta_ij - p_i * p_j + # This holds for every single point (n_sample). 
To tackle this, we use the + # class Multinomial_LDL_Decomposition, which provides further details. + # + # We have h = L D L' with lower triangular L having unit diagonal and + # diagonal D. This gives + # C = sqrt(D) L' X + # X' h X = C' C + # and C is our searched for square root in + # A = [ C] + # [sqrt(l2_reg_strength) * sqrt(P)] + # b = [ -(L sqrt(D))^(-1) g] + # [- sqrt(l2_reg_strength) * sqrt(P) @ self.coef] + + # Note again + # self.g.shape = p.shape = (n_samples, n_classes) + # self.coef.shape = (n_classes, n_dof) + # sqrt(h) becomes sqrt(D) L' + LDL = Multinomial_LDL_Decomposition(proba=p) + self.LDL = LDL # store it for compute_d2 + + # We need "g/sqrt(h)" such that g = L sqrt(D) "g/sqrt(h)", i.e. + # "g/sqrt(h)" = 1/sqrt(D) L^-1 @ g. + # Note that L^-1 is again lower triangular. + g_over_h = self.g.copy() # F-contiguous, shape (n_samples, n_classes) + LDL.inverse_L_sqrt_D_matmul(g_over_h) + if sample_weight is not None: + # Sample weights sw enter the hessian multiplicatively, i.e. sw * h. + # They can be dealt with by adding them to D, which means sqrt(sw) + # to sqrt(D) in every place. + sqrt_sw = np.sqrt(sample_weight) / self.sqrt_sw_sum + g_over_h /= sqrt_sw[:, None] + else: + sqrt_sw = 1 / self.sqrt_sw_sum + g_over_h *= self.sqrt_sw_sum + + # For ravelled results we use the convention that all values for the same + # class are in sequence. For n_classes = 2, the ravelled b looks like + # [ -g_over_h[:, 0]] n_samples elements of class 0 + # [-self.sqrt_P * coef.T[:, 0]] n_features elements of class 0 + # [ -g_over_h[:, 1]] n_samples elements of class 1 + # [-self.sqrt_P * coef.T[:, 1]] n_features elements of class 1 + # Note that this convention is different from + # LinearModelLoss.gradient_hessian_product which expects raveled coef to + # to have all classes in sequence. + b = np.r_[ + -g_over_h, + -self.sqrt_P[:, None] * self.coef[:, :n_features].T, + ].ravel(order="F") + + # Note on performance: + # X @ coef.T returns a C-contiguous array, but due to slicing along the + # first axis (along samples like x[:, i]), sqrt_D_Lt_matmul is more + # efficient for F-contiguous arrays. The output of X @ c can be made + # F-contigous by (c.T @ X.T).T. + + if self.linear_loss.fit_intercept: + n_dof = n_features + 1 + + def matvec(x): + # A @ x with or without intercept + # We assume self.sqrt_P to be 1-d array of shape (n_features,), + # representing a diagonal matrix. 
+ coef = x.reshape((n_classes, -1), order="C") + # C_coef = sqrt(D) L' X coef + # C_coef = X @ coef[:, :-1].T + coef[:, -1] # C-contiguous + # F-contiguous version, see note above: + C_coef = (coef[:, :-1] @ X.T + coef[:, -1].T[:, None]).T + LDL.sqrt_D_Lt_matmul(C_coef) + # C_coef.shape = (n_samples, n_classes) + if sample_weight is not None: + C_coef *= sqrt_sw[:, None] + else: + C_coef *= sqrt_sw + # For n_classes = 2, the ravelled result looks like + # [ C_Coef[:, 0]] n_samples elements of class 0 + # [sqrt_P * coef.T[:, 0]] n_features elements of class 0 + # [ C_Coef[:, 1]] n_samples elements of class 1 + # [sqrt_P * coef.T[:, 1]] n_features elements of class 1 + return np.r_[ + C_coef, + self.sqrt_P[:, None] * coef[:, :n_features].T, + ].ravel(order="F") + + def rmatvec(x): + # A.T @ x with intercept + # Ct_y = X' L sqrt(D) y + x = x.reshape((-1, n_classes), order="F") + y = x[:n_samples, :] # y is neither C nor F-contiguous + y = y.copy(order="F") + if sample_weight is not None: + y *= sqrt_sw[:, None] + else: + y *= sqrt_sw + L_sqrtD_y = LDL.L_sqrt_D_matmul(y) + Ct_y = X.T @ L_sqrtD_y # shape = (n_features, n_classes) + return np.r_[ + Ct_y + self.sqrt_P[:, None] * x[n_samples:, :], + np.sum(L_sqrtD_y, axis=0)[None, :], + ].ravel(order="F") + + else: + n_dof = n_features + + def matvec(x): + # A @ x without intercept + coef = x.reshape((n_classes, -1), order="C") + # C_coef = sqrt(D) L' X coef + # C_coef = X @ coef.T # C-contiguous + # F-contiguous version, see note above: + C_coef = (coef @ X.T).T + LDL.sqrt_D_Lt_matmul(C_coef) + if sample_weight is not None: + C_coef *= sqrt_sw[:, None] + else: + C_coef *= sqrt_sw + return np.r_[ + C_coef, + self.sqrt_P[:, None] * coef.T, + ].ravel(order="F") + + def rmatvec(x): + # A.T @ x without intercept + # Ct_y = X' L sqrt(D) y + x = x.reshape((-1, n_classes), order="F") + y = x[:n_samples, :] # y is neither C nor F-contiguous + y = y.copy(order="F") + if sample_weight is not None: + y *= sqrt_sw[:, None] + else: + y *= sqrt_sw + L_sqrtD_y = LDL.L_sqrt_D_matmul(y) + Ct_y = X.T @ L_sqrtD_y # shape = (n_features, n_classes) + return (Ct_y + self.sqrt_P[:, None] * x[n_samples:, :]).ravel( + order="F" + ) + + # Note that initializing LinearOperator seems to have some surprisingly sizable + # overhead. + A = scipy.sparse.linalg.LinearOperator( + shape=((n_samples + n_features) * n_classes, n_dof * n_classes), + matvec=matvec, + rmatvec=rmatvec, + ) + + return A, b + + def inner_solve(self, X, y, sample_weight): + """Compute Newton step. + + Sets: + - self.coef_newton via LSMR + As LSMR is an iterative method, NewtonLSMRSolver is an inexact Newton + method. + - self.gradient_times_newton + - self.lsmr_iter + - self.A_norm + - self.r_norm + - self.atol + + A_norm and r_norm are used for the inner tolerance used in LSMR. + """ + n_samples, n_features = X.shape + if not self.linear_loss.base_loss.is_multiclass: + n_classes = 1 + else: + n_classes = self.linear_loss.base_loss.n_classes + A, b = self.compute_A_b(X, y, sample_weight) + # The choice of atol in LSMR is essential for stability and for computation + # time. For n_samples > n_features, we most certainly have a least squares + # problem (no solution to the linear equation A x = b), such that the following + # stopping criterion with residual r = b - A x and x = coef_newton applies: + # ||A' r|| = ||H x + G|| <= atol * ||A|| * ||r||. + # For inexact Newton solvers, res = H x + G is called the residual and one + # usually chooses, see Eq. 
7.3 Nocedal & Wright 2nd ed, a stopping criterion + # ||res|| = ||A' r|| <= eta * ||G|| + # with a forcing sequence 0 < eta < 1 (eta_k for iteration k). + # As for our Newton conjugate gradient solver "_newton_cg", we set + # eta = min(0.5, np.sqrt(||G||)) + # which establishes a superlinear rate of convergence. + # Fortunately, we get approximations of the Frobenius norm of A and the norm of + # r, ||A|| and ||r|| respectively, for free by LSMR such that we can set + # atol = eta * ||G|| / (||A|| * ||r||) + # This results in our desired stopping criterion + # ||res|| = ||A' r|| <= eta * ||G|| + # at least approximately, as we have to use ||A|| and ||r|| from the last and + # not the current iteration. + # + # In the case of perfect interpolation, n_features >= n_samples, we might have + # an exact solution to A x = b. LSMR then uses the stopping criterion + # ||r|| <= atol * ||A|| * ||x|| + btol * ||b||. + # Note that + # ||b||^2 = ||g / sqrt(h)||^2 + l2_reg_strength * x0 @ P @ x0 + # and + # 1/2 * ||r||^2 = 1/2 * x @ H @ x + G @ x + 1/2 * ||b||^2 + # Here, x is the solution of Ax=b, i.e. x=coef_newton, and, neglecting line + # search, coef = x + x0. Thus, 1/2 ||r||^2 is just the Taylor series of the + # objective with the zero order term loss(x0) replaced by + # 1/2 * ||g/sqrt(h)||^2. + # + # We set btol to the strict self.tol to help cases of collinearity in X when + # there is (almost) no L2 penalty. + norm_G = scipy.linalg.norm(self.gradient) + eta = min(0.5, np.sqrt(norm_G)) + # Note that norm_G might not be decreasing from Newton iteration to iteration. + # This means we are still not close to the minimum and it is good when artol + # is then loose enough. + if self.verbose >= 3: + print(f" norm(gradient) = {norm_G}") + # Avoid division by 0 by adding tiny 1e-16. + atol = eta * norm_G / (self.A_norm * self.r_norm + 1e-16) + # Avoid that atol gets tiny too fast. + if atol < self.atol / 10: + atol = self.atol / 10 * np.sqrt(atol * 10 / self.atol) + self.atol = atol + if self.verbose >= 3: + print(f" {self.A_norm=} {self.r_norm=} {eta=} {atol=}") + result = scipy.sparse.linalg.lsmr( + A, + b, + damp=0, + atol=atol, + btol=self.tol, + maxiter=max(n_samples, n_features) * n_classes, # default is min(A.shape) + # default conlim = 1e8, for compatible systems 1e12 is still reasonable, + # see LSMR documentation + conlim=1e12, + show=self.verbose >= 3, + ) + # We store the estimated Frobenius norm of A and norm of residual r in + # self.A_norm and self.r_norm for tolerance of next iteration. + ( + self.coef_newton, + istop, + itn, + self.r_norm, + normar, + self.A_norm, + conda, + normx, + ) = result + self.lsmr_iter += itn + if self.verbose >= 2: + print(f" Inner iterations in LSMR = {itn}, total = {self.lsmr_iter}") + if self.coef.dtype == np.float32: + self.coef_newton = self.coef_newton.astype(np.float32) + if not self.linear_loss.base_loss.is_multiclass: + self.gradient_times_newton = self.gradient @ self.coef_newton + else: + self.coef_newton = self.coef_newton.reshape((n_classes, -1), order="C") + # Center result such that sum over classes equals zero. + # Note: The L2 penalty guarantees that coef_newton step is centered, i.e. + # np.mean(self.coef_newton, axis=0) = 0. + # But for the intercept term self.coef_newton[:, -1] this is less clear. + # Nevertheless, all tests pass without the following line. 
+ # self.coef_newton -= np.mean(self.coef_newton, axis=0) + + self.gradient_times_newton = self.gradient.ravel( + order="F" + ) @ self.coef_newton.ravel(order="F") + + # In the first Newton iteraton, we tolerate istop == 7 and other things. We + # just got started and are therefore forgiving. + if self.iteration == 1: + return + # Note: We could detect too large steps by comparing norm(coef_newton) = normx + # with norm(gradient) or with the already available condition number of A, e.g. + # conda. + if istop == 7: + self.use_fallback_lbfgs_solve = True + msg = ( + f"The inner solver of {self.__class__.__name__} reached " + f"maxiter={itn} before the other stopping conditions were " + f"satisfied at iteration #{self.iteration}. " + ) + elif istop in (3, 6): + self.use_fallback_lbfgs_solve = True + msg = ( + f"The inner solver of {self.__class__.__name__} complained that the " + f"condition number of A (A'A = Hessian), conda={conda}, seems to be " + "greater than the given limit conlim=1e8 at iteration " + f"#{self.iteration}." + ) + + if self.use_fallback_lbfgs_solve: + warnings.warn( + msg + "It will now resort to lbfgs instead.\n" + "This may be caused by singular or very ill-conditioned Hessian " + "matrix. " + "Further options are to use another solver or to avoid such situation " + "in the first place. Possible remedies are removing collinear features" + "of X or increasing the penalization strengths.", + ConvergenceWarning, + ) + if self.verbose: + print( + " The inner solver had problems to converge and resorts to lbfgs." + ) + self.use_fallback_lbfgs_solve = True + return + + def compute_d2(self, X, sample_weight): + """Compute square of Newton decrement.""" + weights, intercept, raw_prediction = self.linear_loss.weight_intercept_raw( + self.coef_newton, X + ) + if not self.linear_loss.base_loss.is_multiclass: + d2 = np.sum(raw_prediction * self.h * raw_prediction) + else: + d = self.LDL.sqrt_D_Lt_matmul(raw_prediction) + if sample_weight is not None: + d *= sample_weight[:, None] + d = d.ravel(order="F") + d2 = d @ d + d2 += 2 * self.linear_loss.l2_penalty(weights, self.l2_reg_strength) + return d2 + + def finalize(self, X, y, sample_weight): + if hasattr(self, "LDL"): + delattr(self, "LDL") diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 4cac889a4da51..5d99ba97ddd41 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -25,7 +25,12 @@ from ...utils.optimize import _check_optimize_result from ...utils.validation import _check_sample_weight, check_is_fitted from .._linear_loss import LinearModelLoss -from ._newton_solver import NewtonCholeskySolver, NewtonSolver +from ._newton_solver import NewtonCholeskySolver, NewtonLSMRSolver, NewtonSolver + +NEWTON_SOLVER = { + "newton-cholesky": NewtonCholeskySolver, + "newton-lsmr": NewtonLSMRSolver, +} class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): @@ -65,7 +70,7 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-lsmr'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' @@ -81,6 +86,20 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): .. 
versionadded:: 1.2 + 'newton-lsmr' + Uses Newton-Raphson steps formulated as iteratively reweighted least + squares (IRLS), which is solved by LSMR. Contrary to `newton-cholesky`, + this solver does not explicitly materialize the Hessian matrix but instead + leverages knowledge about its structure to incrementally solve the least + squares problems via a series of matrix vector operations where the + matrices have block structure with block sizes scaling as + `(n_samples, n_features)` and `(n_samples, n_classes)` therefore limiting + the memory requirements. + Additionaly, this is numerically more stable for ill-conditioned X compared + to `newton-cholesky`. + + .. versionadded:: 1.5 + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. @@ -140,7 +159,7 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): "alpha": [Interval(Real, 0.0, None, closed="left")], "fit_intercept": ["boolean"], "solver": [ - StrOptions({"lbfgs", "newton-cholesky"}), + StrOptions({"lbfgs", "newton-cholesky", "newton-lsmr"}), Hidden(type), ], "max_iter": [Interval(Integral, 1, None, closed="left")], @@ -284,8 +303,8 @@ def fit(self, X, y, sample_weight=None): ) self.n_iter_ = _check_optimize_result("lbfgs", opt_res) coef = opt_res.x - elif self.solver == "newton-cholesky": - sol = NewtonCholeskySolver( + elif self.solver in NEWTON_SOLVER.keys(): + sol = NEWTON_SOLVER[self.solver]( coef=coef, linear_loss=linear_loss, l2_reg_strength=l2_reg_strength, @@ -483,7 +502,7 @@ class PoissonRegressor(_GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (`X @ coef + intercept`). - solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-lsmr'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' @@ -499,6 +518,18 @@ class PoissonRegressor(_GeneralizedLinearRegressor): .. versionadded:: 1.2 + 'newton-lsmr' + Uses Newton-Raphson steps formulated as iteratively reweighted least + squares (IRLS), which is solved by LSMR. Contrary to `newton-cholesky`, + this solver does not explicitly materialize the Hessian matrix but instead + leverages knowledge about its structure to incrementally solve the least + squares problems via a series of matrix vector operations where the + matrices have size `(n_samples, n_features)`. + Additionaly, this is numerically more stable for ill-conditioned X compared + to `newton-cholesky`. + + .. versionadded:: 1.5 + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. @@ -614,7 +645,7 @@ class GammaRegressor(_GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor `X @ coef_ + intercept_`. - solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-lsmr'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' @@ -630,6 +661,18 @@ class GammaRegressor(_GeneralizedLinearRegressor): .. versionadded:: 1.2 + 'newton-lsmr' + Uses Newton-Raphson steps formulated as iteratively reweighted least + squares (IRLS), which is solved by LSMR. 
Contrary to `newton-cholesky`, + this solver does not explicitly materialize the Hessian matrix but instead + leverages knowledge about its structure to incrementally solve the least + squares problems via a series of matrix vector operations where the + matrices have size `(n_samples, n_features)`. + Additionaly, this is numerically more stable for ill-conditioned X compared + to `newton-cholesky`. + + .. versionadded:: 1.3 + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. @@ -776,7 +819,7 @@ class TweedieRegressor(_GeneralizedLinearRegressor): - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian distributions - solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-lsmr'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' @@ -792,6 +835,18 @@ class TweedieRegressor(_GeneralizedLinearRegressor): .. versionadded:: 1.2 + 'newton-lsmr' + Uses Newton-Raphson steps formulated as iteratively reweighted least + squares (IRLS), which is solved by LSMR. Contrary to `newton-cholesky`, + this solver does not explicitly materialize the Hessian matrix but instead + leverages knowledge about its structure to incrementally solve the least + squares problems via a series of matrix vector operations where the + matrices have size `(n_samples, n_features)`. + Additionaly, this is numerically more stable for ill-conditioned X compared + to `newton-cholesky`. + + .. versionadded:: 1.3 + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 26f6bdc08d254..28f893038f286 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -12,11 +12,17 @@ from numpy.testing import assert_allclose from scipy import linalg from scipy.optimize import minimize, root +from scipy.sparse.linalg import lsmr -from sklearn._loss import HalfBinomialLoss, HalfPoissonLoss, HalfTweedieLoss +from sklearn._loss import ( + HalfBinomialLoss, + HalfMultinomialLoss, + HalfPoissonLoss, + HalfTweedieLoss, +) from sklearn._loss.link import IdentityLink, LogLink from sklearn.base import clone -from sklearn.datasets import make_low_rank_matrix, make_regression +from sklearn.datasets import make_classification, make_low_rank_matrix, make_regression from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import ( GammaRegressor, @@ -25,12 +31,18 @@ TweedieRegressor, ) from sklearn.linear_model._glm import _GeneralizedLinearRegressor -from sklearn.linear_model._glm._newton_solver import NewtonCholeskySolver -from sklearn.linear_model._linear_loss import LinearModelLoss +from sklearn.linear_model._glm._newton_solver import ( + NewtonCholeskySolver, + NewtonLSMRSolver, +) +from sklearn.linear_model._linear_loss import ( + LinearModelLoss, + Multinomial_LDL_Decomposition, +) from sklearn.metrics import d2_tweedie_score, mean_poisson_deviance from sklearn.model_selection import train_test_split -SOLVERS = ["lbfgs", "newton-cholesky"] +SOLVERS = ["lbfgs", "newton-cholesky", "newton-lsmr"] class BinomialRegressor(_GeneralizedLinearRegressor): @@ -237,7 +249,12 @@ def test_glm_regression(solver, fit_intercept, glm_dataset): model.fit(X, y) - rtol = 5e-5 if solver == "lbfgs" else 1e-9 + if solver == "lbfgs": + rtol = 5e-5 + elif solver == "newton-lsmr": + rtol = 
5e-9 + else: + rtol = 1e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) @@ -288,7 +305,12 @@ def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): warnings.simplefilter("ignore", ConvergenceWarning) model.fit(X, y) - rtol = 2e-4 if solver == "lbfgs" else 5e-9 + if solver == "lbfgs": + rtol = 2e-4 + elif solver == "newton-lsmr": + rtol = 2e-8 + else: + rtol = 5e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol) @@ -328,7 +350,12 @@ def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset): intercept = 0 model.fit(X, y) - rtol = 3e-5 if solver == "lbfgs" else 5e-9 + if solver == "lbfgs": + rtol = 3e-5 + elif solver == "newton-lsmr": + rtol = 1e-8 + else: + rtol = 5e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) @@ -391,7 +418,7 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): norm_solution = np.linalg.norm(np.r_[intercept, coef]) norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) - if solver == "newton-cholesky": + if solver in ("newton-cholesky", "newton-lsmr"): # XXX: This solver shows random behaviour. Sometimes it finds solutions # with norm_model <= norm_solution! So we check conditionally. if norm_model < (1 + 1e-12) * norm_solution: @@ -484,7 +511,10 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase # we get a solution, i.e. a (non-unique) minimum of the objective function ... rtol = 1e-6 if solver == "lbfgs" else 5e-6 assert_allclose(model.predict(X), y, rtol=rtol) - if (solver == "lbfgs" and fit_intercept) or solver == "newton-cholesky": + if (solver == "lbfgs" and fit_intercept) or solver in ( + "newton-cholesky", + "newton-lsmr", + ): # Same as in test_glm_regression_unpenalized. # But it is not the minimum norm solution. Otherwise the norms would be # equal. @@ -558,7 +588,7 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase norm_solution = np.linalg.norm(np.r_[intercept, coef]) norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) - if solver == "newton-cholesky": + if solver in ("newton-cholesky", "newton-lsmr"): # XXX: This solver shows random behaviour. Sometimes it finds solutions # with norm_model <= norm_solution! So we check conditionally. if not (norm_model > (1 + 1e-12) * norm_solution): @@ -570,7 +600,7 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase # equal. 
assert norm_model > (1 + 1e-12) * norm_solution else: - rtol = 1e-5 if solver == "newton-cholesky" else 1e-4 + rtol = 1e-5 if solver in ("newton-cholesky", "newton-lsmr") else 1e-4 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) @@ -837,7 +867,7 @@ def test_normal_ridge_comparison( assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=2e-4) -@pytest.mark.parametrize("solver", ["lbfgs", "newton-cholesky"]) +@pytest.mark.parametrize("solver", ["lbfgs", "newton-cholesky", "newton-lsmr"]) def test_poisson_glmnet(solver): """Compare Poisson regression with L2 regularization and LogLink to glmnet""" # library("glmnet") @@ -927,8 +957,8 @@ def test_tags(estimator, value): assert estimator._get_tags()["requires_positive_y"] is value -def test_linalg_warning_with_newton_solver(global_random_seed): - newton_solver = "newton-cholesky" +@pytest.mark.parametrize("newton_solver", ["newton-cholesky", "newton-lsmr"]) +def test_linalg_warning_with_newton_solver(newton_solver, global_random_seed): rng = np.random.RandomState(global_random_seed) # Use at least 20 samples to reduce the likelihood of getting a degenerate # dataset for any global_random_seed. @@ -979,11 +1009,17 @@ def test_linalg_warning_with_newton_solver(global_random_seed): # Fitting a Newton solver on the collinear version of the training data # without regularization should raise an informative warning and fallback # to the LBFGS solver. - msg = ( - "The inner solver of .*Newton.*Solver stumbled upon a singular or very " - "ill-conditioned Hessian matrix" - ) - with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): + if newton_solver == "newton-cholesky": + msg = ( + "The inner solver of .*Newton.*Solver stumbled upon a singular or very " + "ill-conditioned Hessian matrix" + ) + with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): + reg = PoissonRegressor(solver=newton_solver, alpha=0.0, tol=tol).fit( + X_collinear, y + ) + else: + # newton-lsmr has no problems with collinearity reg = PoissonRegressor(solver=newton_solver, alpha=0.0, tol=tol).fit( X_collinear, y ) @@ -1008,6 +1044,115 @@ def test_linalg_warning_with_newton_solver(global_random_seed): ) +@pytest.mark.parametrize( + "solver, warn1, msg1, warn2, msg2", + [ + ( + "lbfgs", + None, + None, + [RuntimeWarning], + ["invalid value encountered in matmul"], + ), + ( + "newton-cholesky", + scipy.linalg.LinAlgWarning, + ( + "The inner solver of .*Newton.*Solver stumbled upon a singular or very " + "ill-conditioned Hessian matrix" + ), + [scipy.linalg.LinAlgWarning, RuntimeWarning], + [ + ( + "The inner solver of .*Newton.*Solver stumbled upon a singular or" + " very ill-conditioned Hessian matrix" + ), + "invalid value encountered in matmul", + ], + ), + ( + "newton-lsmr", + None, + None, + [ConvergenceWarning], + ["Newton solver did not converge after .* iterations"], + ), + ], +) +def test_solver_on_ill_conditioned_X( + solver, warn1, msg1, warn2, msg2, global_random_seed +): + """Test GLM solvers on ill conditioned X with high condition number. + + Note that numbers in this test are tuned such that is passes for all global random + seeds. + """ + rng = np.random.RandomState(global_random_seed) + # Use at least 20 samples to reduce the likelihood of getting a degenerate + # dataset for any global_random_seed. 
+ X_orig = rng.uniform(low=-1, high=1, size=(20, 3)) + y = rng.poisson( + np.exp(X_orig @ np.ones(X_orig.shape[1])), size=X_orig.shape[0] + ).astype(np.float64) + + tol = 1e-8 + model = PoissonRegressor(solver=solver, alpha=0.0, tol=tol, max_iter=200) + + # No warning raised on well-conditioned design, even without regularization. + with warnings.catch_warnings(): + warnings.simplefilter("error") + reg = clone(model).fit(X_orig, y) + original_deviance = mean_poisson_deviance(y, reg.predict(X_orig)) + + # Construct an ill-conditioned problem by some almost identical columns. + X_ill_conditioned = X_orig.copy() + X_ill_conditioned[:, 1] = X_ill_conditioned[:, 0] + X_ill_conditioned[0, 1] += 1e-10 + X_ill_conditioned[-1, 1] -= 1e-10 + # Make sure that it is ill-conditioned <=> large condition number. + assert np.linalg.cond(X_ill_conditioned) > 1e10 * np.linalg.cond(X_orig) + + if warn1 is None: + # Assert that no warning is raised. + with warnings.catch_warnings(): + warnings.simplefilter("error") + reg = clone(model).fit(X_ill_conditioned, y) + else: + # Assert that the given warning is raised. + with pytest.warns(warn1, match=msg1): + reg = clone(model).fit(X_ill_conditioned, y) + + # Construct another ill-conditioned problem by scaling of features. + X_ill_conditioned = X_orig.copy() + X_ill_conditioned[:, 0] *= 1e-6 + X_ill_conditioned[:, 1] *= 1e2 # too large X may overflow in link function + # Make sure that it is ill conditioned >=> large condition number. + assert np.linalg.cond(X_ill_conditioned) > 1e5 * np.linalg.cond(X_orig) + + test_loss = False + if warn2 is None: + with warnings.catch_warnings(): + warnings.simplefilter("error") + reg = clone(model).fit(X_ill_conditioned, y) + else: + # Whether or not a warning is raised depends on global_random_seed. + with warnings.catch_warnings(record=True) as w: + for warn in warn2: + warnings.simplefilter("ignore", warn) + reg = clone(model).fit(X_ill_conditioned, y) + test_loss = True + assert len(w) == 0 + + if test_loss: + # Without penalty, scaling of columns has no effect on predictions. + ill_cond_deviance = mean_poisson_deviance(y, reg.predict(X_ill_conditioned)) + if solver in ("lbfgs", "newton-cholesky"): + pytest.xfail( + f"Solver {solver} does not converge but does so without warning." + ) + assert ill_cond_deviance == pytest.approx(original_deviance, rel=1e-2) + + @pytest.mark.parametrize("verbose", [0, 1, 2]) def test_newton_solver_verbosity(capsys, verbose): """Test the std output of verbose newton solvers.""" @@ -1109,3 +1254,325 @@ def test_newton_solver_verbosity(capsys, verbose): "The inner solver detected a pointwise Hessian with many negative values" " and resorts to lbfgs instead." 
in captured.out ) + + +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("l2_reg_strength", [0, 1.5]) +def test_NewtonLSMRSolver_multinomial_A_b( + fit_intercept, l2_reg_strength, global_random_seed +): + """Test NewtonLSMRSolver.compute_A_b for multinomial case""" + n_samples, n_features, n_classes = 12, 10, 5 + n_dof = n_features + fit_intercept + rng = np.random.RandomState(global_random_seed) + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_classes=n_classes, + n_informative=n_features - 1, + n_redundant=1, + random_state=rng, + ) + y = y.astype(float) + coef = rng.standard_normal(size=(n_classes, n_features + fit_intercept)) + coef -= np.mean(coef, axis=0) + + multinomial_loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), fit_intercept=fit_intercept + ) + gradient, hessp = multinomial_loss.gradient_hessian_product( + coef=coef, + X=X, + y=y, + sample_weight=None, + l2_reg_strength=l2_reg_strength, + ) + sol = NewtonLSMRSolver( + coef=coef, + linear_loss=multinomial_loss, + l2_reg_strength=l2_reg_strength, + ) + sol.setup(X=X, y=y, sample_weight=None) + sol.update_gradient_hessian(X=X, y=y, sample_weight=None) + + # Hessian @ coef + # with hessp + H_coef = hessp(coef) # shape = (n_classes, n_dof) + # with A + A, b = sol.compute_A_b(X=X, y=y, sample_weight=None) + At_A_coef = A.T @ A @ coef.ravel(order="C") + # The results are differently ravelled, we restore the 2-d arrays with n_classes + # on the 1st (=last) axis. + assert_allclose(At_A_coef.reshape(-1, n_classes, order="F"), H_coef.T) + # Note: The following does not work for all global_random_seeds. The reason behind + # it is unclear. + # assert_allclose( + # A.rmatvec(b).reshape(-1, n_classes, order="F"), -gradient.T, rtol=1e-4 + # ) + + # Test consistency of A, i.e. reconstructing the matrix based on + # A @ unit_vector and A.T @ unit_vector should give the same matrix. 
+ unit_vec = np.zeros(n_dof * n_classes) + A_matrix1 = np.zeros(((n_samples + n_features) * n_classes, n_dof * n_classes)) + for i in range(n_dof * n_classes): + unit_vec[i] = 1 + A_matrix1[:, i] = A @ unit_vec + unit_vec[i] = 0 + unit_vec = np.zeros((n_samples + n_features) * n_classes) + A_matrix2 = np.zeros(((n_samples + n_features) * n_classes, n_dof * n_classes)) + for j in range((n_samples + n_features) * n_classes): + unit_vec[j] = 1 + A_matrix2[j, :] = A.rmatvec(unit_vec) + unit_vec[j] = 0 + assert_allclose(A_matrix1, A_matrix2) + + +@pytest.mark.parametrize("fit_intercept", (False, True)) +@pytest.mark.parametrize("with_sample_weight", (False, True)) +def test_NewtonLSMRSolver_multinomial_A_b_on_3_classes( + fit_intercept, with_sample_weight, global_random_seed +): + """Test NewtonLSMRSolver.compute_A_b for multinomial case with 3 classes.""" + n_samples, n_features, n_classes = 5, 2, 3 + n_dof = n_features + fit_intercept + rng = np.random.RandomState(global_random_seed) + X = np.array([[1.0, 0], [1, 1], [1, 2], [1, 3], [1, 4]]) + Xi = X + # coef.shape = (n_classes, n_dof) and coef.sum(axis=0) = 0 + coef = np.array([[1.0, -1], [1, 0], [-2, 1]]) + if fit_intercept: + X[0, 0] = -1 # to make Xi positive definite + coef = np.c_[coef, [[2], [1], [-3]]] + Xi = np.c_[X, np.ones(n_samples)] + assert np.linalg.matrix_rank(Xi) == n_dof + y = np.array([0.0, 1, 1, 1, 2]) + + if with_sample_weight: + sw = 1.0 + np.arange(n_samples) + sw_sum = np.sum(sw) + else: + sw = None + sw_sum = n_samples + + multinomial_loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes, sample_weight=sw), + fit_intercept=fit_intercept, + ) + sol = NewtonLSMRSolver( + coef=coef, + linear_loss=multinomial_loss, + l2_reg_strength=0, + ) + sol.setup(X=X, y=y, sample_weight=sw) + sol.update_gradient_hessian(X=X, y=y, sample_weight=sw) + A, b = sol.compute_A_b(X=X, y=y, sample_weight=sw) + + _, _, raw_prediction = multinomial_loss.weight_intercept_raw(coef, X) + # p.shape = (n_samples, n_classes) + p = multinomial_loss.base_loss.predict_proba(raw_prediction) + assert_allclose(np.sum(p, axis=1), 1) # consequence of coef.sum(axis=0) = 0 + + # Extended vectors for 3 classes: + # For y_ext and p_ext, we append the vectors for different classes one after the + # other. + y_ext = np.r_[y == 0, y == 1, y == 2] + p_ext = p.ravel(order="F") + coef_ext = coef.ravel(order="C") + sw_ext = np.tile(sw, 3) + # Extended matrices for 3 classes: + # X_ext = [X, 0, 0] W = [W00, W01, W02] + # [0, X, 0] [W10, W11, W12] + # [0, 0, X] [W20, W21, W22] + # X_ext.shape = (n_classes * n_samples, n_classes * n_features) + # W.shape = (n_classes * n_samples, n_classes * n_samples) + # Each Wij is a diagonal matrix with elements pi * (delta_ij - pj). + X_ext = linalg.block_diag(Xi, Xi, Xi) + W00 = np.diag(p[:, 0] * (1 - p[:, 0])) + W11 = np.diag(p[:, 1] * (1 - p[:, 1])) + W22 = np.diag(p[:, 2] * (1 - p[:, 2])) + W01 = np.diag(-p[:, 0] * p[:, 1]) + W02 = np.diag(-p[:, 0] * p[:, 2]) + W12 = np.diag(-p[:, 1] * p[:, 2]) + W_ext = np.block([[W00, W01, W02], [W01, W11, W12], [W02, W12, W22]]) / sw_sum + g_ext = (p_ext - y_ext) / sw_sum # pointwise gradient + if with_sample_weight: + W_ext *= sw_ext[:, None] + g_ext *= sw_ext + G_ext = X_ext.T @ g_ext # full gradient + H_ext = X_ext.T @ W_ext @ X_ext # full hessian + + assert_allclose(g_ext.reshape((n_samples, n_classes), order="F"), sol.g) + + # Note that both X_ext @ v and A @ v need v = coef.ravel(order="C") for + # coef.shape = (n_classes, n_dof). 
+ + # v is like a coefficient + v = rng.standard_normal(size=(n_classes, n_dof)) + # u is a test vector for A', i.e. we test A.T @ u + u = rng.standard_normal(size=(n_samples + n_features, n_classes)) + + # Check that A is a square root of H, i.e. H = A'A. + H_ext_v = H_ext @ v.ravel(order="C") + A_v = A @ v.ravel(order="C") + At_A_v = A.T @ A_v + assert_allclose(At_A_v, H_ext_v) + + # Check that A'b = -G + assert_allclose(A.T @ b, -G_ext) + + # Check that A is constructed from LDL decomposition. + LDL = Multinomial_LDL_Decomposition(proba=p) + D_Lt_X_v = LDL.sqrt_D_Lt_matmul(Xi @ v.T) / np.sqrt(sw_sum) + if with_sample_weight: + D_Lt_X_v *= np.sqrt(sw)[:, None] + assert_allclose(A_v.reshape((-1, n_classes), order="F")[:n_samples, :], D_Lt_X_v) + assert_allclose(A_v[n_classes * n_samples :], 0) + assert_allclose( + A_v.reshape((-1, n_classes), order="F")[:n_samples, :], D_Lt_X_v, atol=1e-8 + ) + + # Check LDL decomposition of W explicitly. + L, D, perm = linalg.ldl(W_ext, lower=True) + # D should be diagonal (and not block diagonal) and non-negative. + assert_allclose(np.diag(np.diag(D)), D) + assert np.all(D >= -1e-15) + D[D < 1e-15] = 0 # Tiny values would be zero in exact arithmetic. + assert_allclose(L @ D @ L.T, W_ext) + D_Lt_X = np.sqrt(D) @ L.T @ X_ext + assert_allclose(D_Lt_X.T @ D_Lt_X, H_ext) + assert_allclose(D_Lt_X @ v.ravel(order="C"), D_Lt_X_v.ravel(order="F"), atol=1e-8) + + # Construct matrix A explicitly. Note: no intercept, penalty is zero. + A_ext = np.r_[D_Lt_X, np.zeros((n_classes * n_dof, n_classes * n_dof))] + assert_allclose(A_ext.T @ A_ext, H_ext) + D_inv = np.sqrt(D) + D_inv[D_inv > 0] = 1 / D_inv[D_inv > 0] + D_inv[D_inv <= 0] = 0 + assert_allclose(L @ np.sqrt(D) @ D_inv @ linalg.solve(L, g_ext), g_ext) + assert_allclose(X_ext.T @ L @ np.sqrt(D) @ D_inv @ linalg.solve(L, g_ext), G_ext) + # Note that A_ext needs slicing u in a way not possible with ravel, i.e. + # treat u[:n_samples * n_classes, :] and u[n_samples * n_classes:, :] differently. + assert_allclose( + A_ext[: n_classes * n_samples, :].T @ u[:n_samples, :].ravel(order="F"), + A.T @ u.ravel(order="F"), + ) + + # Construct b explicitly. + b_ext = np.r_[-D_inv @ linalg.solve(L, g_ext), np.zeros(n_dof * n_classes)] + assert_allclose( + b_ext[: n_samples * n_classes], + b.reshape((-1, n_classes), order="F")[:n_samples, :].ravel(order="F"), + atol=1e-15, + ) + assert_allclose(A_ext.T @ b_ext, -G_ext) + + # Check equivalence of linear equation H @ x = H_v to least squares problem + # ||D_Lt_X @ x - D_Lt_X_v||. Note that H (unpenalized with all classes) is + # singular and we use `lstsq` instead of `solve`. + res = linalg.lstsq(H_ext, H_ext_v)[0].reshape((n_classes, -1)) + assert_allclose(res, v - np.mean(v, axis=0)) + res = linalg.lstsq(D_Lt_X, D_Lt_X_v.ravel(order="F"))[0].reshape((n_classes, -1)) + assert_allclose(res - np.mean(res, axis=0), v - np.mean(v, axis=0)) + + # Check Newton step, i.e. H @ x = -G. + # Here, we need to add a penalty term. Otherwise, the linear system would be + # singular. 
+ alpha = 1e-4 + sol = NewtonLSMRSolver( + coef=coef, + linear_loss=multinomial_loss, + l2_reg_strength=alpha, + ) + sol.setup(X=X, y=y, sample_weight=sw) + sol.update_gradient_hessian(X=X, y=y, sample_weight=sw) + A, b = sol.compute_A_b(X=X, y=y, sample_weight=sw) + pen_ext = alpha * np.identity(n_classes * n_dof) + if fit_intercept: + pen_ext[:, n_dof - 1 :: n_dof] = 0 + assert_allclose( + A.T @ A @ v.ravel(order="C"), (H_ext + pen_ext) @ v.ravel(order="C") + ) + assert_allclose(A.T @ b, -(G_ext + pen_ext @ coef_ext)) + + # Check Newton step. + # Note: This works despite the fact that W=LDL is a singular matrix and therefore + # D has some zeros on the diagonal and is not strictly invertible. But this does + # not seem to matter. + if fit_intercept: + with warnings.catch_warnings(): + # Due to the intercept term not being penalized, the linear system might + # still be singular. + warnings.simplefilter("ignore", scipy.linalg.LinAlgWarning) + res1 = linalg.solve(H_ext + pen_ext, -(G_ext + pen_ext @ coef_ext)).reshape( + (n_classes, -1) + ) + # For the same reasons, the intercept might need class centering. + res1[:, -1] -= np.mean(res1[:, -1]) + else: + res1 = linalg.solve(H_ext + pen_ext, -(G_ext + pen_ext @ coef_ext)).reshape( + (n_classes, -1) + ) + # sum over classes = 0 + assert_allclose(res1.sum(axis=0), 0, atol=5e-9) + assert_allclose((H_ext + pen_ext) @ res1.ravel(), -(G_ext + pen_ext @ coef_ext)) + assert_allclose(A.T @ A @ res1.ravel(order="C"), A.T @ b) + res2 = lsmr(A, b, maxiter=(n_features + n_samples) * n_classes, show=True) + res2 = res2[0].reshape((n_classes, -1)) + assert_allclose(res1, res2) + + +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("l2_reg_strength", [0, 5.5]) +def test_NewtonLSMRSolver_multinomial_on_binary_problem( + fit_intercept, l2_reg_strength, global_random_seed +): + """Test NewtonLSMRSolver multinomial case on a binary problem.""" + n_samples, n_features, n_classes = 100, 3, 2 + rng = np.random.RandomState(global_random_seed) + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_classes=n_classes, + n_informative=n_features - 1 * 0, + n_redundant=1 * 0, + flip_y=0.1, + random_state=rng, + ) + y = y.astype(float) + + # Note that the factor of 2 for BinomialRegressor is to adjust for the + # parametrization difference to the multinomial loss. 
+ bin = BinomialRegressor( + fit_intercept=fit_intercept, + alpha=l2_reg_strength / 2, + solver="newton-cholesky", + tol=1e-8, + ).fit(X, y) + bin_loss = LinearModelLoss( + base_loss=HalfBinomialLoss(), + fit_intercept=fit_intercept, + ) + + linear_loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), + fit_intercept=fit_intercept, + ) + sol = NewtonLSMRSolver( + coef=linear_loss.init_zero_coef(X), + linear_loss=linear_loss, + l2_reg_strength=l2_reg_strength, + tol=1e-8, + verbose=0, + ) + sol.solve(X, y, None) + + assert_allclose(np.mean(sol.coef, axis=0), 0, atol=1e-13) + if fit_intercept: + coef_bin = np.r_[bin.coef_, bin.intercept_] + else: + coef_bin = bin.coef_ + assert_allclose( + linear_loss.loss(coef=sol.coef, X=X, y=y, l2_reg_strength=l2_reg_strength), + bin_loss.loss(coef=coef_bin, X=X, y=y, l2_reg_strength=l2_reg_strength / 2), + ) + assert_allclose(sol.coef[1, :], coef_bin / 2, rtol=5e-6) diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index e8c1466b30623..1e96e687aac6e 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -2,6 +2,8 @@ Loss functions for linear models with raw_prediction = X @ coef """ +import warnings + import numpy as np from scipy import sparse @@ -19,7 +21,9 @@ class LinearModelLoss: loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept) + 1/2 * l2_reg_strength * ||coef||_2^2 - with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i. + with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i. For the + multiclass case, the squared L2 norm is replaced by the squared Frobenius norm, + which equals the L2 norm of the flattend/ravelled coefficient array. Gradient and hessian, for simplicity without intercept, are:: @@ -644,6 +648,10 @@ def hessp(s): # # See also https://github.com/scikit-learn/scikit-learn/pull/3646#discussion_r17461411 # noqa def hessp(s): + # s.shape = (n_classes, n_features,) or (n_classes * n_features) + # It could be the coefficients (including the intercept if present). + # Returns an array of shape = (n_classes, n_dof) or (n_classes * n_dof) + # depending on the shape of s. s = s.reshape((n_classes, -1), order="F") # shape = (n_classes, n_dof) if self.fit_intercept: s_intercept = s[:, -1] @@ -670,3 +678,247 @@ def hessp(s): return grad.ravel(order="F"), hessp return grad, hessp + + +class Multinomial_LDL_Decomposition: + """A class for symbolic LDL' decomposition of multinomial hessian. + + The pointwise hessian of the multinomial loss with c = n_classes is given by + h = diag(p) - p' p or with indices + h_ij = p_i * delta_ij - p_i * p_j i and j are indices of classes. + This holds for every single point (sample). The LDL decomposition + of this 2-dim matrix is given in [1] for p_i > 0 and sum(p) <= 1 as (math indices) + h = L D L' with lower triangular L and diagonal D:: text + + q_0 = 1 + q_i = 1 - sum(p_k, k=1..c) + L_ii = 1 + for i > j: + L_ij = -p_i / q_j + D_ii = p_i * q_i / q_{i-1} + + If the p_i sum to 1, then q_c = 0 and D_cc = 0, with c = n_classes. + The inverse L^-1 is also lower triangular and given by:: text + + (L^-1)_ii = 1 + for i > j: + (L^-1)_ij = f_i = p_i / q_{i-1} = -L_{i, i-1} + + Note that with Python indices (zero-based), we have: p_i = p[:, i-1], + q_i = q[:, i-1] and we don't explicitly need q_0 = 1. + + The trick is to apply this LDL decomposition to all points (samples) at the same + time. 
This class therefore provides methods to apply products with the matrices L + and D to vectors/matrices. All objects have the 0-th dimension for n_samples and + the 1st dimention for n_classes, i.e. shape (n_samples, n_classes). + + Parameters + ---------- + proba : ndarray of shape (n_samples, n_classes) + Array of predicted probabilities per class (and sample). + + Attributes + ---------- + p : ndarray of shape (n_samples, n_classes) + Array of predicted probabilities per class (and sample). + + q_inv : ndarray of shape (n_samples, n_classes) + Helper array, inverse of q[:, j] = 1 - sum_{i=0}^j p[:, i], i.e. 1/q. + + sqrt_d : ndarray of shape (n_samples, n_classes) + Square root of the diagonal matrix D, D_ii = p_i * q_i / q_{i-1} + with q_{-1} = 1. + + proba_sum_to_1 : bool + True if propabilities p sum to 1. This is most often expected to be true. + + References + ---------- + .. [1] Kunio Tanabe and Masahiko Sagae. (1992) "An Exact Cholesky Decomposition and + the Generalized Inverse of the Variance-Covariance Matrix of the Multinomial + Distribution, with Applications" + https://doi.org/10.1111/j.2517-6161.1992.tb01875.x + """ + + def __init__(self, *, proba, proba_sum_to_1=True): + self.p = proba + q = 1 - np.cumsum(self.p, axis=1) # contiguity of p + self.proba_sum_to_1 = proba_sum_to_1 + if self.p.dtype == np.float32: + eps = 2 * np.finfo(np.float32).resolution + else: + eps = 2 * np.finfo(np.float64).resolution + if not np.allclose(q[:, -1], 0, atol=eps): + warnings.warn( + "Probabilities proba are assumed to sum to 1, but they don't.", + UserWarning, + ) + if self.proba_sum_to_1: + # If np.sum(p, axis=1) = 1 then q[:, -1] = d[:, -1] = 0. + q[:, -1] = 0.0 + # One might not need all classes for p to sum to 1, so we detect and + # correct all values close to zero. + q[q <= eps] = 0.0 + self.p[self.p <= eps] = 0.0 + d = self.p * q + # From now on, q is always used in the denominator. We handle q == 0 by + # setting q to 1 whenever q == 0 such that a division of q is a no-op in this + # case. + q[q == 0] = 1 + # And we use the inverse of q, 1/q. + self.q_inv = 1 / q + if self.proba_sum_to_1: + # If q_{i - 1} = 0, then also q_i = 0. + d[:, 1:-1] *= self.q_inv[:, :-2] # d[:, -1] = 0 anyway. + else: + d[:, 1:] *= self.q_inv[:, :-1] + self.sqrt_d = np.sqrt(d) + + def sqrt_D_Lt_matmul(self, x): + """Compute sqrt(D) L' x from the multinomial LDL' decomposition. + + L' is the transpose of L, Lij is the i-th row and j-th column of L: + Lij = -p[:, i] / q[:, j] + + For n_classes = 4, L' looks like:: text + + L' = (1 L10 L20 L30) + (0 1 L21 L31) + (0 0 1 L32) + (0 0 0 1) + + The carried out operation is matmul over n_classes (1st dimention) and + element-wise multiplication over n_samples (0th dimension): + + x_ij = sum_k sqrt_d_{i, j} L'_{i, j, k} x_{i, k} + + Parameters + ---------- + x : ndarray of shape (n_samples, n_classes) + This array is overwritten with the result. + + Return + ------ + x : ndarray of shape (n_samples, n_classes) + Input array x, filled with the result. + """ + n_classes = self.p.shape[1] + # precompute + px = np.einsum( + "ij,ij->i", + self.p[:, 1:], + x[:, 1:], + order="A", + ) + for i in range(0, n_classes - 1): # row i + # L_ij = -p_i / q_j, we need transpose L' + # for j in range(i + 1, n_classes): # column j + # x[:, i] -= self.p[:, j] / self.q[:, i] * x[:, j] + # The following is the same but faster. 
+ # px = np.einsum( + # "ij,ij->i", + # self.p[:, i + 1 : n_classes], + # x[:, i + 1 : n_classes], + # order="A", + # ) + # And using precomputed px: + x[:, i] -= px * self.q_inv[:, i] + if i < n_classes - 2: + px -= self.p[:, i + 1] * x[:, i + 1] + x *= self.sqrt_d + return x + + def L_sqrt_D_matmul(self, x): + """Compute L sqrt(D) x from the multinomial LDL' decomposition. + + L is lower triangular and given by Lij = -p[:, i] / q[:, j] + + For n_classes = 4, L looks like:: text + + L = (1 0 0 0) + (L10 1 0 0) + (L20 L21 1 0) + (L30 L31 L32 1) + + The carried out operation is matmul over n_classes (1st dimention) and + element-wise multiplication over n_samples (0th dimension): + + x_ij = sum_k L'_{i, j, k} sqrt_d_{i, k} x_{i, k} + + Parameters + ---------- + x : ndarray of shape (n_samples, n_classes) + This array is overwritten with the result. + + Return + ------ + x : ndarray of shape (n_samples, n_classes) + Input array x, filled with the result. + """ + n_classes = self.p.shape[1] + x *= self.sqrt_d + # precompute + qx = np.einsum("ij,ij->i", self.q_inv[:, :-1], x[:, :-1], order="A") + for i in range(n_classes - 1, 0, -1): # row i + # L_ij = -p_i / q_j + # for j in range(0, i): # column j + # x[:, i] -= self.p[:, i] / self.q[:, j] * x[:, j] + # The following is the same but faster. + # qx = np.einsum("ij,ij->i", self.q_inv[:, :i], x[:, :i], order="A") + # And using precomputed qx: + x[:, i] -= qx * self.p[:, i] + if i > 1: + qx -= self.q_inv[:, i - 1] * x[:, i - 1] + return x + + def inverse_L_sqrt_D_matmul(self, x): + """Compute 1/sqrt(D) L^(-1) x from the multinomial LDL' decomposition. + + L^(-1) is again lower triangular and given by: + L^(-1)_ij = f[:, i] = p[:, i] / q[:, i-1] + + For n_classes = 4, L^(-1) looks like:: text + + L^(-1) = (1 0 0 0) + (f1 1 0 0) + (f2 f2 1 0) + (f3 f3 f3 1) + + Note that (L sqrt(D))^(-1) = 1/sqrt(D) L^(-1) + + Parameters + ---------- + x : ndarray of shape (n_samples, n_classes) + This array is overwritten with the result. + + Return + ------ + x : ndarray of shape (n_samples, n_classes) + Input array x, filled with the result. + """ + n_classes = self.p.shape[1] + x_sum = np.sum(x[:, :-1], axis=1) # precomputation + for i in range(n_classes - 1, 0, -1): # row i, here i > 0. + fj = self.p[:, i] * self.q_inv[:, i - 1] + # for i = 0 we would set: fj = self.p[:, i] + + # for j in range(0, i): # column j + # x[:, i] += fj * x[:, j] + # The following is the same but faster. + # x[:, i] += fj * np.sum(x[:, :i], axis=1) + # Using precomputation + x[:, i] += fj * x_sum + if i > 1: + x_sum -= x[:, i - 1] + if self.proba_sum_to_1: + # x[:, :-1] /= self.sqrt_d[:, :-1] + mask = self.sqrt_d[:, :-1] == 0 + x[:, :-1] *= (~mask) / (self.sqrt_d[:, :-1] + mask) + # Important Note: + # Strictly speaking, the inverse of D does not exist. + # We use 0 as the inverse of 0 and just set: + # x[:, -1] = 0 + x[:, -1] = 0 + else: + x /= self.sqrt_d + return x diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index a8ecc29715886..29130cd00ba37 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -51,7 +51,7 @@ check_is_fitted, ) from ._base import BaseEstimator, LinearClassifierMixin, SparseCoefMixin -from ._glm.glm import NewtonCholeskySolver +from ._glm.glm import NEWTON_SOLVER from ._linear_loss import LinearModelLoss from ._sag import sag_solver @@ -165,11 +165,11 @@ def _logistic_regression_path( where ``g_i`` is the i-th component of the gradient. 
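[Editor's note] The closed-form factors quoted in the Multinomial_LDL_Decomposition docstring and the product helpers defined above can be sanity-checked with a short standalone script. This is an illustrative sketch, not part of the diff: the first half needs only numpy and verifies the stated L and D for a single sample; the second half assumes a build of this PR branch so that the private helper class is importable from sklearn.linear_model._linear_loss.

# Part 1 (numpy only): L D L' reproduces h = diag(p) - p p' for one sample.
import numpy as np
from numpy.testing import assert_allclose

p = np.array([0.5, 0.3, 0.2])            # one sample, probabilities sum to 1
H = np.diag(p) - np.outer(p, p)          # pointwise multinomial hessian
q = 1.0 - np.cumsum(p)                   # q_i = 1 - sum(p_k, k<=i)
q_prev = np.r_[1.0, q[:-1]]              # q_{i-1}, with q_0 = 1 in math indices
L = np.eye(3)
for i in range(3):
    for j in range(i):
        L[i, j] = -p[i] / q[j]           # L_ij = -p_i / q_j for i > j
D = np.diag(p * q / q_prev)              # D_ii = p_i * q_i / q_{i-1}
assert_allclose(L @ D @ L.T, H, atol=1e-12)

# Part 2 (this branch only): sqrt_D_Lt_matmul acts as a matrix square root of
# the pointwise hessian, i.e. ||sqrt(D) L' x||^2 == x' h x for every sample.
from sklearn.linear_model._linear_loss import Multinomial_LDL_Decomposition

rng = np.random.RandomState(0)
proba = rng.uniform(size=(5, 4))
proba /= proba.sum(axis=1, keepdims=True)
LDL = Multinomial_LDL_Decomposition(proba=proba)
x = rng.standard_normal(size=(5, 4))
C = LDL.sqrt_D_Lt_matmul(x.copy())       # operates in place, hence the copy
for i in range(5):
    H_i = np.diag(proba[i]) - np.outer(proba[i], proba[i])
    assert_allclose(C[i] @ C[i], x[i] @ H_i @ x[i], atol=1e-12)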
verbose : int, default=0 - For the liblinear and lbfgs solvers set verbose to any positive - number for verbosity. + For most solver, except for 'newton-cg', set verbose to any positive number + for verbosity. - solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ - default='lbfgs' + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'newton-lsmr', \ + 'sag', 'saga'}, default='lbfgs' Numerical solver to use. coef : array-like of shape (n_features,), default=None @@ -318,7 +318,7 @@ def _logistic_regression_path( w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype) mask = y == pos_class y_bin = np.ones(y.shape, dtype=X.dtype) - if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + if solver in ["lbfgs", "newton-cg", "newton-cholesky", "newton-lsmr"]: # HalfBinomialLoss, used for those solvers, represents y in [0, 1] instead # of in [-1, 1]. mask_classes = np.array([0, 1]) @@ -335,9 +335,9 @@ def _logistic_regression_path( sample_weight *= class_weight_[le.fit_transform(y_bin)] else: - if solver in ["sag", "saga", "lbfgs", "newton-cg"]: - # SAG, lbfgs and newton-cg multinomial solvers need LabelEncoder, - # not LabelBinarizer, i.e. y as a 1d-array of integers. + if solver in ["sag", "saga", "lbfgs", "newton-cg", "newton-lsmr"]: + # SAG, lbfgs, newton-cg and newton-lsmr multinomial solvers need + # LabelEncoder, not LabelBinarizer, i.e. y as a 1d-array of integers. # LabelEncoder also saves memory compared to LabelBinarizer, especially # when n_classes is large. le = LabelEncoder() @@ -360,7 +360,7 @@ def _logistic_regression_path( # C * sum(pointwise_loss) + penalty # instead of (as LinearModelLoss does) # mean(pointwise_loss) + 1/C * penalty - if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + if solver in ["lbfgs", "newton-cg", "newton-cholesky", "newton-lsmr"]: # This needs to be calculated after sample_weight is multiplied by # class_weight. It is even tested that passing class_weight is equivalent to # passing sample_weights according to class_weight. @@ -406,18 +406,19 @@ def _logistic_regression_path( w0[:, : coef.shape[1]] = coef if multi_class == "multinomial": + target = Y_multi + if solver in ["lbfgs", "newton-cg", "newton-lsmr"]: + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=classes.size), + fit_intercept=fit_intercept, + ) if solver in ["lbfgs", "newton-cg"]: # scipy.optimize.minimize and newton-cg accept only ravelled parameters, # i.e. 1d-arrays. LinearModelLoss expects classes to be contiguous and # reconstructs the 2d-array via w0.reshape((n_classes, -1), order="F"). # As w0 is F-contiguous, ravel(order="F") also avoids a copy. 
w0 = w0.ravel(order="F") - loss = LinearModelLoss( - base_loss=HalfMultinomialLoss(n_classes=classes.size), - fit_intercept=fit_intercept, - ) - target = Y_multi - if solver == "lbfgs": + if solver == "lbfgs": func = loss.loss_gradient elif solver == "newton-cg": func = loss.loss @@ -438,7 +439,7 @@ def _logistic_regression_path( func = loss.loss grad = loss.gradient hess = loss.gradient_hessian_product # hess = [gradient, hessp] - elif solver == "newton-cholesky": + elif solver in ("newton-cholesky", "newton-lsmr"): loss = LinearModelLoss( base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept ) @@ -479,9 +480,9 @@ def _logistic_regression_path( w0, n_iter_i = _newton_cg( hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol ) - elif solver == "newton-cholesky": + elif solver in ("newton-cholesky", "newton-lsmr"): l2_reg_strength = 1.0 / (C * sw_sum) - sol = NewtonCholeskySolver( + sol = NEWTON_SOLVER[solver]( coef=w0, linear_loss=loss, l2_reg_strength=l2_reg_strength, @@ -656,10 +657,11 @@ def _log_reg_scoring_path( through the fit method) if sample_weight is specified. verbose : int - For the liblinear and lbfgs solvers set verbose to any positive - number for verbosity. + For most solvers, except for 'newton-cg', set verbose to any positive number + for verbosity. - solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'} + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'newton-lsmr', \ + 'sag', 'saga'} Decides which solver to use. penalty : {'l1', 'l2', 'elasticnet'} @@ -892,16 +894,16 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary <random_state>` for details. - solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ - default='lbfgs' + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'newton-lsmr', \ + 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - For small datasets, 'liblinear' is a good choice, whereas 'sag' - and 'saga' are faster for large ones; - - For multiclass problems, only 'newton-cg', 'sag', 'saga' and - 'lbfgs' handle multinomial loss; + and 'saga' are faster for large ones. + - For multiclass problems, only 'newton-cg', 'newton-lsmr', 'sag', 'saga' + and 'lbfgs' handle multinomial loss. - 'liblinear' is limited to one-versus-rest schemes. - 'newton-cholesky' is a good choice for `n_samples` >> `n_features`, especially with one-hot encoded categorical features with rare @@ -909,6 +911,15 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): one-versus-rest reduction for multiclass classification. Be aware that the memory usage of this solver has a quadratic dependency on `n_features` because it explicitly computes the Hessian matrix. + - 'newton-lsmr' is a good choice for multiclass classification. + Uses Newton-Raphson steps formulated as iteratively reweighted least + squares (IRLS), which is solved by LSMR. In contrast to `newton-cholesky`, + this solver does not explicitly materialize the Hessian matrix but + instead leverages knowledge about its structure to incrementally solve + the least squares problems via a series of matrix-vector operations + where the matrices have size `(n_samples, n_features)`. + Additionally, this is numerically more stable for ill-conditioned X + compared to `newton-cholesky`. + + .. 
warning:: The choice of the algorithm depends on the penalty chosen. @@ -918,6 +929,7 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): - 'liblinear' - ['l1', 'l2'] - 'newton-cg' - ['l2', None] - 'newton-cholesky' - ['l2', None] + - 'newton-lsmr' - ['l2', None] - 'sag' - ['l2', None] - 'saga' - ['elasticnet', 'l1', 'l2', None] @@ -940,6 +952,8 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2 newton-cholesky solver. + .. versionadded:: 1.3 + newton-lsmr solver. max_iter : int, default=100 Maximum number of iterations taken for the solvers to converge. @@ -958,8 +972,8 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): Default changed from 'ovr' to 'auto' in 0.22. verbose : int, default=0 - For the liblinear and lbfgs solvers set verbose to any positive - number for verbosity. + For most solver, except for 'newton-cg', set verbose to any positive number + for verbosity. warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as @@ -1093,7 +1107,15 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): "random_state": ["random_state"], "solver": [ StrOptions( - {"lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"} + { + "lbfgs", + "liblinear", + "newton-cg", + "newton-cholesky", + "newton-lsmr", + "sag", + "saga", + } ) ], "max_iter": [Interval(Integral, 0, None, closed="left")], @@ -1282,7 +1304,7 @@ def fit(self, X, y, sample_weight=None): # and multinomial multiclass classification and use joblib only for the # one-vs-rest multiclass case. if ( - solver in ["lbfgs", "newton-cg", "newton-cholesky"] + solver in ["lbfgs", "newton-cg", "newton-cholesky", "newton-lsmr"] and len(classes_) == 1 and effective_n_jobs(self.n_jobs) == 1 ): @@ -1472,16 +1494,16 @@ class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstima that can be used, look at :mod:`sklearn.metrics`. The default scoring option used is 'accuracy'. - solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ - default='lbfgs' + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'newton-lsmr', \ + 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - For small datasets, 'liblinear' is a good choice, whereas 'sag' - and 'saga' are faster for large ones; - - For multiclass problems, only 'newton-cg', 'sag', 'saga' and - 'lbfgs' handle multinomial loss; + and 'saga' are faster for large ones. + - For multiclass problems, only 'newton-cg', 'newton-lsmr', 'sag', 'saga' + and 'lbfgs' handle multinomial loss. - 'liblinear' might be slower in :class:`LogisticRegressionCV` because it does not handle warm-starting. 'liblinear' is limited to one-versus-rest schemes. @@ -1500,6 +1522,7 @@ class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstima - 'liblinear' - ['l1', 'l2'] - 'newton-cg' - ['l2'] - 'newton-cholesky' - ['l2'] + - 'newton-lsmr' - ['l2'] - 'sag' - ['l2'] - 'saga' - ['elasticnet', 'l1', 'l2'] @@ -1514,6 +1537,8 @@ class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstima SAGA solver. .. versionadded:: 1.2 newton-cholesky solver. + .. versionadded:: 1.3 + newton-lsmr solver tol : float, default=1e-4 Tolerance for stopping criteria. 
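[Editor's note] A hedged end-to-end usage sketch of the solver option documented above. It assumes an installation built from this PR branch, since solver="newton-lsmr" is not available in released scikit-learn versions; the dataset and hyperparameters are arbitrary illustration choices.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
# Multinomial fit with the LSMR-based Newton solver added by this PR.
clf = LogisticRegression(solver="newton-lsmr", C=10.0, tol=1e-6, max_iter=100)
clf.fit(X, y)
print(clf.coef_.shape)  # (3, 4): one coefficient row per class
print(clf.score(X, y))

LogisticRegressionCV accepts the same solver string, as reflected by the docstring changes in this hunk.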
@@ -1542,8 +1567,8 @@ class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstima for more details. verbose : int, default=0 - For the 'liblinear', 'sag' and 'lbfgs' solvers set verbose to any - positive number for verbosity. + For most solvers, except for 'newton-cg', set verbose to any positive number + for verbosity. refit : bool, default=True If set to True, the scores are averaged across all folds, and the diff --git a/sklearn/linear_model/tests/test_linear_loss.py b/sklearn/linear_model/tests/test_linear_loss.py index 230966db1ceaf..1a343493a9008 100644 --- a/sklearn/linear_model/tests/test_linear_loss.py +++ b/sklearn/linear_model/tests/test_linear_loss.py @@ -9,6 +9,7 @@ import pytest from numpy.testing import assert_allclose from scipy import linalg, optimize +from scipy.special import logit from sklearn._loss.loss import ( HalfBinomialLoss, @@ -16,7 +17,10 @@ HalfPoissonLoss, ) from sklearn.datasets import make_low_rank_matrix -from sklearn.linear_model._linear_loss import LinearModelLoss +from sklearn.linear_model._linear_loss import ( + LinearModelLoss, + Multinomial_LDL_Decomposition, +) from sklearn.utils.extmath import squared_norm from sklearn.utils.fixes import CSR_CONTAINERS @@ -353,5 +357,293 @@ def test_multinomial_coef_shape(fit_intercept): assert_allclose(g_r, g1_r) assert_allclose(g_r, g2_r) - assert_allclose(g, g_r.reshape(loss.base_loss.n_classes, -1, order="F")) - assert_allclose(h, h_r.reshape(loss.base_loss.n_classes, -1, order="F")) + assert_allclose(g, g_r.reshape((loss.base_loss.n_classes, -1), order="F")) + assert_allclose(h, h_r.reshape((loss.base_loss.n_classes, -1), order="F")) + + +def test_multinomial_identifiability_properties(global_random_seed): + """Test that multinomial LinearModelLoss fulfills identifiability properties. + + In our symmetrical overdetermined parametrization, we always expect + np.sum(coef, axis=0) = 0. But if we add the same value cj to all classes, i.e. + coef[:, j] += cj, certain properties hold: + 1. The unpenalized loss is invariant. + 2. The unpenalized Hessian @ c gives zero. + 3. The unpenalized Gradient @ c gives zero. + """ + n_samples, n_features, n_classes = 30, 4, 6 + l2_reg_strength = 0 + rng = np.random.RandomState(global_random_seed) + X = rng.standard_normal((n_samples, n_features)) + y = rng.randint(low=0, high=n_classes, size=(n_samples)).astype(float) + coef = rng.standard_normal((n_classes, n_features)) + coef -= np.mean(coef, axis=0) + assert_allclose(np.mean(coef, axis=0), 0, atol=1e-15) + + multinomial_loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), + fit_intercept=False, + ) + gradient, hessp = multinomial_loss.gradient_hessian_product( + coef=coef, + X=X, + y=y, + l2_reg_strength=l2_reg_strength, + ) + + # Construct a coef-like array with same values for all classes, e.g.
+ # c = [[1, 2, 3], + # [1, 2, 3]] + c = np.tile(rng.standard_normal(n_features), (n_classes, 1)) + assert_allclose(hessp(c), 0, atol=1e-14) + assert_allclose(gradient.ravel() @ c.ravel(), 0, atol=1e-13) + + loss1 = multinomial_loss.loss(coef=coef, X=X, y=y, l2_reg_strength=l2_reg_strength) + loss2 = multinomial_loss.loss( + coef=coef + c, X=X, y=y, l2_reg_strength=l2_reg_strength + ) + assert_allclose(loss1, loss2) + + +def test_multinomial_LDL_decomposition_operates_inplace(global_random_seed): + n_samples, n_classes = 3, 5 + rng = np.random.RandomState(global_random_seed) + p = rng.uniform(low=0, high=1, size=(n_samples, n_classes)) + p /= np.sum(p, axis=1)[:, None] + assert_allclose(np.sum(p, axis=1), np.ones(shape=n_samples)) + + LDL = Multinomial_LDL_Decomposition(proba=p) + v = rng.standard_normal(size=(n_samples, n_classes)) + res = LDL.sqrt_D_Lt_matmul(v) + assert res is v + + v = rng.standard_normal(size=(n_samples, n_classes)) + res = LDL.L_sqrt_D_matmul(v) + assert res is v + + v = rng.standard_normal(size=(n_samples, n_classes)) + res = LDL.inverse_L_sqrt_D_matmul(v) + assert res is v + + +def test_multinomial_LDL_decomposition_binomial_single_point(): + """Test LDL' decomposition of multinomial hessian for simple cases. + + For the binomial case, we have p0 = 1 - p1 + LDL = [p0 * (1 - p0), -p0 * p1] = p0 * (1 - p0) * [1, -1] + [ -p0 * p1, p1 * (1 - p1)] [-1, 1] + + L = [ 1, 0] D = [p0 * (1 - p0), 0] + [-1, 1] [ 0, 0] + """ + p0, p1 = 0.2, 0.8 + p = np.array([[p0, p1]]) + # Note that LDL sets q = 1 whenever q = 0 for easier handling of divisions by zero. + # We compare those values to "zero", to make that clear. + zero = 1 + + LDL = Multinomial_LDL_Decomposition(proba=p) + assert_allclose(1 / LDL.q_inv[0, :], [1 - p0, zero]) + assert_allclose(LDL.sqrt_d[0, :] ** 2, [p0 * (1 - p0), 0]) + + # C = diag(D) L' x with x = [[1, 0]] (n_samples=1, n_classes=2) + C = LDL.sqrt_D_Lt_matmul(np.array([[1.0, 0.0]])) + assert_allclose(C[0, :], [np.sqrt(p0 * (1 - p0)), 0]) + + # C = diag(D) L' x with x = [[0, 1]] (n_samples=1, n_classes=2) + C = LDL.sqrt_D_Lt_matmul(np.array([[0.0, 1.0]])) + assert_allclose(C[0, :], [-np.sqrt(p1 * (1 - p1)), 0]) + + # Test with hessian product (hessp) of LinearModelLoss with X = [[1]] and coef such + # that probabilities are again (p0, p1). + # Hessian = X' LDL X + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=2), + fit_intercept=False, + ) + # Note that y has no effect on the hessian. + coef = 0.5 * np.array([[logit(p0)], [logit(p1)]]) # tested below + X = np.array([[1.0]]) + raw = X @ coef.T # raw.shape = (n_samples, n_classes) = (1, 2) + assert_allclose(loss.base_loss.predict_proba(raw), [[p0, p1]]) + C = LDL.sqrt_D_Lt_matmul(raw.copy()) + grad, hessp = loss.gradient_hessian_product( + coef=coef, + X=X, + y=np.array([0.0]), + l2_reg_strength=0.0, + ) + # Note: hessp(coef).shape = (n_classes, n_features) = (2, 1) + assert_allclose( + hessp(coef), + p0 + * (1 - p0) + * np.array([raw[0, 0] - raw[0, 1], -raw[0, 0] + raw[0, 1]])[:, None], + ) + assert_allclose(C @ C.T, coef.T @ hessp(coef)) + + +def test_multinomial_LDL_decomposition_3_classes(): + """Test LDL' decomposition of multinomial hessian for 3 classes and 2 points. 
+ + For n_classes = 3 and n_samples = 2, we have + p0 = [p0_0, p0_1] + p1 = [p1_0, p1_1] + p2 = [p2_0, p2_1] + and with 2 x 2 diagonal subblocks + LDL = [p0 * (1-p0), -p0 * p1, -p0 * p2] + [ -p0 * p1, p1 * (1-p1), -p1 * p2] + [ -p0 * p2, -p1 * p2, p2 * (1-p2)] + + L = [ 1, 0, 0] + [-p1 / (1-p0), 1, 0] + [-p2 / (1-p0), -p2 / (1-p0-p1), 1] + + D = [p0 * (1-p0), 0, 0] + [ 0, p1 * (1-p0-p1) / (1-p0), 0] + [ 0, 0, 0] + """ + n = 2 * 3 # n_samples * n_classes + p0 = np.array([0.7, 0.6]) + p1 = np.array([0.2, 0.25]) + p2 = np.array([0.1, 0.15]) + one = np.ones(2) + zero = np.zeros(2) + p0d, p1d, p2d, oned, zerod = ( + np.diag(p0), + np.diag(p1), + np.diag(p2), + np.diag(one), + np.diag(zero), + ) + H = np.block( + [ + [p0d * (oned - p0d), -p0d * p1d, -p0d * p2d], + [-p0d * p1d, p1d * (oned - p1d), -p1d * p2d], + [-p0d * p2d, -p1d * p2d, p2d * (oned - p2d)], + ] + ) + L = np.block( + [ + [oned, zerod, zerod], + [np.diag(-p1 / (one - p0)), oned, zerod], + [np.diag(-p2 / (one - p0)), np.diag(-p2 / (one - p0 - p1)), oned], + ] + ) + D = np.diag(np.r_[p0 * (1 - p0), p1 * (1 - p0 - p1) / (1 - p0), zero]) + L_inv = np.block( + [ + [oned, zerod, zerod], + [np.diag(p1 / (one - p0)), oned, zerod], + [np.diag(p2 / (one - p0 - p1)), np.diag(p2 / (one - p0 - p1)), oned], + ] + ) + D_inv = np.zeros_like(D) + D_inv[D > 0] = 1 / np.sqrt(D[D > 0]) + + lu, d, perm = linalg.ldl(H) + assert_allclose(lu @ d @ lu.T, H) + assert_allclose(L @ D @ L.T, H) + assert_allclose(L, lu) + assert_allclose(D, d, atol=1e-14) + assert_allclose(L_inv @ L, np.eye(n), atol=1e-14) + assert_allclose(L @ L_inv, np.eye(n), atol=1e-14) + + LDL = Multinomial_LDL_Decomposition(proba=np.c_[p0, p1, p2]) + v = 1.0 + np.arange(2 * 3) + DLt_v = LDL.sqrt_D_Lt_matmul(v.copy().reshape((2, 3), order="F")) + assert_allclose(DLt_v.ravel(order="F"), np.sqrt(D) @ L.T @ v) + LDL_v = LDL.L_sqrt_D_matmul(DLt_v.copy()) + assert_allclose(LDL_v.ravel(order="F"), H @ v) + + LD_v = LDL.L_sqrt_D_matmul(v.copy().reshape((2, 3), order="F")) + assert_allclose(LD_v.ravel(order="F"), L @ np.sqrt(D) @ v) + + x1 = LDL.inverse_L_sqrt_D_matmul(v.copy().reshape((2, 3), order="F")) + # This does not work as L @ D is singular. + # x2 = linalg.solve(L @ np.sqrt(D), v) + # We can just neglect the last class as it is redundant. + x2 = linalg.solve(L[:-2, :-2] @ np.sqrt(D[:-2, :-2]), v[:-2]) + assert_allclose(x1.ravel(order="F")[:-2], x2) + assert_allclose(x1.ravel(order="F"), D_inv @ L_inv @ v) + + # Test consistency of L_sqrt_D_matmul, sqrt_D_Lt_matmul and inverse_L_sqrt_D_matmul, + # i.e. reconstructing the matrices based on X @ unit_vector and compare with + # explicit matrices D and L. 
+ unit_vec = np.zeros(n) + for op, result in ( + (LDL.sqrt_D_Lt_matmul, np.sqrt(D) @ L.T), + (LDL.L_sqrt_D_matmul, L @ np.sqrt(D)), + (LDL.inverse_L_sqrt_D_matmul, D_inv @ L_inv), + ): + X = np.zeros((n, n)) + for i in range(n): + unit_vec[i] = 1 + X[:, i] = op(unit_vec.copy().reshape((2, 3), order="F")).ravel(order="F") + unit_vec[i] = 0 + assert_allclose(X, result) + + +def test_multinomial_LDL_decomposition(global_random_seed): + """Test LDL' decomposition of multinomial hessian.""" + n_samples, n_features, n_classes = 3, 4, 5 + rng = np.random.RandomState(global_random_seed) + X = rng.standard_normal(size=(n_samples, n_features)) + coef = rng.standard_normal(size=(n_classes, n_features)) + raw_prediction = X @ coef.T # shape = (n_samples, n_classes) + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), + fit_intercept=False, + ) + p = loss.base_loss.predict_proba(raw_prediction=raw_prediction) + assert_allclose(np.sum(p, axis=1), np.ones(shape=n_samples)) + + LDL = Multinomial_LDL_Decomposition(proba=p) + # Note that LDL sets q = 1 whenever q = 0 for easier handling of divisions by zero. + # As q[:, -1] = 0 if p sums to 1, we do not compare the last values corresponding + # to the last class. + assert_allclose(1 / LDL.q_inv[:, :-1], 1 - np.cumsum(LDL.p, axis=1)[:, :-1]) + + # C = diag(D) L' x with x = X @ coef = raw_prediction + C = LDL.sqrt_D_Lt_matmul(raw_prediction.copy()) + + # Note that y has no effect on the hessian. + grad, hessp = loss.gradient_hessian_product( + coef=coef, + X=X, + y=rng.randint(low=0, high=n_classes, size=n_samples).astype(X.dtype), + l2_reg_strength=0.0, + ) + # C.shape = (n_samples, n_classes), hessp(coef).shape = (n_classes, n_features) + # LinearModelLoss normalizes loss as 1 / n_samples * sum(loss_i), therefore, we + # need to devide C'C by 1/n_samples. + assert_allclose( + np.tensordot(C, C, axes=2) / n_samples, np.tensordot(coef, hessp(coef), axes=2) + ) + CtC = LDL.L_sqrt_D_matmul(C.copy()) + CtC = np.tensordot(raw_prediction, CtC, axes=2) + assert_allclose(CtC, np.tensordot(C, C, axes=2)) + + +def test_multinomial_LDL_inverse_sqrt_D_Lt_matmul(global_random_seed): + """Test inverse of LDL' decomposition of multinomial hessian.""" + n_samples, n_classes = 4, 5 + rng = np.random.RandomState(global_random_seed) + p = rng.uniform(low=0, high=1, size=(n_samples, n_classes)) + p /= np.sum(p, axis=1)[:, None] + LDL = Multinomial_LDL_Decomposition(proba=p) + x = rng.standard_normal(size=(n_samples, n_classes)) + # Without centering, the last class, x[:, -1] would fail in the assert below. + x -= np.mean(x, axis=1)[:, None] + + invDL_x = LDL.inverse_L_sqrt_D_matmul(x.copy()) + LD_invDL_x = LDL.L_sqrt_D_matmul(invDL_x.copy()) + assert_allclose(LD_invDL_x, x) + + +def test_multinomial_LDL_warning(): + """Test that LDL warns if probabilities do not sum to 1.""" + p = np.arange(4, dtype=float).reshape((2, 2)) + msg = "Probabilities proba are assumed to sum to 1, but they don't." 
+ with pytest.warns(UserWarning, match=msg): + Multinomial_LDL_Decomposition(proba=p, proba_sum_to_1=True) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 30625ea9f2bef..01948a7f47b46 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -49,7 +49,15 @@ LogisticRegressionCV = partial(LogisticRegressionCVDefault, random_state=0) -SOLVERS = ("lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga") +SOLVERS = ( + "lbfgs", + "liblinear", + "newton-cg", + "newton-cholesky", + "newton-lsmr", + "sag", + "saga", +) X = [[-1, 0], [0, 1], [1, 1]] Y1 = [0, 1, 1] Y2 = [2, 1, 0] @@ -165,6 +173,9 @@ def test_predict_3_classes(csr_container): LogisticRegression( C=len(iris.data), solver="newton-cholesky", multi_class="ovr" ), + LogisticRegression( + C=len(iris.data), solver="newton-lsmr", multi_class="multinomial" + ), ], ) def test_predict_iris(clf): @@ -207,12 +218,12 @@ def test_check_solver_option(LR): lr.fit(X, y) # all solvers except 'liblinear' and 'saga' - for solver in ["lbfgs", "newton-cg", "newton-cholesky", "sag"]: + for solver in ["lbfgs", "newton-cg", "newton-cholesky", "newton-lsmr", "sag"]: msg = "Solver %s supports only 'l2' or None penalties," % solver lr = LR(solver=solver, penalty="l1", multi_class="ovr") with pytest.raises(ValueError, match=msg): lr.fit(X, y) - for solver in ["lbfgs", "newton-cg", "newton-cholesky", "sag", "saga"]: + for solver in set(SOLVERS) - set(["liblinear"]): msg = "Solver %s supports only dual=False, got dual=True" % solver lr = LR(solver=solver, dual=True, multi_class="ovr") with pytest.raises(ValueError, match=msg): @@ -393,7 +404,7 @@ def test_consistency_path(): ) # test for fit_intercept=True - for solver in ("lbfgs", "newton-cg", "newton-cholesky", "liblinear", "sag", "saga"): + for solver in set(SOLVERS): Cs = [1e3] coefs, Cs, _ = f(_logistic_regression_path)( X, @@ -767,6 +778,8 @@ def test_logistic_regressioncv_class_weights(weight, class_weight, global_random clf.set_params( tol=1e-18, max_iter=10000, random_state=global_random_seed + 1 ) + elif solver == "newton-lsmr": + clf.set_params(tol=1e-6) clf.fit(X, y) assert_allclose( @@ -795,7 +808,7 @@ def test_logistic_regression_sample_weights(): assert_allclose(clf_sw_none.coef_, clf_sw_ones.coef_, rtol=1e-4) # Test that sample weights work the same with the lbfgs, - # newton-cg, newton-cholesky and 'sag' solvers + # newton-cg, newton-cholesky, newton-lsmr and sag solvers clf_sw_lbfgs = LR(**kw, tol=1e-5) clf_sw_lbfgs.fit(X, y, sample_weight=sample_weight) for solver in set(SOLVERS) - set(("lbfgs", "saga")): @@ -1331,7 +1344,7 @@ def test_saga_vs_liblinear(csr_container): @pytest.mark.parametrize("multi_class", ["ovr", "multinomial"]) @pytest.mark.parametrize( - "solver", ["liblinear", "newton-cg", "newton-cholesky", "saga"] + "solver", ["liblinear", "newton-cg", "newton-cholesky", "newton-lsmr", "saga"] ) @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS)