diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 01f0384af5c1d..c6da2843564b1 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -264,6 +264,17 @@ Changelog :mod:`sklearn.linear_model` ........................... +- |Enhancement| :class:`linear_model.LogisticRegression`, + :class:`linear_model.LogisticRegressionCV`, :class:`linear_model.GammaRegressor`, + :class:`linear_model.PoissonRegressor` and :class:`linear_model.TweedieRegressor` got + a new solver `solver="newton-lsmr"`. This is a 2nd order (Newton) optimisation + routine that uses the iterative LSMR algorithm: to find the Newton direction in each + step, the 2nd order equation is cast as a least squares problem and solved + iteratively, therefore called iteratively reweighted least squares (IRLS), via LSMR. + Due to using LSMR, only matrix-vector multiplications are used and sparse matrices + are supported as well. Especially for multiclass problems it might be worth a try. + :pr:`25462` by :user:`Christian Lorentzen `. + - |Fix| :class:`linear_model.ElasticNet`, :class:`linear_model.ElasticNetCV`, :class:`linear_model.Lasso` and :class:`linear_model.LassoCV` now explicitly don't accept large sparse data formats. :pr:`27576` by :user:`Stefanie Senger diff --git a/sklearn/linear_model/_glm/_newton_solver.py b/sklearn/linear_model/_glm/_newton_solver.py index 0b6adbe44e686..00470dd31d168 100644 --- a/sklearn/linear_model/_glm/_newton_solver.py +++ b/sklearn/linear_model/_glm/_newton_solver.py @@ -15,7 +15,7 @@ from ..._loss.loss import HalfSquaredError from ...exceptions import ConvergenceWarning from ...utils.optimize import _check_optimize_result -from .._linear_loss import LinearModelLoss +from .._linear_loss import LinearModelLoss, Multinomial_LDL_Decomposition class NewtonSolver(ABC): @@ -25,12 +25,26 @@ class NewtonSolver(ABC): iteration aims at finding the Newton step which is done by the inner solver. With Hessian H, gradient g and coefficients coef, one step solves: - H @ coef_newton = -g + H @ coef_newton = -G - For our GLM / LinearModelLoss, we have gradient g and Hessian H: + For our GLM / LinearModelLoss, we have gradient G and Hessian H: - g = X.T @ loss.gradient + l2_reg_strength * coef - H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity + G = X.T @ g + l2_reg_strength * P @ coef_old + H = X.T @ diag(h) @ X + l2_reg_strength * P + g = loss.gradient = pointwise gradient + h = loss.hessian = pointwise hessian + P = penalty matrix in 1/2 w @ P @ w, + for a pure L2 penalty without intercept it equals the identity matrix. + + stemming from the 2nd order Taylor series of the objective: + + loss(coef_old) + g @ X @ coef_newton + + 1/2 * coef_newton @ X.T @ diag(h) @ X @ coef_newton + + 1/2 * l2_reg_strength * coef @ P @ coef + = loss(coef_old) + 1/2 * l2_reg_strength * coef_old @ P @ coef_old + + G @ coef_newton + 1/2 * coef_newton @ H @ coef_newton + + where we have coef = coef_old + coef_newton. Backtracking line search updates coef = coef_old + t * coef_newton for some t in (0, 1]. @@ -138,8 +152,6 @@ def __init__( def setup(self, X, y, sample_weight): """Precomputations - If None, initializes: - - self.coef Sets: - self.raw_prediction - self.loss_value @@ -178,9 +190,10 @@ def fallback_lbfgs_solve(self, X, y, sample_weight): - self.coef - self.converged """ + # Note that LinearModelLoss expects coef.ravel(order"F") for multiclass case. 
opt_res = scipy.optimize.minimize( self.linear_loss.loss_gradient, - self.coef, + self.coef.ravel(order="F"), method="L-BFGS-B", jac=True, options={ @@ -194,6 +207,11 @@ def fallback_lbfgs_solve(self, X, y, sample_weight): ) self.n_iter_ = _check_optimize_result("lbfgs", opt_res) self.coef = opt_res.x + if self.linear_loss.base_loss.is_multiclass: + # No test case was found (yet) where NewtonLSMRSolver ends up in this code + # path. Note that NewtonCholeskySolver does not support muliclass problems. + n_classes = self.linear_loss.base_loss.n_classes + self.coef = self.coef.reshape((n_classes, -1), order="F") self.converged = opt_res.status == 0 def line_search(self, X, y, sample_weight): @@ -299,6 +317,10 @@ def line_search(self, X, y, sample_weight): self.raw_prediction = raw + def compute_d2(self, X, sample_weight): + """Compute square of Newton decrement.""" + return self.coef_newton @ self.hessian @ self.coef_newton + def check_convergence(self, X, y, sample_weight): """Check for convergence. @@ -306,6 +328,16 @@ def check_convergence(self, X, y, sample_weight): """ if self.verbose: print(" Check Convergence") + loss_value = self.linear_loss.loss( + coef=self.coef, + X=X, + y=y, + sample_weight=sample_weight, + l2_reg_strength=self.l2_reg_strength, + n_threads=self.n_threads, + ) + print(f" loss = {loss_value}.") + # Note: Checking maximum relative change of coefficient <= tol is a bad # convergence criterion because even a large step could have brought us close # to the true minimum. @@ -324,7 +356,7 @@ def check_convergence(self, X, y, sample_weight): # d = sqrt(grad @ hessian^-1 @ grad) # = sqrt(coef_newton @ hessian @ coef_newton) # See Boyd, Vanderberghe (2009) "Convex Optimization" Chapter 9.5.1. - d2 = self.coef_newton @ self.hessian @ self.coef_newton + d2 = self.compute_d2(X, sample_weight=sample_weight) if self.verbose: print(f" 2. Newton decrement {0.5 * d2} <= {self.tol}") if 0.5 * d2 > self.tol: @@ -522,3 +554,555 @@ def inner_solve(self, X, y, sample_weight): ) self.use_fallback_lbfgs_solve = True return + + +class NewtonLSMRSolver(NewtonSolver): + """LSMR based inexact Newton solver. + + The inner solver uses LSMR [1] after the Newton update is cast into the iteratively + reweighted least squares (IRLS) formulation. This means + + H @ coef_newton = -G + + with + + G = X.T @ g + l2_reg_strength * P @ coef + H = X.T @ diag(h) @ X + l2_reg_strength * P + g = loss.gradient = pointwise gradient + h = loss.hessian = pointwise hessian + P = penalty matrix in 1/2 w @ P @ w, + for a pure L2 penalty without intercept it equals the identity matrix. + + is cast as a least squares problem + + min ||A @ coef_newton - b||_2^2 + + with + + A = [ sqrt(h) * X] + [sqrt(l2_reg_strength) * sqrt(P)] + b = [ - g / sqrt(h)] + [- sqrt(l2_reg_strength) * sqrt(P) @ self.coef] + + Notes: + - A is a square root of H: H = A'A + - A and b form G: G = -A'b + - The normal equation of this least squares problem, A'A coef_newton = A'b, is + again H @ coef_newton = -G. + + After the Newton iteration, and neglecting line search, the new coefficient is + self.coef += coef_newton. + + Note that this solver can naturally deal with sparse X. + + References + ---------- + .. [1] :arxiv:`Fong & Saunders "LSMR: An iterative algorithm for sparse + least-squares problems" <1006.0758>` + See also + https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.lsmr.html + + """ # noqa: E501 + + def setup(self, X, y, sample_weight): + """Setup. 
+ + Additionally to super().setup(), also sets: + - self.g = pointwise gradient + - self.h = pointwise hessian + - self.gradient + - self.sqrt_P = sqrt(l2_reg_strength) * sqrt(P) + - self.A_norm + - self.r_norm + - self.atol + - self.lsmr_iter + - self.sw_sum + - self.sqrt_sw_sum + """ + super().setup(X=X, y=y, sample_weight=sample_weight) + ( + self.g, + self.h, + ) = self.linear_loss.base_loss.init_gradient_and_hessian( + n_samples=X.shape[0], dtype=X.dtype + ) + n_samples, n_features = X.shape + # For a general (symmetric) penalty matrix P, we can use any square root of it, + # e.g. the Cholesky decomposition. For the simple L2-penalty, we can use the + # identity matrix and handle the intercept separately. + # We use a 1-d array of shape (n_features,) instead of a 2-d diagonal array to + # save memory. We also simply omit the zero at the end for the intercept. + # For the multiclass case, we use the same 1-d array of shape (n_features,) and + # do not store n_classes copies of it. + self.sqrt_P = np.full( + shape=n_features, fill_value=np.sqrt(self.l2_reg_strength), dtype=X.dtype + ) + + # In the inner_solve with LSMR, we set atol ~ 1 / (||A|| * ||r||) with the + # Frobenius norm ||A||, see below. We track this in every iteration with + # self.A_norm and self.r_norm. For the first call of inner_solve, we need an + # initial estimation of ||A||. We assume h = 1 and get + # ||A||^2 = ||X||^2 + sqrt(l2_reg_strength) ||sqrt(P)||^2 + # if scipy.sparse.issparse(X): + # self.A_norm = scipy.sparse.linalg.norm(X) ** 2 + # else: + # self.A_norm = scipy.linalg.norm(X) ** 2 + # self.A_norm += scipy.linalg.norm(self.sqrt_P) ** 2 + # self.A_norm = np.sqrt(self.A_norm) + # + # Another, even simpler choice is ||A|| = 1 which likely understimates + # ||A|| because ||X|| > 1 is very likely for typical (e.g. standardized) X. + # Underestimating ||A|| means overestimating atol ~ 1 / ||A||, which means the + # first iteration stops earlier. This is usually a good thing! + # Note: For n_featues > n_samples and no regularization, it seems that a higher + # starting A_norm is better which forces more initial LSMR iterations. We could + # even condider to make it dependent on log(l2_reg_strength). + self.A_norm = max(1, n_features / n_samples) + if n_features > n_samples and self.l2_reg_strength < 1e-7: + # We are in or close to an unpenalized underparametrized setting, where a + # large first LSMR iteration is beneficial. + # The multiplicative term is 1 <= term <= 50, log(1e-22) ~ -50 + self.A_norm *= 1 - np.log((self.l2_reg_strength + 1e-22) * 1e7) + self.r_norm = 1 + self.atol = 0 + self.lsmr_iter = 0 # number of total LSMR iterations + self.sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + self.sqrt_sw_sum = np.sqrt(self.sw_sum) + + def update_gradient_hessian(self, X, y, sample_weight): + """Update gradient and hessian. + + Update pointwise gradient and hessian, self.g and self.h, + as well as the full gradient, self.gradient. + """ + n_samples, n_features = X.shape + if not self.linear_loss.base_loss.is_multiclass: + self.linear_loss.base_loss.gradient_hessian( + y_true=y, + raw_prediction=self.raw_prediction, # this was updated in line_search + sample_weight=sample_weight, + gradient_out=self.g, + hessian_out=self.h, + n_threads=self.n_threads, + ) + self.g /= self.sw_sum + self.h /= self.sw_sum + + # See LinearModelLoss.gradient_hessian + # For non-canonical link functions and far away from the optimum, the + # pointwise hessian can be negative. 
We take care that 75% ot the hessian + # entries are positive. + self.hessian_warning = np.mean(self.h <= 0) > 0.25 + self.h = np.abs(self.h) + + # This duplicates a bit of code from LinearModelLoss.gradient. + weights, _ = self.linear_loss.weight_intercept(self.coef) + self.gradient = np.empty_like(self.coef, dtype=self.coef.dtype) + self.gradient[:n_features] = X.T @ self.g + self.l2_reg_strength * weights + if self.linear_loss.fit_intercept: + self.gradient[-1] = self.g.sum() + else: + # Here we may safely assume HalfMultinomialLoss. + # We use self.h to store the predicted probabilities instead of the hessian + self.linear_loss.base_loss.gradient_proba( + y_true=y, + raw_prediction=self.raw_prediction, # this was updated in line_search + sample_weight=sample_weight, + gradient_out=self.g, + proba_out=self.h, + n_threads=self.n_threads, + ) + self.g /= self.sw_sum + # Note: Multinomial hessian is always non-negative. + + # This duplicates a bit of code from LinearModelLoss.gradient. + n_classes = self.linear_loss.base_loss.n_classes + n_dof = n_features + int(self.linear_loss.fit_intercept) + weights, _ = self.linear_loss.weight_intercept(self.coef) + self.gradient = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") + # self.g.shape = (n_samples, n_classes) + self.gradient[:, :n_features] = ( + self.g.T @ X + self.l2_reg_strength * weights + ) + if self.linear_loss.fit_intercept: + self.gradient[:, -1] = self.g.sum(axis=0) + + def compute_A_b(self, X, y, sample_weight): + """Compute A and b for IRLS formulation. + + Returns + ------- + A : LinearOperator of shape (n_samples + n_features, n_dof) or \ + (n_samples + n_features) * n_classes, n_dof * n_classes) + For the multiclass case, `A @ x` expects `x = coef.ravel(order="C")` for + `coef.shape = (n_classes, n_dof)`, and `A.T @ x` expects + `x = b.ravel(order="F")` for `b.shape=(n_samples + n_features, n_classes)`. + + b : ndarray of shape (n_samples + n_features) or \ + ((n_samples + n_features) * n_classes) + """ + n_samples, n_features = X.shape + + if not self.linear_loss.base_loss.is_multiclass: + n_classes = 1 + sqrt_h = np.sqrt(self.h) + + b = np.r_[-self.g / sqrt_h, -self.sqrt_P * self.coef[:n_features]] + + if self.linear_loss.fit_intercept: + n_dof = n_features + 1 + + def matvec(x): + # A @ x with intercept + # We assume self.sqrt_P to be 1-d array of shape (n_features,), + # representing a diagonal matrix. + return np.r_[sqrt_h * (X @ x[:-1] + x[-1]), self.sqrt_P * x[:-1]] + + def rmatvec(x): + # A.T @ x with intercept + return np.r_[ + X.T @ (sqrt_h * x[:n_samples]) + self.sqrt_P * x[n_samples:], + sqrt_h @ x[:n_samples], + ] + + else: + n_dof = n_features + + def matvec(x): + # A @ x without intercept + return np.r_[sqrt_h * (X @ x), self.sqrt_P * x] + + def rmatvec(x): + # A.T @ x without intercept + return X.T @ (sqrt_h * x[:n_samples]) + self.sqrt_P * x[n_samples:] + + else: + # Here we may safely assume HalfMultinomialLoss. + n_classes = self.linear_loss.base_loss.n_classes + p = self.h # probability array of shape (n_samples, n_classes) + # We need a square root of X' h X with pointwise hessian h. Note that h is + # a priori a 4-dimensional matrix (n_samples ** 2, n_classes ** 2) and is + # diagonal in n_samples ** 2, i.e. effectively a 3-dimensional matrix. + # To accomplish this, we need the Cholesky or LDL' decomposition of h + # h = diag(p) - p' p + # or with indices i and j for classes + # h_ij = p_i * delta_ij - p_i * p_j + # This holds for every single point (n_sample). 
To tackle this, we use the + # class Multinomial_LDL_Decomposition, which provides further details. + # + # We have h = L D L' with lower triangular L having unit diagonal and + # diagonal D. This gives + # C = sqrt(D) L' X + # X' h X = C' C + # and C is our searched for square root in + # A = [ C] + # [sqrt(l2_reg_strength) * sqrt(P)] + # b = [ -(L sqrt(D))^(-1) g] + # [- sqrt(l2_reg_strength) * sqrt(P) @ self.coef] + + # Note again + # self.g.shape = p.shape = (n_samples, n_classes) + # self.coef.shape = (n_classes, n_dof) + # sqrt(h) becomes sqrt(D) L' + LDL = Multinomial_LDL_Decomposition(proba=p) + self.LDL = LDL # store it for compute_d2 + + # We need "g/sqrt(h)" such that g = L sqrt(D) "g/sqrt(h)", i.e. + # "g/sqrt(h)" = 1/sqrt(D) L^-1 @ g. + # Note that L^-1 is again lower triangular. + g_over_h = self.g.copy() # F-contiguous, shape (n_samples, n_classes) + LDL.inverse_L_sqrt_D_matmul(g_over_h) + if sample_weight is not None: + # Sample weights sw enter the hessian multiplicatively, i.e. sw * h. + # They can be dealt with by adding them to D, which means sqrt(sw) + # to sqrt(D) in every place. + sqrt_sw = np.sqrt(sample_weight) / self.sqrt_sw_sum + g_over_h /= sqrt_sw[:, None] + else: + sqrt_sw = 1 / self.sqrt_sw_sum + g_over_h *= self.sqrt_sw_sum + + # For ravelled results we use the convention that all values for the same + # class are in sequence. For n_classes = 2, the ravelled b looks like + # [ -g_over_h[:, 0]] n_samples elements of class 0 + # [-self.sqrt_P * coef.T[:, 0]] n_features elements of class 0 + # [ -g_over_h[:, 1]] n_samples elements of class 1 + # [-self.sqrt_P * coef.T[:, 1]] n_features elements of class 1 + # Note that this convention is different from + # LinearModelLoss.gradient_hessian_product which expects raveled coef to + # to have all classes in sequence. + b = np.r_[ + -g_over_h, + -self.sqrt_P[:, None] * self.coef[:, :n_features].T, + ].ravel(order="F") + + # Note on performance: + # X @ coef.T returns a C-contiguous array, but due to slicing along the + # first axis (along samples like x[:, i]), sqrt_D_Lt_matmul is more + # efficient for F-contiguous arrays. The output of X @ c can be made + # F-contigous by (c.T @ X.T).T. + + if self.linear_loss.fit_intercept: + n_dof = n_features + 1 + + def matvec(x): + # A @ x with or without intercept + # We assume self.sqrt_P to be 1-d array of shape (n_features,), + # representing a diagonal matrix. 
+ coef = x.reshape((n_classes, -1), order="C") + # C_coef = sqrt(D) L' X coef + # C_coef = X @ coef[:, :-1].T + coef[:, -1] # C-contiguous + # F-contiguous version, see note above: + C_coef = (coef[:, :-1] @ X.T + coef[:, -1].T[:, None]).T + LDL.sqrt_D_Lt_matmul(C_coef) + # C_coef.shape = (n_samples, n_classes) + if sample_weight is not None: + C_coef *= sqrt_sw[:, None] + else: + C_coef *= sqrt_sw + # For n_classes = 2, the ravelled result looks like + # [ C_Coef[:, 0]] n_samples elements of class 0 + # [sqrt_P * coef.T[:, 0]] n_features elements of class 0 + # [ C_Coef[:, 1]] n_samples elements of class 1 + # [sqrt_P * coef.T[:, 1]] n_features elements of class 1 + return np.r_[ + C_coef, + self.sqrt_P[:, None] * coef[:, :n_features].T, + ].ravel(order="F") + + def rmatvec(x): + # A.T @ x with intercept + # Ct_y = X' L sqrt(D) y + x = x.reshape((-1, n_classes), order="F") + y = x[:n_samples, :] # y is neither C nor F-contiguous + y = y.copy(order="F") + if sample_weight is not None: + y *= sqrt_sw[:, None] + else: + y *= sqrt_sw + L_sqrtD_y = LDL.L_sqrt_D_matmul(y) + Ct_y = X.T @ L_sqrtD_y # shape = (n_features, n_classes) + return np.r_[ + Ct_y + self.sqrt_P[:, None] * x[n_samples:, :], + np.sum(L_sqrtD_y, axis=0)[None, :], + ].ravel(order="F") + + else: + n_dof = n_features + + def matvec(x): + # A @ x without intercept + coef = x.reshape((n_classes, -1), order="C") + # C_coef = sqrt(D) L' X coef + # C_coef = X @ coef.T # C-contiguous + # F-contiguous version, see note above: + C_coef = (coef @ X.T).T + LDL.sqrt_D_Lt_matmul(C_coef) + if sample_weight is not None: + C_coef *= sqrt_sw[:, None] + else: + C_coef *= sqrt_sw + return np.r_[ + C_coef, + self.sqrt_P[:, None] * coef.T, + ].ravel(order="F") + + def rmatvec(x): + # A.T @ x without intercept + # Ct_y = X' L sqrt(D) y + x = x.reshape((-1, n_classes), order="F") + y = x[:n_samples, :] # y is neither C nor F-contiguous + y = y.copy(order="F") + if sample_weight is not None: + y *= sqrt_sw[:, None] + else: + y *= sqrt_sw + L_sqrtD_y = LDL.L_sqrt_D_matmul(y) + Ct_y = X.T @ L_sqrtD_y # shape = (n_features, n_classes) + return (Ct_y + self.sqrt_P[:, None] * x[n_samples:, :]).ravel( + order="F" + ) + + # Note that initializing LinearOperator seems to have some surprisingly sizable + # overhead. + A = scipy.sparse.linalg.LinearOperator( + shape=((n_samples + n_features) * n_classes, n_dof * n_classes), + matvec=matvec, + rmatvec=rmatvec, + ) + + return A, b + + def inner_solve(self, X, y, sample_weight): + """Compute Newton step. + + Sets: + - self.coef_newton via LSMR + As LSMR is an iterative method, NewtonLSMRSolver is an inexact Newton + method. + - self.gradient_times_newton + - self.lsmr_iter + - self.A_norm + - self.r_norm + - self.atol + + A_norm and r_norm are used for the inner tolerance used in LSMR. + """ + n_samples, n_features = X.shape + if not self.linear_loss.base_loss.is_multiclass: + n_classes = 1 + else: + n_classes = self.linear_loss.base_loss.n_classes + A, b = self.compute_A_b(X, y, sample_weight) + # The choice of atol in LSMR is essential for stability and for computation + # time. For n_samples > n_features, we most certainly have a least squares + # problem (no solution to the linear equation A x = b), such that the following + # stopping criterion with residual r = b - A x and x = coef_newton applies: + # ||A' r|| = ||H x + G|| <= atol * ||A|| * ||r||. + # For inexact Newton solvers, res = H x + G is called the residual and one + # usually chooses, see Eq. 
7.3 Nocedal & Wright 2nd ed, a stopping criterion + # ||res|| = ||A' r|| <= eta * ||G|| + # with a forcing sequence 0 < eta < 1 (eta_k for iteration k). + # As for our Newton conjugate gradient solver "_newton_cg", we set + # eta = min(0.5, np.sqrt(||G||)) + # which establishes a superlinear rate of convergence. + # Fortunately, we get approximations of the Frobenius norm of A and the norm of + # r, ||A|| and ||r|| respectively, for free by LSMR such that we can set + # atol = eta * ||G|| / (||A|| * ||r||) + # This results in our desired stopping criterion + # ||res|| = ||A' r|| <= eta * ||G|| + # at least approximately, as we have to use ||A|| and ||r|| from the last and + # not the current iteration. + # + # In the case of perfect interpolation, n_features >= n_samples, we might have + # an exact solution to A x = b. LSMR then uses the stopping criterion + # ||r|| <= atol * ||A|| * ||x|| + btol * ||b||. + # Note that + # ||b||^2 = ||g / sqrt(h)||^2 + l2_reg_strength * x0 @ P @ x0 + # and + # 1/2 * ||r||^2 = 1/2 * x @ H @ x + G @ x + 1/2 * ||b||^2 + # Here, x is the solution of Ax=b, i.e. x=coef_newton, and, neglecting line + # search, coef = x + x0. Thus, 1/2 ||r||^2 is just the Taylor series of the + # objective with the zero order term loss(x0) replaced by + # 1/2 * ||g/sqrt(h)||^2. + # + # We set btol to the strict self.tol to help cases of collinearity in X when + # there is (almost) no L2 penalty. + norm_G = scipy.linalg.norm(self.gradient) + eta = min(0.5, np.sqrt(norm_G)) + # Note that norm_G might not be decreasing from Newton iteration to iteration. + # This means we are still not close to the minimum and it is good when artol + # is then loose enough. + if self.verbose >= 3: + print(f" norm(gradient) = {norm_G}") + # Avoid division by 0 by adding tiny 1e-16. + atol = eta * norm_G / (self.A_norm * self.r_norm + 1e-16) + # Avoid that atol gets tiny too fast. + if atol < self.atol / 10: + atol = self.atol / 10 * np.sqrt(atol * 10 / self.atol) + self.atol = atol + if self.verbose >= 3: + print(f" {self.A_norm=} {self.r_norm=} {eta=} {atol=}") + result = scipy.sparse.linalg.lsmr( + A, + b, + damp=0, + atol=atol, + btol=self.tol, + maxiter=max(n_samples, n_features) * n_classes, # default is min(A.shape) + # default conlim = 1e8, for compatible systems 1e12 is still reasonable, + # see LSMR documentation + conlim=1e12, + show=self.verbose >= 3, + ) + # We store the estimated Frobenius norm of A and norm of residual r in + # self.A_norm and self.r_norm for tolerance of next iteration. + ( + self.coef_newton, + istop, + itn, + self.r_norm, + normar, + self.A_norm, + conda, + normx, + ) = result + self.lsmr_iter += itn + if self.verbose >= 2: + print(f" Inner iterations in LSMR = {itn}, total = {self.lsmr_iter}") + if self.coef.dtype == np.float32: + self.coef_newton = self.coef_newton.astype(np.float32) + if not self.linear_loss.base_loss.is_multiclass: + self.gradient_times_newton = self.gradient @ self.coef_newton + else: + self.coef_newton = self.coef_newton.reshape((n_classes, -1), order="C") + # Center result such that sum over classes equals zero. + # Note: The L2 penalty guarantees that coef_newton step is centered, i.e. + # np.mean(self.coef_newton, axis=0) = 0. + # But for the intercept term self.coef_newton[:, -1] this is less clear. + # Nevertheless, all tests pass without the following line. 
+ # self.coef_newton -= np.mean(self.coef_newton, axis=0) + + self.gradient_times_newton = self.gradient.ravel( + order="F" + ) @ self.coef_newton.ravel(order="F") + + # In the first Newton iteraton, we tolerate istop == 7 and other things. We + # just got started and are therefore forgiving. + if self.iteration == 1: + return + # Note: We could detect too large steps by comparing norm(coef_newton) = normx + # with norm(gradient) or with the already available condition number of A, e.g. + # conda. + if istop == 7: + self.use_fallback_lbfgs_solve = True + msg = ( + f"The inner solver of {self.__class__.__name__} reached " + f"maxiter={itn} before the other stopping conditions were " + f"satisfied at iteration #{self.iteration}. " + ) + elif istop in (3, 6): + self.use_fallback_lbfgs_solve = True + msg = ( + f"The inner solver of {self.__class__.__name__} complained that the " + f"condition number of A (A'A = Hessian), conda={conda}, seems to be " + "greater than the given limit conlim=1e8 at iteration " + f"#{self.iteration}." + ) + + if self.use_fallback_lbfgs_solve: + warnings.warn( + msg + "It will now resort to lbfgs instead.\n" + "This may be caused by singular or very ill-conditioned Hessian " + "matrix. " + "Further options are to use another solver or to avoid such situation " + "in the first place. Possible remedies are removing collinear features" + "of X or increasing the penalization strengths.", + ConvergenceWarning, + ) + if self.verbose: + print( + " The inner solver had problems to converge and resorts to lbfgs." + ) + self.use_fallback_lbfgs_solve = True + return + + def compute_d2(self, X, sample_weight): + """Compute square of Newton decrement.""" + weights, intercept, raw_prediction = self.linear_loss.weight_intercept_raw( + self.coef_newton, X + ) + if not self.linear_loss.base_loss.is_multiclass: + d2 = np.sum(raw_prediction * self.h * raw_prediction) + else: + d = self.LDL.sqrt_D_Lt_matmul(raw_prediction) + if sample_weight is not None: + d *= sample_weight[:, None] + d = d.ravel(order="F") + d2 = d @ d + d2 += 2 * self.linear_loss.l2_penalty(weights, self.l2_reg_strength) + return d2 + + def finalize(self, X, y, sample_weight): + if hasattr(self, "LDL"): + delattr(self, "LDL") diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 4cac889a4da51..5d99ba97ddd41 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -25,7 +25,12 @@ from ...utils.optimize import _check_optimize_result from ...utils.validation import _check_sample_weight, check_is_fitted from .._linear_loss import LinearModelLoss -from ._newton_solver import NewtonCholeskySolver, NewtonSolver +from ._newton_solver import NewtonCholeskySolver, NewtonLSMRSolver, NewtonSolver + +NEWTON_SOLVER = { + "newton-cholesky": NewtonCholeskySolver, + "newton-lsmr": NewtonLSMRSolver, +} class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): @@ -65,7 +70,7 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X @ coef + intercept). - solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-lsmr'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' @@ -81,6 +86,20 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): .. 
versionadded:: 1.2 + 'newton-lsmr' + Uses Newton-Raphson steps formulated as iteratively reweighted least + squares (IRLS), which is solved by LSMR. Contrary to `newton-cholesky`, + this solver does not explicitly materialize the Hessian matrix but instead + leverages knowledge about its structure to incrementally solve the least + squares problems via a series of matrix vector operations where the + matrices have block structure with block sizes scaling as + `(n_samples, n_features)` and `(n_samples, n_classes)` therefore limiting + the memory requirements. + Additionaly, this is numerically more stable for ill-conditioned X compared + to `newton-cholesky`. + + .. versionadded:: 1.5 + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. @@ -140,7 +159,7 @@ class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator): "alpha": [Interval(Real, 0.0, None, closed="left")], "fit_intercept": ["boolean"], "solver": [ - StrOptions({"lbfgs", "newton-cholesky"}), + StrOptions({"lbfgs", "newton-cholesky", "newton-lsmr"}), Hidden(type), ], "max_iter": [Interval(Integral, 1, None, closed="left")], @@ -284,8 +303,8 @@ def fit(self, X, y, sample_weight=None): ) self.n_iter_ = _check_optimize_result("lbfgs", opt_res) coef = opt_res.x - elif self.solver == "newton-cholesky": - sol = NewtonCholeskySolver( + elif self.solver in NEWTON_SOLVER.keys(): + sol = NEWTON_SOLVER[self.solver]( coef=coef, linear_loss=linear_loss, l2_reg_strength=l2_reg_strength, @@ -483,7 +502,7 @@ class PoissonRegressor(_GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (`X @ coef + intercept`). - solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-lsmr'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' @@ -499,6 +518,18 @@ class PoissonRegressor(_GeneralizedLinearRegressor): .. versionadded:: 1.2 + 'newton-lsmr' + Uses Newton-Raphson steps formulated as iteratively reweighted least + squares (IRLS), which is solved by LSMR. Contrary to `newton-cholesky`, + this solver does not explicitly materialize the Hessian matrix but instead + leverages knowledge about its structure to incrementally solve the least + squares problems via a series of matrix vector operations where the + matrices have size `(n_samples, n_features)`. + Additionaly, this is numerically more stable for ill-conditioned X compared + to `newton-cholesky`. + + .. versionadded:: 1.5 + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. @@ -614,7 +645,7 @@ class GammaRegressor(_GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor `X @ coef_ + intercept_`. - solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-lsmr'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' @@ -630,6 +661,18 @@ class GammaRegressor(_GeneralizedLinearRegressor): .. versionadded:: 1.2 + 'newton-lsmr' + Uses Newton-Raphson steps formulated as iteratively reweighted least + squares (IRLS), which is solved by LSMR. 
Contrary to `newton-cholesky`, + this solver does not explicitly materialize the Hessian matrix but instead + leverages knowledge about its structure to incrementally solve the least + squares problems via a series of matrix vector operations where the + matrices have size `(n_samples, n_features)`. + Additionaly, this is numerically more stable for ill-conditioned X compared + to `newton-cholesky`. + + .. versionadded:: 1.3 + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. @@ -776,7 +819,7 @@ class TweedieRegressor(_GeneralizedLinearRegressor): - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian distributions - solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + solver : {'lbfgs', 'newton-cholesky', 'newton-lsmr'}, default='lbfgs' Algorithm to use in the optimization problem: 'lbfgs' @@ -792,6 +835,18 @@ class TweedieRegressor(_GeneralizedLinearRegressor): .. versionadded:: 1.2 + 'newton-lsmr' + Uses Newton-Raphson steps formulated as iteratively reweighted least + squares (IRLS), which is solved by LSMR. Contrary to `newton-cholesky`, + this solver does not explicitly materialize the Hessian matrix but instead + leverages knowledge about its structure to incrementally solve the least + squares problems via a series of matrix vector operations where the + matrices have size `(n_samples, n_features)`. + Additionaly, this is numerically more stable for ill-conditioned X compared + to `newton-cholesky`. + + .. versionadded:: 1.3 + max_iter : int, default=100 The maximal number of iterations for the solver. Values must be in the range `[1, inf)`. diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 26f6bdc08d254..28f893038f286 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -12,11 +12,17 @@ from numpy.testing import assert_allclose from scipy import linalg from scipy.optimize import minimize, root +from scipy.sparse.linalg import lsmr -from sklearn._loss import HalfBinomialLoss, HalfPoissonLoss, HalfTweedieLoss +from sklearn._loss import ( + HalfBinomialLoss, + HalfMultinomialLoss, + HalfPoissonLoss, + HalfTweedieLoss, +) from sklearn._loss.link import IdentityLink, LogLink from sklearn.base import clone -from sklearn.datasets import make_low_rank_matrix, make_regression +from sklearn.datasets import make_classification, make_low_rank_matrix, make_regression from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import ( GammaRegressor, @@ -25,12 +31,18 @@ TweedieRegressor, ) from sklearn.linear_model._glm import _GeneralizedLinearRegressor -from sklearn.linear_model._glm._newton_solver import NewtonCholeskySolver -from sklearn.linear_model._linear_loss import LinearModelLoss +from sklearn.linear_model._glm._newton_solver import ( + NewtonCholeskySolver, + NewtonLSMRSolver, +) +from sklearn.linear_model._linear_loss import ( + LinearModelLoss, + Multinomial_LDL_Decomposition, +) from sklearn.metrics import d2_tweedie_score, mean_poisson_deviance from sklearn.model_selection import train_test_split -SOLVERS = ["lbfgs", "newton-cholesky"] +SOLVERS = ["lbfgs", "newton-cholesky", "newton-lsmr"] class BinomialRegressor(_GeneralizedLinearRegressor): @@ -237,7 +249,12 @@ def test_glm_regression(solver, fit_intercept, glm_dataset): model.fit(X, y) - rtol = 5e-5 if solver == "lbfgs" else 1e-9 + if solver == "lbfgs": + rtol = 5e-5 + elif solver == "newton-lsmr": + rtol = 
5e-9 + else: + rtol = 1e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) @@ -288,7 +305,12 @@ def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset): warnings.simplefilter("ignore", ConvergenceWarning) model.fit(X, y) - rtol = 2e-4 if solver == "lbfgs" else 5e-9 + if solver == "lbfgs": + rtol = 2e-4 + elif solver == "newton-lsmr": + rtol = 2e-8 + else: + rtol = 5e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol) @@ -328,7 +350,12 @@ def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset): intercept = 0 model.fit(X, y) - rtol = 3e-5 if solver == "lbfgs" else 5e-9 + if solver == "lbfgs": + rtol = 3e-5 + elif solver == "newton-lsmr": + rtol = 1e-8 + else: + rtol = 5e-9 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) @@ -391,7 +418,7 @@ def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset): norm_solution = np.linalg.norm(np.r_[intercept, coef]) norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) - if solver == "newton-cholesky": + if solver in ("newton-cholesky", "newton-lsmr"): # XXX: This solver shows random behaviour. Sometimes it finds solutions # with norm_model <= norm_solution! So we check conditionally. if norm_model < (1 + 1e-12) * norm_solution: @@ -484,7 +511,10 @@ def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_datase # we get a solution, i.e. a (non-unique) minimum of the objective function ... rtol = 1e-6 if solver == "lbfgs" else 5e-6 assert_allclose(model.predict(X), y, rtol=rtol) - if (solver == "lbfgs" and fit_intercept) or solver == "newton-cholesky": + if (solver == "lbfgs" and fit_intercept) or solver in ( + "newton-cholesky", + "newton-lsmr", + ): # Same as in test_glm_regression_unpenalized. # But it is not the minimum norm solution. Otherwise the norms would be # equal. @@ -558,7 +588,7 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase norm_solution = np.linalg.norm(np.r_[intercept, coef]) norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_]) - if solver == "newton-cholesky": + if solver in ("newton-cholesky", "newton-lsmr"): # XXX: This solver shows random behaviour. Sometimes it finds solutions # with norm_model <= norm_solution! So we check conditionally. if not (norm_model > (1 + 1e-12) * norm_solution): @@ -570,7 +600,7 @@ def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_datase # equal. 
assert norm_model > (1 + 1e-12) * norm_solution else: - rtol = 1e-5 if solver == "newton-cholesky" else 1e-4 + rtol = 1e-5 if solver in ("newton-cholesky", "newton-lsmr") else 1e-4 assert model.intercept_ == pytest.approx(intercept, rel=rtol) assert_allclose(model.coef_, coef, rtol=rtol) @@ -837,7 +867,7 @@ def test_normal_ridge_comparison( assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=2e-4) -@pytest.mark.parametrize("solver", ["lbfgs", "newton-cholesky"]) +@pytest.mark.parametrize("solver", ["lbfgs", "newton-cholesky", "newton-lsmr"]) def test_poisson_glmnet(solver): """Compare Poisson regression with L2 regularization and LogLink to glmnet""" # library("glmnet") @@ -927,8 +957,8 @@ def test_tags(estimator, value): assert estimator._get_tags()["requires_positive_y"] is value -def test_linalg_warning_with_newton_solver(global_random_seed): - newton_solver = "newton-cholesky" +@pytest.mark.parametrize("newton_solver", ["newton-cholesky", "newton-lsmr"]) +def test_linalg_warning_with_newton_solver(newton_solver, global_random_seed): rng = np.random.RandomState(global_random_seed) # Use at least 20 samples to reduce the likelihood of getting a degenerate # dataset for any global_random_seed. @@ -979,11 +1009,17 @@ def test_linalg_warning_with_newton_solver(global_random_seed): # Fitting a Newton solver on the collinear version of the training data # without regularization should raise an informative warning and fallback # to the LBFGS solver. - msg = ( - "The inner solver of .*Newton.*Solver stumbled upon a singular or very " - "ill-conditioned Hessian matrix" - ) - with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): + if newton_solver == "newton-cholesky": + msg = ( + "The inner solver of .*Newton.*Solver stumbled upon a singular or very " + "ill-conditioned Hessian matrix" + ) + with pytest.warns(scipy.linalg.LinAlgWarning, match=msg): + reg = PoissonRegressor(solver=newton_solver, alpha=0.0, tol=tol).fit( + X_collinear, y + ) + else: + # newton-lsmr has no problems with collinearity reg = PoissonRegressor(solver=newton_solver, alpha=0.0, tol=tol).fit( X_collinear, y ) @@ -1008,6 +1044,115 @@ def test_linalg_warning_with_newton_solver(global_random_seed): ) +@pytest.mark.parametrize( + "solver, warn1, msg1, warn2, msg2", + [ + ( + "lbfgs", + None, + None, + [RuntimeWarning], + ["invalid value encountered in matmul"], + ), + ( + "newton-cholesky", + scipy.linalg.LinAlgWarning, + ( + "The inner solver of .*Newton.*Solver stumbled upon a singular or very " + "ill-conditioned Hessian matrix" + ), + [scipy.linalg.LinAlgWarning, RuntimeWarning], + [ + ( + "The inner solver of .*Newton.*Solver stumbled upon a singular or" + " very ill-conditioned Hessian matrix" + ), + "invalid value encountered in matmul", + ], + ), + ( + "newton-lsmr", + None, + None, + [ConvergenceWarning], + ["Newton solver did not converge after .* iterations"], + ), + ], +) +def test_solver_on_ill_conditioned_X( + solver, warn1, msg1, warn2, msg2, global_random_seed +): + """Test GLM solvers on ill conditioned X with high condition number. + + Note that numbers in this test are tuned such that is passes for all global random + seeds. + """ + rng = np.random.RandomState(global_random_seed) + # Use at least 20 samples to reduce the likelihood of getting a degenerate + # dataset for any global_random_seed. 
+ X_orig = rng.uniform(low=-1, high=1, size=(20, 3)) + y = rng.poisson( + np.exp(X_orig @ np.ones(X_orig.shape[1])), size=X_orig.shape[0] + ).astype(np.float64) + + tol = 1e-8 + model = PoissonRegressor(solver=solver, alpha=0.0, tol=tol, max_iter=200) + + # No warning raised on well-conditioned design, even without regularization. + with warnings.catch_warnings(): + warnings.simplefilter("error") + reg = clone(model).fit(X_orig, y) + original_deviance = mean_poisson_deviance(y, reg.predict(X_orig)) + + # Construct an ill-conditioned problem by some almost identical columns. + X_ill_conditioned = X_orig.copy() + X_ill_conditioned[:, 1] = X_ill_conditioned[:, 0] + X_ill_conditioned[0, 1] += 1e-10 + X_ill_conditioned[-1, 1] -= 1e-10 + # Make sure that it is ill-conditioned <=> large condition number. + assert np.linalg.cond(X_ill_conditioned) > 1e10 * np.linalg.cond(X_orig) + + if warn1 is None: + # Assert that no warning is raised. + with warnings.catch_warnings(): + warnings.simplefilter("error") + reg = clone(model).fit(X_ill_conditioned, y) + else: + # Assert that the given warning is raised. + with pytest.warns(warn1, match=msg1): + reg = clone(model).fit(X_ill_conditioned, y) + + # Construct another ill-conditioned problem by scaling of features. + X_ill_conditioned = X_orig.copy() + X_ill_conditioned[:, 0] *= 1e-6 + X_ill_conditioned[:, 1] *= 1e2 # too large X may overflow in link function + # Make sure that it is ill conditioned >=> large condition number. + assert np.linalg.cond(X_ill_conditioned) > 1e5 * np.linalg.cond(X_orig) + + test_loss = False + if warn2 is None: + with warnings.catch_warnings(): + warnings.simplefilter("error") + reg = clone(model).fit(X_ill_conditioned, y) + else: + # Whether or not a warning is raised depends on global_random_seed. + with warnings.catch_warnings(record=True) as w: + for warn in warn2: + warnings.simplefilter("ignore", warn) + reg = clone(model).fit(X_ill_conditioned, y) + test_loss = True + assert len(w) == 0 + + if test_loss: + # Without penalty, scaling of columns has no effect on predictions. + ill_cond_deviance = mean_poisson_deviance(y, reg.predict(X_ill_conditioned)) + if solver in ("lbfgs", "newton-cholesky"): + pytest.xfail( + f"Solver {solver} does not converge but does so without warning." + ) + assert ill_cond_deviance == pytest.approx(original_deviance, rel=1e-2) + + @pytest.mark.parametrize("verbose", [0, 1, 2]) def test_newton_solver_verbosity(capsys, verbose): """Test the std output of verbose newton solvers.""" @@ -1109,3 +1254,325 @@ def test_newton_solver_verbosity(capsys, verbose): "The inner solver detected a pointwise Hessian with many negative values" " and resorts to lbfgs instead." 
in captured.out ) + + +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("l2_reg_strength", [0, 1.5]) +def test_NewtonLSMRSolver_multinomial_A_b( + fit_intercept, l2_reg_strength, global_random_seed +): + """Test NewtonLSMRSolver.compute_A_b for multinomial case""" + n_samples, n_features, n_classes = 12, 10, 5 + n_dof = n_features + fit_intercept + rng = np.random.RandomState(global_random_seed) + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_classes=n_classes, + n_informative=n_features - 1, + n_redundant=1, + random_state=rng, + ) + y = y.astype(float) + coef = rng.standard_normal(size=(n_classes, n_features + fit_intercept)) + coef -= np.mean(coef, axis=0) + + multinomial_loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), fit_intercept=fit_intercept + ) + gradient, hessp = multinomial_loss.gradient_hessian_product( + coef=coef, + X=X, + y=y, + sample_weight=None, + l2_reg_strength=l2_reg_strength, + ) + sol = NewtonLSMRSolver( + coef=coef, + linear_loss=multinomial_loss, + l2_reg_strength=l2_reg_strength, + ) + sol.setup(X=X, y=y, sample_weight=None) + sol.update_gradient_hessian(X=X, y=y, sample_weight=None) + + # Hessian @ coef + # with hessp + H_coef = hessp(coef) # shape = (n_classes, n_dof) + # with A + A, b = sol.compute_A_b(X=X, y=y, sample_weight=None) + At_A_coef = A.T @ A @ coef.ravel(order="C") + # The results are differently ravelled, we restore the 2-d arrays with n_classes + # on the 1st (=last) axis. + assert_allclose(At_A_coef.reshape(-1, n_classes, order="F"), H_coef.T) + # Note: The following does not work for all global_random_seeds. The reason behind + # it is unclear. + # assert_allclose( + # A.rmatvec(b).reshape(-1, n_classes, order="F"), -gradient.T, rtol=1e-4 + # ) + + # Test consistency of A, i.e. reconstructing the matrix based on + # A @ unit_vector and A.T @ unit_vector should give the same matrix. 
+ unit_vec = np.zeros(n_dof * n_classes) + A_matrix1 = np.zeros(((n_samples + n_features) * n_classes, n_dof * n_classes)) + for i in range(n_dof * n_classes): + unit_vec[i] = 1 + A_matrix1[:, i] = A @ unit_vec + unit_vec[i] = 0 + unit_vec = np.zeros((n_samples + n_features) * n_classes) + A_matrix2 = np.zeros(((n_samples + n_features) * n_classes, n_dof * n_classes)) + for j in range((n_samples + n_features) * n_classes): + unit_vec[j] = 1 + A_matrix2[j, :] = A.rmatvec(unit_vec) + unit_vec[j] = 0 + assert_allclose(A_matrix1, A_matrix2) + + +@pytest.mark.parametrize("fit_intercept", (False, True)) +@pytest.mark.parametrize("with_sample_weight", (False, True)) +def test_NewtonLSMRSolver_multinomial_A_b_on_3_classes( + fit_intercept, with_sample_weight, global_random_seed +): + """Test NewtonLSMRSolver.compute_A_b for multinomial case with 3 classes.""" + n_samples, n_features, n_classes = 5, 2, 3 + n_dof = n_features + fit_intercept + rng = np.random.RandomState(global_random_seed) + X = np.array([[1.0, 0], [1, 1], [1, 2], [1, 3], [1, 4]]) + Xi = X + # coef.shape = (n_classes, n_dof) and coef.sum(axis=0) = 0 + coef = np.array([[1.0, -1], [1, 0], [-2, 1]]) + if fit_intercept: + X[0, 0] = -1 # to make Xi positive definite + coef = np.c_[coef, [[2], [1], [-3]]] + Xi = np.c_[X, np.ones(n_samples)] + assert np.linalg.matrix_rank(Xi) == n_dof + y = np.array([0.0, 1, 1, 1, 2]) + + if with_sample_weight: + sw = 1.0 + np.arange(n_samples) + sw_sum = np.sum(sw) + else: + sw = None + sw_sum = n_samples + + multinomial_loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes, sample_weight=sw), + fit_intercept=fit_intercept, + ) + sol = NewtonLSMRSolver( + coef=coef, + linear_loss=multinomial_loss, + l2_reg_strength=0, + ) + sol.setup(X=X, y=y, sample_weight=sw) + sol.update_gradient_hessian(X=X, y=y, sample_weight=sw) + A, b = sol.compute_A_b(X=X, y=y, sample_weight=sw) + + _, _, raw_prediction = multinomial_loss.weight_intercept_raw(coef, X) + # p.shape = (n_samples, n_classes) + p = multinomial_loss.base_loss.predict_proba(raw_prediction) + assert_allclose(np.sum(p, axis=1), 1) # consequence of coef.sum(axis=0) = 0 + + # Extended vectors for 3 classes: + # For y_ext and p_ext, we append the vectors for different classes one after the + # other. + y_ext = np.r_[y == 0, y == 1, y == 2] + p_ext = p.ravel(order="F") + coef_ext = coef.ravel(order="C") + sw_ext = np.tile(sw, 3) + # Extended matrices for 3 classes: + # X_ext = [X, 0, 0] W = [W00, W01, W02] + # [0, X, 0] [W10, W11, W12] + # [0, 0, X] [W20, W21, W22] + # X_ext.shape = (n_classes * n_samples, n_classes * n_features) + # W.shape = (n_classes * n_samples, n_classes * n_samples) + # Each Wij is a diagonal matrix with elements pi * (delta_ij - pj). + X_ext = linalg.block_diag(Xi, Xi, Xi) + W00 = np.diag(p[:, 0] * (1 - p[:, 0])) + W11 = np.diag(p[:, 1] * (1 - p[:, 1])) + W22 = np.diag(p[:, 2] * (1 - p[:, 2])) + W01 = np.diag(-p[:, 0] * p[:, 1]) + W02 = np.diag(-p[:, 0] * p[:, 2]) + W12 = np.diag(-p[:, 1] * p[:, 2]) + W_ext = np.block([[W00, W01, W02], [W01, W11, W12], [W02, W12, W22]]) / sw_sum + g_ext = (p_ext - y_ext) / sw_sum # pointwise gradient + if with_sample_weight: + W_ext *= sw_ext[:, None] + g_ext *= sw_ext + G_ext = X_ext.T @ g_ext # full gradient + H_ext = X_ext.T @ W_ext @ X_ext # full hessian + + assert_allclose(g_ext.reshape((n_samples, n_classes), order="F"), sol.g) + + # Note that both X_ext @ v and A @ v need v = coef.ravel(order="C") for + # coef.shape = (n_classes, n_dof). 
+ + # v is like a coefficient + v = rng.standard_normal(size=(n_classes, n_dof)) + # u is a test vector for A', i.e. we test A.T @ u + u = rng.standard_normal(size=(n_samples + n_features, n_classes)) + + # Check that A is a square root of H, i.e. H = A'A. + H_ext_v = H_ext @ v.ravel(order="C") + A_v = A @ v.ravel(order="C") + At_A_v = A.T @ A_v + assert_allclose(At_A_v, H_ext_v) + + # Check that A'b = -G + assert_allclose(A.T @ b, -G_ext) + + # Check that A is constructed from LDL decomposition. + LDL = Multinomial_LDL_Decomposition(proba=p) + D_Lt_X_v = LDL.sqrt_D_Lt_matmul(Xi @ v.T) / np.sqrt(sw_sum) + if with_sample_weight: + D_Lt_X_v *= np.sqrt(sw)[:, None] + assert_allclose(A_v.reshape((-1, n_classes), order="F")[:n_samples, :], D_Lt_X_v) + assert_allclose(A_v[n_classes * n_samples :], 0) + assert_allclose( + A_v.reshape((-1, n_classes), order="F")[:n_samples, :], D_Lt_X_v, atol=1e-8 + ) + + # Check LDL decomposition of W explicitly. + L, D, perm = linalg.ldl(W_ext, lower=True) + # D should be diagonal (and not block diagonal) and non-negative. + assert_allclose(np.diag(np.diag(D)), D) + assert np.all(D >= -1e-15) + D[D < 1e-15] = 0 # Tiny values would be zero in exact arithmetic. + assert_allclose(L @ D @ L.T, W_ext) + D_Lt_X = np.sqrt(D) @ L.T @ X_ext + assert_allclose(D_Lt_X.T @ D_Lt_X, H_ext) + assert_allclose(D_Lt_X @ v.ravel(order="C"), D_Lt_X_v.ravel(order="F"), atol=1e-8) + + # Construct matrix A explicitly. Note: no intercept, penalty is zero. + A_ext = np.r_[D_Lt_X, np.zeros((n_classes * n_dof, n_classes * n_dof))] + assert_allclose(A_ext.T @ A_ext, H_ext) + D_inv = np.sqrt(D) + D_inv[D_inv > 0] = 1 / D_inv[D_inv > 0] + D_inv[D_inv <= 0] = 0 + assert_allclose(L @ np.sqrt(D) @ D_inv @ linalg.solve(L, g_ext), g_ext) + assert_allclose(X_ext.T @ L @ np.sqrt(D) @ D_inv @ linalg.solve(L, g_ext), G_ext) + # Note that A_ext needs slicing u in a way not possible with ravel, i.e. + # treat u[:n_samples * n_classes, :] and u[n_samples * n_classes:, :] differently. + assert_allclose( + A_ext[: n_classes * n_samples, :].T @ u[:n_samples, :].ravel(order="F"), + A.T @ u.ravel(order="F"), + ) + + # Construct b explicitly. + b_ext = np.r_[-D_inv @ linalg.solve(L, g_ext), np.zeros(n_dof * n_classes)] + assert_allclose( + b_ext[: n_samples * n_classes], + b.reshape((-1, n_classes), order="F")[:n_samples, :].ravel(order="F"), + atol=1e-15, + ) + assert_allclose(A_ext.T @ b_ext, -G_ext) + + # Check equivalence of linear equation H @ x = H_v to least squares problem + # ||D_Lt_X @ x - D_Lt_X_v||. Note that H (unpenalized with all classes) is + # singular and we use `lstsq` instead of `solve`. + res = linalg.lstsq(H_ext, H_ext_v)[0].reshape((n_classes, -1)) + assert_allclose(res, v - np.mean(v, axis=0)) + res = linalg.lstsq(D_Lt_X, D_Lt_X_v.ravel(order="F"))[0].reshape((n_classes, -1)) + assert_allclose(res - np.mean(res, axis=0), v - np.mean(v, axis=0)) + + # Check Newton step, i.e. H @ x = -G. + # Here, we need to add a penalty term. Otherwise, the linear system would be + # singular. 
+ alpha = 1e-4 + sol = NewtonLSMRSolver( + coef=coef, + linear_loss=multinomial_loss, + l2_reg_strength=alpha, + ) + sol.setup(X=X, y=y, sample_weight=sw) + sol.update_gradient_hessian(X=X, y=y, sample_weight=sw) + A, b = sol.compute_A_b(X=X, y=y, sample_weight=sw) + pen_ext = alpha * np.identity(n_classes * n_dof) + if fit_intercept: + pen_ext[:, n_dof - 1 :: n_dof] = 0 + assert_allclose( + A.T @ A @ v.ravel(order="C"), (H_ext + pen_ext) @ v.ravel(order="C") + ) + assert_allclose(A.T @ b, -(G_ext + pen_ext @ coef_ext)) + + # Check Newton step. + # Note: This works despite the fact that W=LDL is a singular matrix and therefore + # D has some zeros on the diagonal and is not strictly invertible. But this does + # not seem to matter. + if fit_intercept: + with warnings.catch_warnings(): + # Due to the intercept term not being penalized, the linear system might + # still be singular. + warnings.simplefilter("ignore", scipy.linalg.LinAlgWarning) + res1 = linalg.solve(H_ext + pen_ext, -(G_ext + pen_ext @ coef_ext)).reshape( + (n_classes, -1) + ) + # For the same reasons, the intercept might need class centering. + res1[:, -1] -= np.mean(res1[:, -1]) + else: + res1 = linalg.solve(H_ext + pen_ext, -(G_ext + pen_ext @ coef_ext)).reshape( + (n_classes, -1) + ) + # sum over classes = 0 + assert_allclose(res1.sum(axis=0), 0, atol=5e-9) + assert_allclose((H_ext + pen_ext) @ res1.ravel(), -(G_ext + pen_ext @ coef_ext)) + assert_allclose(A.T @ A @ res1.ravel(order="C"), A.T @ b) + res2 = lsmr(A, b, maxiter=(n_features + n_samples) * n_classes, show=True) + res2 = res2[0].reshape((n_classes, -1)) + assert_allclose(res1, res2) + + +@pytest.mark.parametrize("fit_intercept", [False, True]) +@pytest.mark.parametrize("l2_reg_strength", [0, 5.5]) +def test_NewtonLSMRSolver_multinomial_on_binary_problem( + fit_intercept, l2_reg_strength, global_random_seed +): + """Test NewtonLSMRSolver multinomial case on a binary problem.""" + n_samples, n_features, n_classes = 100, 3, 2 + rng = np.random.RandomState(global_random_seed) + X, y = make_classification( + n_samples=n_samples, + n_features=n_features, + n_classes=n_classes, + n_informative=n_features - 1 * 0, + n_redundant=1 * 0, + flip_y=0.1, + random_state=rng, + ) + y = y.astype(float) + + # Note that the factor of 2 for BinomialRegressor is to adjust for the + # parametrization difference to the multinomial loss. 
+ bin = BinomialRegressor( + fit_intercept=fit_intercept, + alpha=l2_reg_strength / 2, + solver="newton-cholesky", + tol=1e-8, + ).fit(X, y) + bin_loss = LinearModelLoss( + base_loss=HalfBinomialLoss(), + fit_intercept=fit_intercept, + ) + + linear_loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), + fit_intercept=fit_intercept, + ) + sol = NewtonLSMRSolver( + coef=linear_loss.init_zero_coef(X), + linear_loss=linear_loss, + l2_reg_strength=l2_reg_strength, + tol=1e-8, + verbose=0, + ) + sol.solve(X, y, None) + + assert_allclose(np.mean(sol.coef, axis=0), 0, atol=1e-13) + if fit_intercept: + coef_bin = np.r_[bin.coef_, bin.intercept_] + else: + coef_bin = bin.coef_ + assert_allclose( + linear_loss.loss(coef=sol.coef, X=X, y=y, l2_reg_strength=l2_reg_strength), + bin_loss.loss(coef=coef_bin, X=X, y=y, l2_reg_strength=l2_reg_strength / 2), + ) + assert_allclose(sol.coef[1, :], coef_bin / 2, rtol=5e-6) diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index e8c1466b30623..1e96e687aac6e 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -2,6 +2,8 @@ Loss functions for linear models with raw_prediction = X @ coef """ +import warnings + import numpy as np from scipy import sparse @@ -19,7 +21,9 @@ class LinearModelLoss: loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept) + 1/2 * l2_reg_strength * ||coef||_2^2 - with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i. + with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i. For the + multiclass case, the squared L2 norm is replaced by the squared Frobenius norm, + which equals the L2 norm of the flattend/ravelled coefficient array. Gradient and hessian, for simplicity without intercept, are:: @@ -644,6 +648,10 @@ def hessp(s): # # See also https://github.com/scikit-learn/scikit-learn/pull/3646#discussion_r17461411 # noqa def hessp(s): + # s.shape = (n_classes, n_features,) or (n_classes * n_features) + # It could be the coefficients (including the intercept if present). + # Returns an array of shape = (n_classes, n_dof) or (n_classes * n_dof) + # depending on the shape of s. s = s.reshape((n_classes, -1), order="F") # shape = (n_classes, n_dof) if self.fit_intercept: s_intercept = s[:, -1] @@ -670,3 +678,247 @@ def hessp(s): return grad.ravel(order="F"), hessp return grad, hessp + + +class Multinomial_LDL_Decomposition: + """A class for symbolic LDL' decomposition of multinomial hessian. + + The pointwise hessian of the multinomial loss with c = n_classes is given by + h = diag(p) - p' p or with indices + h_ij = p_i * delta_ij - p_i * p_j i and j are indices of classes. + This holds for every single point (sample). The LDL decomposition + of this 2-dim matrix is given in [1] for p_i > 0 and sum(p) <= 1 as (math indices) + h = L D L' with lower triangular L and diagonal D:: text + + q_0 = 1 + q_i = 1 - sum(p_k, k=1..c) + L_ii = 1 + for i > j: + L_ij = -p_i / q_j + D_ii = p_i * q_i / q_{i-1} + + If the p_i sum to 1, then q_c = 0 and D_cc = 0, with c = n_classes. + The inverse L^-1 is also lower triangular and given by:: text + + (L^-1)_ii = 1 + for i > j: + (L^-1)_ij = f_i = p_i / q_{i-1} = -L_{i, i-1} + + Note that with Python indices (zero-based), we have: p_i = p[:, i-1], + q_i = q[:, i-1] and we don't explicitly need q_0 = 1. + + The trick is to apply this LDL decomposition to all points (samples) at the same + time. 
This class therefore provides methods to apply products with the matrices L + and D to vectors/matrices. All objects have the 0-th dimension for n_samples and + the 1st dimention for n_classes, i.e. shape (n_samples, n_classes). + + Parameters + ---------- + proba : ndarray of shape (n_samples, n_classes) + Array of predicted probabilities per class (and sample). + + Attributes + ---------- + p : ndarray of shape (n_samples, n_classes) + Array of predicted probabilities per class (and sample). + + q_inv : ndarray of shape (n_samples, n_classes) + Helper array, inverse of q[:, j] = 1 - sum_{i=0}^j p[:, i], i.e. 1/q. + + sqrt_d : ndarray of shape (n_samples, n_classes) + Square root of the diagonal matrix D, D_ii = p_i * q_i / q_{i-1} + with q_{-1} = 1. + + proba_sum_to_1 : bool + True if propabilities p sum to 1. This is most often expected to be true. + + References + ---------- + .. [1] Kunio Tanabe and Masahiko Sagae. (1992) "An Exact Cholesky Decomposition and + the Generalized Inverse of the Variance-Covariance Matrix of the Multinomial + Distribution, with Applications" + https://doi.org/10.1111/j.2517-6161.1992.tb01875.x + """ + + def __init__(self, *, proba, proba_sum_to_1=True): + self.p = proba + q = 1 - np.cumsum(self.p, axis=1) # contiguity of p + self.proba_sum_to_1 = proba_sum_to_1 + if self.p.dtype == np.float32: + eps = 2 * np.finfo(np.float32).resolution + else: + eps = 2 * np.finfo(np.float64).resolution + if not np.allclose(q[:, -1], 0, atol=eps): + warnings.warn( + "Probabilities proba are assumed to sum to 1, but they don't.", + UserWarning, + ) + if self.proba_sum_to_1: + # If np.sum(p, axis=1) = 1 then q[:, -1] = d[:, -1] = 0. + q[:, -1] = 0.0 + # One might not need all classes for p to sum to 1, so we detect and + # correct all values close to zero. + q[q <= eps] = 0.0 + self.p[self.p <= eps] = 0.0 + d = self.p * q + # From now on, q is always used in the denominator. We handle q == 0 by + # setting q to 1 whenever q == 0 such that a division of q is a no-op in this + # case. + q[q == 0] = 1 + # And we use the inverse of q, 1/q. + self.q_inv = 1 / q + if self.proba_sum_to_1: + # If q_{i - 1} = 0, then also q_i = 0. + d[:, 1:-1] *= self.q_inv[:, :-2] # d[:, -1] = 0 anyway. + else: + d[:, 1:] *= self.q_inv[:, :-1] + self.sqrt_d = np.sqrt(d) + + def sqrt_D_Lt_matmul(self, x): + """Compute sqrt(D) L' x from the multinomial LDL' decomposition. + + L' is the transpose of L, Lij is the i-th row and j-th column of L: + Lij = -p[:, i] / q[:, j] + + For n_classes = 4, L' looks like:: text + + L' = (1 L10 L20 L30) + (0 1 L21 L31) + (0 0 1 L32) + (0 0 0 1) + + The carried out operation is matmul over n_classes (1st dimention) and + element-wise multiplication over n_samples (0th dimension): + + x_ij = sum_k sqrt_d_{i, j} L'_{i, j, k} x_{i, k} + + Parameters + ---------- + x : ndarray of shape (n_samples, n_classes) + This array is overwritten with the result. + + Return + ------ + x : ndarray of shape (n_samples, n_classes) + Input array x, filled with the result. + """ + n_classes = self.p.shape[1] + # precompute + px = np.einsum( + "ij,ij->i", + self.p[:, 1:], + x[:, 1:], + order="A", + ) + for i in range(0, n_classes - 1): # row i + # L_ij = -p_i / q_j, we need transpose L' + # for j in range(i + 1, n_classes): # column j + # x[:, i] -= self.p[:, j] / self.q[:, i] * x[:, j] + # The following is the same but faster. 
+ # px = np.einsum( + # "ij,ij->i", + # self.p[:, i + 1 : n_classes], + # x[:, i + 1 : n_classes], + # order="A", + # ) + # And using precomputed px: + x[:, i] -= px * self.q_inv[:, i] + if i < n_classes - 2: + px -= self.p[:, i + 1] * x[:, i + 1] + x *= self.sqrt_d + return x + + def L_sqrt_D_matmul(self, x): + """Compute L sqrt(D) x from the multinomial LDL' decomposition. + + L is lower triangular and given by Lij = -p[:, i] / q[:, j] + + For n_classes = 4, L looks like:: text + + L = (1 0 0 0) + (L10 1 0 0) + (L20 L21 1 0) + (L30 L31 L32 1) + + The carried out operation is matmul over n_classes (1st dimention) and + element-wise multiplication over n_samples (0th dimension): + + x_ij = sum_k L'_{i, j, k} sqrt_d_{i, k} x_{i, k} + + Parameters + ---------- + x : ndarray of shape (n_samples, n_classes) + This array is overwritten with the result. + + Return + ------ + x : ndarray of shape (n_samples, n_classes) + Input array x, filled with the result. + """ + n_classes = self.p.shape[1] + x *= self.sqrt_d + # precompute + qx = np.einsum("ij,ij->i", self.q_inv[:, :-1], x[:, :-1], order="A") + for i in range(n_classes - 1, 0, -1): # row i + # L_ij = -p_i / q_j + # for j in range(0, i): # column j + # x[:, i] -= self.p[:, i] / self.q[:, j] * x[:, j] + # The following is the same but faster. + # qx = np.einsum("ij,ij->i", self.q_inv[:, :i], x[:, :i], order="A") + # And using precomputed qx: + x[:, i] -= qx * self.p[:, i] + if i > 1: + qx -= self.q_inv[:, i - 1] * x[:, i - 1] + return x + + def inverse_L_sqrt_D_matmul(self, x): + """Compute 1/sqrt(D) L^(-1) x from the multinomial LDL' decomposition. + + L^(-1) is again lower triangular and given by: + L^(-1)_ij = f[:, i] = p[:, i] / q[:, i-1] + + For n_classes = 4, L^(-1) looks like:: text + + L^(-1) = (1 0 0 0) + (f1 1 0 0) + (f2 f2 1 0) + (f3 f3 f3 1) + + Note that (L sqrt(D))^(-1) = 1/sqrt(D) L^(-1) + + Parameters + ---------- + x : ndarray of shape (n_samples, n_classes) + This array is overwritten with the result. + + Return + ------ + x : ndarray of shape (n_samples, n_classes) + Input array x, filled with the result. + """ + n_classes = self.p.shape[1] + x_sum = np.sum(x[:, :-1], axis=1) # precomputation + for i in range(n_classes - 1, 0, -1): # row i, here i > 0. + fj = self.p[:, i] * self.q_inv[:, i - 1] + # for i = 0 we would set: fj = self.p[:, i] + + # for j in range(0, i): # column j + # x[:, i] += fj * x[:, j] + # The following is the same but faster. + # x[:, i] += fj * np.sum(x[:, :i], axis=1) + # Using precomputation + x[:, i] += fj * x_sum + if i > 1: + x_sum -= x[:, i - 1] + if self.proba_sum_to_1: + # x[:, :-1] /= self.sqrt_d[:, :-1] + mask = self.sqrt_d[:, :-1] == 0 + x[:, :-1] *= (~mask) / (self.sqrt_d[:, :-1] + mask) + # Important Note: + # Strictly speaking, the inverse of D does not exist. + # We use 0 as the inverse of 0 and just set: + # x[:, -1] = 0 + x[:, -1] = 0 + else: + x /= self.sqrt_d + return x diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index a8ecc29715886..29130cd00ba37 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -51,7 +51,7 @@ check_is_fitted, ) from ._base import BaseEstimator, LinearClassifierMixin, SparseCoefMixin -from ._glm.glm import NewtonCholeskySolver +from ._glm.glm import NEWTON_SOLVER from ._linear_loss import LinearModelLoss from ._sag import sag_solver @@ -165,11 +165,11 @@ def _logistic_regression_path( where ``g_i`` is the i-th component of the gradient. 
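[Editor's note] The closed-form factors quoted in the Multinomial_LDL_Decomposition docstring and the product helpers defined above can be sanity-checked with a short standalone script. This is an illustrative sketch, not part of the diff: the first half needs only numpy and verifies the stated L and D for a single sample; the second half assumes a build of this PR branch so that the private helper class is importable from sklearn.linear_model._linear_loss.

# Part 1 (numpy only): L D L' reproduces h = diag(p) - p p' for one sample.
import numpy as np
from numpy.testing import assert_allclose

p = np.array([0.5, 0.3, 0.2])            # one sample, probabilities sum to 1
H = np.diag(p) - np.outer(p, p)          # pointwise multinomial hessian
q = 1.0 - np.cumsum(p)                   # q_i = 1 - sum(p_k, k<=i)
q_prev = np.r_[1.0, q[:-1]]              # q_{i-1}, with q_0 = 1 in math indices
L = np.eye(3)
for i in range(3):
    for j in range(i):
        L[i, j] = -p[i] / q[j]           # L_ij = -p_i / q_j for i > j
D = np.diag(p * q / q_prev)              # D_ii = p_i * q_i / q_{i-1}
assert_allclose(L @ D @ L.T, H, atol=1e-12)

# Part 2 (this branch only): sqrt_D_Lt_matmul acts as a matrix square root of
# the pointwise hessian, i.e. ||sqrt(D) L' x||^2 == x' h x for every sample.
from sklearn.linear_model._linear_loss import Multinomial_LDL_Decomposition

rng = np.random.RandomState(0)
proba = rng.uniform(size=(5, 4))
proba /= proba.sum(axis=1, keepdims=True)
LDL = Multinomial_LDL_Decomposition(proba=proba)
x = rng.standard_normal(size=(5, 4))
C = LDL.sqrt_D_Lt_matmul(x.copy())       # operates in place, hence the copy
for i in range(5):
    H_i = np.diag(proba[i]) - np.outer(proba[i], proba[i])
    assert_allclose(C[i] @ C[i], x[i] @ H_i @ x[i], atol=1e-12)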
verbose : int, default=0 - For the liblinear and lbfgs solvers set verbose to any positive - number for verbosity. + For most solver, except for 'newton-cg', set verbose to any positive number + for verbosity. - solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ - default='lbfgs' + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'newton-lsmr', \ + 'sag', 'saga'}, default='lbfgs' Numerical solver to use. coef : array-like of shape (n_features,), default=None @@ -318,7 +318,7 @@ def _logistic_regression_path( w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype) mask = y == pos_class y_bin = np.ones(y.shape, dtype=X.dtype) - if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + if solver in ["lbfgs", "newton-cg", "newton-cholesky", "newton-lsmr"]: # HalfBinomialLoss, used for those solvers, represents y in [0, 1] instead # of in [-1, 1]. mask_classes = np.array([0, 1]) @@ -335,9 +335,9 @@ def _logistic_regression_path( sample_weight *= class_weight_[le.fit_transform(y_bin)] else: - if solver in ["sag", "saga", "lbfgs", "newton-cg"]: - # SAG, lbfgs and newton-cg multinomial solvers need LabelEncoder, - # not LabelBinarizer, i.e. y as a 1d-array of integers. + if solver in ["sag", "saga", "lbfgs", "newton-cg", "newton-lsmr"]: + # SAG, lbfgs, newton-cg and newton-lsmr multinomial solvers need + # LabelEncoder, not LabelBinarizer, i.e. y as a 1d-array of integers. # LabelEncoder also saves memory compared to LabelBinarizer, especially # when n_classes is large. le = LabelEncoder() @@ -360,7 +360,7 @@ def _logistic_regression_path( # C * sum(pointwise_loss) + penalty # instead of (as LinearModelLoss does) # mean(pointwise_loss) + 1/C * penalty - if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + if solver in ["lbfgs", "newton-cg", "newton-cholesky", "newton-lsmr"]: # This needs to be calculated after sample_weight is multiplied by # class_weight. It is even tested that passing class_weight is equivalent to # passing sample_weights according to class_weight. @@ -406,18 +406,19 @@ def _logistic_regression_path( w0[:, : coef.shape[1]] = coef if multi_class == "multinomial": + target = Y_multi + if solver in ["lbfgs", "newton-cg", "newton-lsmr"]: + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=classes.size), + fit_intercept=fit_intercept, + ) if solver in ["lbfgs", "newton-cg"]: # scipy.optimize.minimize and newton-cg accept only ravelled parameters, # i.e. 1d-arrays. LinearModelLoss expects classes to be contiguous and # reconstructs the 2d-array via w0.reshape((n_classes, -1), order="F"). # As w0 is F-contiguous, ravel(order="F") also avoids a copy. 
w0 = w0.ravel(order="F") - loss = LinearModelLoss( - base_loss=HalfMultinomialLoss(n_classes=classes.size), - fit_intercept=fit_intercept, - ) - target = Y_multi - if solver == "lbfgs": + if solver == "lbfgs": func = loss.loss_gradient elif solver == "newton-cg": func = loss.loss @@ -438,7 +439,7 @@ def _logistic_regression_path( func = loss.loss grad = loss.gradient hess = loss.gradient_hessian_product # hess = [gradient, hessp] - elif solver == "newton-cholesky": + elif solver in ("newton-cholesky", "newton-lsmr"): loss = LinearModelLoss( base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept ) @@ -479,9 +480,9 @@ def _logistic_regression_path( w0, n_iter_i = _newton_cg( hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol ) - elif solver == "newton-cholesky": + elif solver in ("newton-cholesky", "newton-lsmr"): l2_reg_strength = 1.0 / (C * sw_sum) - sol = NewtonCholeskySolver( + sol = NEWTON_SOLVER[solver]( coef=w0, linear_loss=loss, l2_reg_strength=l2_reg_strength, @@ -656,10 +657,11 @@ def _log_reg_scoring_path( through the fit method) if sample_weight is specified. verbose : int - For the liblinear and lbfgs solvers set verbose to any positive - number for verbosity. + For most solvers, except for 'newton-cg', set verbose to any positive number + for verbosity. - solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'} + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'newton-lsmr', \ + 'sag', 'saga'} Decides which solver to use. penalty : {'l1', 'l2', 'elasticnet'} @@ -892,16 +894,16 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary <random_state>` for details. - solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ - default='lbfgs' + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'newton-lsmr', \ + 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - For small datasets, 'liblinear' is a good choice, whereas 'sag' - and 'saga' are faster for large ones; - - For multiclass problems, only 'newton-cg', 'sag', 'saga' and - 'lbfgs' handle multinomial loss; + and 'saga' are faster for large ones. + - For multiclass problems, only 'newton-cg', 'newton-lsmr', 'sag', 'saga' + and 'lbfgs' handle multinomial loss. - 'liblinear' is limited to one-versus-rest schemes. - 'newton-cholesky' is a good choice for `n_samples` >> `n_features`, especially with one-hot encoded categorical features with rare @@ -909,6 +911,15 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): one-versus-rest reduction for multiclass classification. Be aware that the memory usage of this solver has a quadratic dependency on `n_features` because it explicitly computes the Hessian matrix. + - 'newton-lsmr' is a good choice for multiclass classification. + Uses Newton-Raphson steps formulated as iteratively reweighted least + squares (IRLS), which is solved by LSMR. In contrast to `newton-cholesky`, + this solver does not explicitly materialize the Hessian matrix but + instead leverages knowledge about its structure to incrementally solve + the least squares problems via a series of matrix-vector operations + where the matrices have size `(n_samples, n_features)`. + Additionally, this is numerically more stable for ill-conditioned X + compared to `newton-cholesky`. + + .. 
warning:: The choice of the algorithm depends on the penalty chosen. @@ -918,6 +929,7 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): - 'liblinear' - ['l1', 'l2'] - 'newton-cg' - ['l2', None] - 'newton-cholesky' - ['l2', None] + - 'newton-lsmr' - ['l2', None] - 'sag' - ['l2', None] - 'saga' - ['elasticnet', 'l1', 'l2', None] @@ -940,6 +952,8 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2 newton-cholesky solver. + .. versionadded:: 1.3 + newton-lsmr solver. max_iter : int, default=100 Maximum number of iterations taken for the solvers to converge. @@ -958,8 +972,8 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): Default changed from 'ovr' to 'auto' in 0.22. verbose : int, default=0 - For the liblinear and lbfgs solvers set verbose to any positive - number for verbosity. + For most solver, except for 'newton-cg', set verbose to any positive number + for verbosity. warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as @@ -1093,7 +1107,15 @@ class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): "random_state": ["random_state"], "solver": [ StrOptions( - {"lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"} + { + "lbfgs", + "liblinear", + "newton-cg", + "newton-cholesky", + "newton-lsmr", + "sag", + "saga", + } ) ], "max_iter": [Interval(Integral, 0, None, closed="left")], @@ -1282,7 +1304,7 @@ def fit(self, X, y, sample_weight=None): # and multinomial multiclass classification and use joblib only for the # one-vs-rest multiclass case. if ( - solver in ["lbfgs", "newton-cg", "newton-cholesky"] + solver in ["lbfgs", "newton-cg", "newton-cholesky", "newton-lsmr"] and len(classes_) == 1 and effective_n_jobs(self.n_jobs) == 1 ): @@ -1472,16 +1494,16 @@ class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstima that can be used, look at :mod:`sklearn.metrics`. The default scoring option used is 'accuracy'. - solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \ - default='lbfgs' + solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'newton-lsmr', \ + 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - For small datasets, 'liblinear' is a good choice, whereas 'sag' - and 'saga' are faster for large ones; - - For multiclass problems, only 'newton-cg', 'sag', 'saga' and - 'lbfgs' handle multinomial loss; + and 'saga' are faster for large ones. + - For multiclass problems, only 'newton-cg', 'newton-lsmr', 'sag', 'saga' + and 'lbfgs' handle multinomial loss. - 'liblinear' might be slower in :class:`LogisticRegressionCV` because it does not handle warm-starting. 'liblinear' is limited to one-versus-rest schemes. @@ -1500,6 +1522,7 @@ class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstima - 'liblinear' - ['l1', 'l2'] - 'newton-cg' - ['l2'] - 'newton-cholesky' - ['l2'] + - 'newton-lsmr' - ['l2'] - 'sag' - ['l2'] - 'saga' - ['elasticnet', 'l1', 'l2'] @@ -1514,6 +1537,8 @@ class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstima SAGA solver. .. versionadded:: 1.2 newton-cholesky solver. + .. versionadded:: 1.3 + newton-lsmr solver tol : float, default=1e-4 Tolerance for stopping criteria. 
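[Editor's note] A hedged end-to-end usage sketch of the solver option documented above. It assumes an installation built from this PR branch, since solver="newton-lsmr" is not available in released scikit-learn versions; the dataset and hyperparameters are arbitrary illustration choices.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
# Multinomial fit with the LSMR-based Newton solver added by this PR.
clf = LogisticRegression(solver="newton-lsmr", C=10.0, tol=1e-6, max_iter=100)
clf.fit(X, y)
print(clf.coef_.shape)  # (3, 4): one coefficient row per class
print(clf.score(X, y))

LogisticRegressionCV accepts the same solver string, as reflected by the docstring changes in this hunk.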
@@ -1542,8 +1567,8 @@ class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstima for more details. verbose : int, default=0 - For the 'liblinear', 'sag' and 'lbfgs' solvers set verbose to any - positive number for verbosity. + For most solvers, except for 'newton-cg', set verbose to any positive number + for verbosity. refit : bool, default=True If set to True, the scores are averaged across all folds, and the diff --git a/sklearn/linear_model/tests/test_linear_loss.py b/sklearn/linear_model/tests/test_linear_loss.py index 230966db1ceaf..1a343493a9008 100644 --- a/sklearn/linear_model/tests/test_linear_loss.py +++ b/sklearn/linear_model/tests/test_linear_loss.py @@ -9,6 +9,7 @@ import pytest from numpy.testing import assert_allclose from scipy import linalg, optimize +from scipy.special import logit from sklearn._loss.loss import ( HalfBinomialLoss, @@ -16,7 +17,10 @@ HalfPoissonLoss, ) from sklearn.datasets import make_low_rank_matrix -from sklearn.linear_model._linear_loss import LinearModelLoss +from sklearn.linear_model._linear_loss import ( + LinearModelLoss, + Multinomial_LDL_Decomposition, +) from sklearn.utils.extmath import squared_norm from sklearn.utils.fixes import CSR_CONTAINERS @@ -353,5 +357,293 @@ def test_multinomial_coef_shape(fit_intercept): assert_allclose(g_r, g1_r) assert_allclose(g_r, g2_r) - assert_allclose(g, g_r.reshape(loss.base_loss.n_classes, -1, order="F")) - assert_allclose(h, h_r.reshape(loss.base_loss.n_classes, -1, order="F")) + assert_allclose(g, g_r.reshape((loss.base_loss.n_classes, -1), order="F")) + assert_allclose(h, h_r.reshape((loss.base_loss.n_classes, -1), order="F")) + + +def test_multinomial_identifiability_properties(global_random_seed): + """Test that multinomial LinearModelLoss fulfills identifiability properties. + + In our symmetrical overdetermined parametrization, we always expect + np.sum(coef, axis=0) = 0. But if we add the same value cj to all classes, i.e. + coef[:, j] += cj, certain properties hold: + 1. The unpenalized loss is invariant. + 2. The unpenalized Hessian @ c gives zero. + 3. The unpenalized Gradient @ c gives zero. + """ + n_samples, n_features, n_classes = 30, 4, 6 + l2_reg_strength = 0 + rng = np.random.RandomState(global_random_seed) + X = rng.standard_normal((n_samples, n_features)) + y = rng.randint(low=0, high=n_classes, size=(n_samples)).astype(float) + coef = rng.standard_normal((n_classes, n_features)) + coef -= np.mean(coef, axis=0) + assert_allclose(np.mean(coef, axis=0), 0, atol=1e-15) + + multinomial_loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), + fit_intercept=False, + ) + gradient, hessp = multinomial_loss.gradient_hessian_product( + coef=coef, + X=X, + y=y, + l2_reg_strength=l2_reg_strength, + ) + + # Construct a coef-like array with same values for all classes, e.g.
+ # c = [[1, 2, 3], + # [1, 2, 3]] + c = np.tile(rng.standard_normal(n_features), (n_classes, 1)) + assert_allclose(hessp(c), 0, atol=1e-14) + assert_allclose(gradient.ravel() @ c.ravel(), 0, atol=1e-13) + + loss1 = multinomial_loss.loss(coef=coef, X=X, y=y, l2_reg_strength=l2_reg_strength) + loss2 = multinomial_loss.loss( + coef=coef + c, X=X, y=y, l2_reg_strength=l2_reg_strength + ) + assert_allclose(loss1, loss2) + + +def test_multinomial_LDL_decomposition_operates_inplace(global_random_seed): + n_samples, n_classes = 3, 5 + rng = np.random.RandomState(global_random_seed) + p = rng.uniform(low=0, high=1, size=(n_samples, n_classes)) + p /= np.sum(p, axis=1)[:, None] + assert_allclose(np.sum(p, axis=1), np.ones(shape=n_samples)) + + LDL = Multinomial_LDL_Decomposition(proba=p) + v = rng.standard_normal(size=(n_samples, n_classes)) + res = LDL.sqrt_D_Lt_matmul(v) + assert res is v + + v = rng.standard_normal(size=(n_samples, n_classes)) + res = LDL.L_sqrt_D_matmul(v) + assert res is v + + v = rng.standard_normal(size=(n_samples, n_classes)) + res = LDL.inverse_L_sqrt_D_matmul(v) + assert res is v + + +def test_multinomial_LDL_decomposition_binomial_single_point(): + """Test LDL' decomposition of multinomial hessian for simple cases. + + For the binomial case, we have p0 = 1 - p1 + LDL = [p0 * (1 - p0), -p0 * p1] = p0 * (1 - p0) * [1, -1] + [ -p0 * p1, p1 * (1 - p1)] [-1, 1] + + L = [ 1, 0] D = [p0 * (1 - p0), 0] + [-1, 1] [ 0, 0] + """ + p0, p1 = 0.2, 0.8 + p = np.array([[p0, p1]]) + # Note that LDL sets q = 1 whenever q = 0 for easier handling of divisions by zero. + # We compare those values to "zero", to make that clear. + zero = 1 + + LDL = Multinomial_LDL_Decomposition(proba=p) + assert_allclose(1 / LDL.q_inv[0, :], [1 - p0, zero]) + assert_allclose(LDL.sqrt_d[0, :] ** 2, [p0 * (1 - p0), 0]) + + # C = diag(D) L' x with x = [[1, 0]] (n_samples=1, n_classes=2) + C = LDL.sqrt_D_Lt_matmul(np.array([[1.0, 0.0]])) + assert_allclose(C[0, :], [np.sqrt(p0 * (1 - p0)), 0]) + + # C = diag(D) L' x with x = [[0, 1]] (n_samples=1, n_classes=2) + C = LDL.sqrt_D_Lt_matmul(np.array([[0.0, 1.0]])) + assert_allclose(C[0, :], [-np.sqrt(p1 * (1 - p1)), 0]) + + # Test with hessian product (hessp) of LinearModelLoss with X = [[1]] and coef such + # that probabilities are again (p0, p1). + # Hessian = X' LDL X + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=2), + fit_intercept=False, + ) + # Note that y has no effect on the hessian. + coef = 0.5 * np.array([[logit(p0)], [logit(p1)]]) # tested below + X = np.array([[1.0]]) + raw = X @ coef.T # raw.shape = (n_samples, n_classes) = (1, 2) + assert_allclose(loss.base_loss.predict_proba(raw), [[p0, p1]]) + C = LDL.sqrt_D_Lt_matmul(raw.copy()) + grad, hessp = loss.gradient_hessian_product( + coef=coef, + X=X, + y=np.array([0.0]), + l2_reg_strength=0.0, + ) + # Note: hessp(coef).shape = (n_classes, n_features) = (2, 1) + assert_allclose( + hessp(coef), + p0 + * (1 - p0) + * np.array([raw[0, 0] - raw[0, 1], -raw[0, 0] + raw[0, 1]])[:, None], + ) + assert_allclose(C @ C.T, coef.T @ hessp(coef)) + + +def test_multinomial_LDL_decomposition_3_classes(): + """Test LDL' decomposition of multinomial hessian for 3 classes and 2 points. 
+ + For n_classes = 3 and n_samples = 2, we have + p0 = [p0_0, p0_1] + p1 = [p1_0, p1_1] + p2 = [p2_0, p2_1] + and with 2 x 2 diagonal subblocks + LDL = [p0 * (1-p0), -p0 * p1, -p0 * p2] + [ -p0 * p1, p1 * (1-p1), -p1 * p2] + [ -p0 * p2, -p1 * p2, p2 * (1-p2)] + + L = [ 1, 0, 0] + [-p1 / (1-p0), 1, 0] + [-p2 / (1-p0), -p2 / (1-p0-p1), 1] + + D = [p0 * (1-p0), 0, 0] + [ 0, p1 * (1-p0-p1) / (1-p0), 0] + [ 0, 0, 0] + """ + n = 2 * 3 # n_samples * n_classes + p0 = np.array([0.7, 0.6]) + p1 = np.array([0.2, 0.25]) + p2 = np.array([0.1, 0.15]) + one = np.ones(2) + zero = np.zeros(2) + p0d, p1d, p2d, oned, zerod = ( + np.diag(p0), + np.diag(p1), + np.diag(p2), + np.diag(one), + np.diag(zero), + ) + H = np.block( + [ + [p0d * (oned - p0d), -p0d * p1d, -p0d * p2d], + [-p0d * p1d, p1d * (oned - p1d), -p1d * p2d], + [-p0d * p2d, -p1d * p2d, p2d * (oned - p2d)], + ] + ) + L = np.block( + [ + [oned, zerod, zerod], + [np.diag(-p1 / (one - p0)), oned, zerod], + [np.diag(-p2 / (one - p0)), np.diag(-p2 / (one - p0 - p1)), oned], + ] + ) + D = np.diag(np.r_[p0 * (1 - p0), p1 * (1 - p0 - p1) / (1 - p0), zero]) + L_inv = np.block( + [ + [oned, zerod, zerod], + [np.diag(p1 / (one - p0)), oned, zerod], + [np.diag(p2 / (one - p0 - p1)), np.diag(p2 / (one - p0 - p1)), oned], + ] + ) + D_inv = np.zeros_like(D) + D_inv[D > 0] = 1 / np.sqrt(D[D > 0]) + + lu, d, perm = linalg.ldl(H) + assert_allclose(lu @ d @ lu.T, H) + assert_allclose(L @ D @ L.T, H) + assert_allclose(L, lu) + assert_allclose(D, d, atol=1e-14) + assert_allclose(L_inv @ L, np.eye(n), atol=1e-14) + assert_allclose(L @ L_inv, np.eye(n), atol=1e-14) + + LDL = Multinomial_LDL_Decomposition(proba=np.c_[p0, p1, p2]) + v = 1.0 + np.arange(2 * 3) + DLt_v = LDL.sqrt_D_Lt_matmul(v.copy().reshape((2, 3), order="F")) + assert_allclose(DLt_v.ravel(order="F"), np.sqrt(D) @ L.T @ v) + LDL_v = LDL.L_sqrt_D_matmul(DLt_v.copy()) + assert_allclose(LDL_v.ravel(order="F"), H @ v) + + LD_v = LDL.L_sqrt_D_matmul(v.copy().reshape((2, 3), order="F")) + assert_allclose(LD_v.ravel(order="F"), L @ np.sqrt(D) @ v) + + x1 = LDL.inverse_L_sqrt_D_matmul(v.copy().reshape((2, 3), order="F")) + # This does not work as L @ D is singular. + # x2 = linalg.solve(L @ np.sqrt(D), v) + # We can just neglect the last class as it is redundant. + x2 = linalg.solve(L[:-2, :-2] @ np.sqrt(D[:-2, :-2]), v[:-2]) + assert_allclose(x1.ravel(order="F")[:-2], x2) + assert_allclose(x1.ravel(order="F"), D_inv @ L_inv @ v) + + # Test consistency of L_sqrt_D_matmul, sqrt_D_Lt_matmul and inverse_L_sqrt_D_matmul, + # i.e. reconstructing the matrices based on X @ unit_vector and compare with + # explicit matrices D and L. 
+ unit_vec = np.zeros(n) + for op, result in ( + (LDL.sqrt_D_Lt_matmul, np.sqrt(D) @ L.T), + (LDL.L_sqrt_D_matmul, L @ np.sqrt(D)), + (LDL.inverse_L_sqrt_D_matmul, D_inv @ L_inv), + ): + X = np.zeros((n, n)) + for i in range(n): + unit_vec[i] = 1 + X[:, i] = op(unit_vec.copy().reshape((2, 3), order="F")).ravel(order="F") + unit_vec[i] = 0 + assert_allclose(X, result) + + +def test_multinomial_LDL_decomposition(global_random_seed): + """Test LDL' decomposition of multinomial hessian.""" + n_samples, n_features, n_classes = 3, 4, 5 + rng = np.random.RandomState(global_random_seed) + X = rng.standard_normal(size=(n_samples, n_features)) + coef = rng.standard_normal(size=(n_classes, n_features)) + raw_prediction = X @ coef.T # shape = (n_samples, n_classes) + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), + fit_intercept=False, + ) + p = loss.base_loss.predict_proba(raw_prediction=raw_prediction) + assert_allclose(np.sum(p, axis=1), np.ones(shape=n_samples)) + + LDL = Multinomial_LDL_Decomposition(proba=p) + # Note that LDL sets q = 1 whenever q = 0 for easier handling of divisions by zero. + # As q[:, -1] = 0 if p sums to 1, we do not compare the last values corresponding + # to the last class. + assert_allclose(1 / LDL.q_inv[:, :-1], 1 - np.cumsum(LDL.p, axis=1)[:, :-1]) + + # C = diag(D) L' x with x = X @ coef = raw_prediction + C = LDL.sqrt_D_Lt_matmul(raw_prediction.copy()) + + # Note that y has no effect on the hessian. + grad, hessp = loss.gradient_hessian_product( + coef=coef, + X=X, + y=rng.randint(low=0, high=n_classes, size=n_samples).astype(X.dtype), + l2_reg_strength=0.0, + ) + # C.shape = (n_samples, n_classes), hessp(coef).shape = (n_classes, n_features) + # LinearModelLoss normalizes loss as 1 / n_samples * sum(loss_i), therefore, we + # need to devide C'C by 1/n_samples. + assert_allclose( + np.tensordot(C, C, axes=2) / n_samples, np.tensordot(coef, hessp(coef), axes=2) + ) + CtC = LDL.L_sqrt_D_matmul(C.copy()) + CtC = np.tensordot(raw_prediction, CtC, axes=2) + assert_allclose(CtC, np.tensordot(C, C, axes=2)) + + +def test_multinomial_LDL_inverse_sqrt_D_Lt_matmul(global_random_seed): + """Test inverse of LDL' decomposition of multinomial hessian.""" + n_samples, n_classes = 4, 5 + rng = np.random.RandomState(global_random_seed) + p = rng.uniform(low=0, high=1, size=(n_samples, n_classes)) + p /= np.sum(p, axis=1)[:, None] + LDL = Multinomial_LDL_Decomposition(proba=p) + x = rng.standard_normal(size=(n_samples, n_classes)) + # Without centering, the last class, x[:, -1] would fail in the assert below. + x -= np.mean(x, axis=1)[:, None] + + invDL_x = LDL.inverse_L_sqrt_D_matmul(x.copy()) + LD_invDL_x = LDL.L_sqrt_D_matmul(invDL_x.copy()) + assert_allclose(LD_invDL_x, x) + + +def test_multinomial_LDL_warning(): + """Test that LDL warns if probabilities do not sum to 1.""" + p = np.arange(4, dtype=float).reshape((2, 2)) + msg = "Probabilities proba are assumed to sum to 1, but they don't." 
+ with pytest.warns(UserWarning, match=msg): + Multinomial_LDL_Decomposition(proba=p, proba_sum_to_1=True) diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 30625ea9f2bef..01948a7f47b46 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -49,7 +49,15 @@ LogisticRegressionCV = partial(LogisticRegressionCVDefault, random_state=0) -SOLVERS = ("lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga") +SOLVERS = ( + "lbfgs", + "liblinear", + "newton-cg", + "newton-cholesky", + "newton-lsmr", + "sag", + "saga", +) X = [[-1, 0], [0, 1], [1, 1]] Y1 = [0, 1, 1] Y2 = [2, 1, 0] @@ -165,6 +173,9 @@ def test_predict_3_classes(csr_container): LogisticRegression( C=len(iris.data), solver="newton-cholesky", multi_class="ovr" ), + LogisticRegression( + C=len(iris.data), solver="newton-lsmr", multi_class="multinomial" + ), ], ) def test_predict_iris(clf): @@ -207,12 +218,12 @@ def test_check_solver_option(LR): lr.fit(X, y) # all solvers except 'liblinear' and 'saga' - for solver in ["lbfgs", "newton-cg", "newton-cholesky", "sag"]: + for solver in ["lbfgs", "newton-cg", "newton-cholesky", "newton-lsmr", "sag"]: msg = "Solver %s supports only 'l2' or None penalties," % solver lr = LR(solver=solver, penalty="l1", multi_class="ovr") with pytest.raises(ValueError, match=msg): lr.fit(X, y) - for solver in ["lbfgs", "newton-cg", "newton-cholesky", "sag", "saga"]: + for solver in set(SOLVERS) - set(["liblinear"]): msg = "Solver %s supports only dual=False, got dual=True" % solver lr = LR(solver=solver, dual=True, multi_class="ovr") with pytest.raises(ValueError, match=msg): @@ -393,7 +404,7 @@ def test_consistency_path(): ) # test for fit_intercept=True - for solver in ("lbfgs", "newton-cg", "newton-cholesky", "liblinear", "sag", "saga"): + for solver in set(SOLVERS): Cs = [1e3] coefs, Cs, _ = f(_logistic_regression_path)( X, @@ -767,6 +778,8 @@ def test_logistic_regressioncv_class_weights(weight, class_weight, global_random clf.set_params( tol=1e-18, max_iter=10000, random_state=global_random_seed + 1 ) + elif solver == "newton-lsmr": + clf.set_params(tol=1e-6) clf.fit(X, y) assert_allclose( @@ -795,7 +808,7 @@ def test_logistic_regression_sample_weights(): assert_allclose(clf_sw_none.coef_, clf_sw_ones.coef_, rtol=1e-4) # Test that sample weights work the same with the lbfgs, - # newton-cg, newton-cholesky and 'sag' solvers + # newton-cg, newton-cholesky, newton-lsmr and sag solvers clf_sw_lbfgs = LR(**kw, tol=1e-5) clf_sw_lbfgs.fit(X, y, sample_weight=sample_weight) for solver in set(SOLVERS) - set(("lbfgs", "saga")): @@ -1331,7 +1344,7 @@ def test_saga_vs_liblinear(csr_container): @pytest.mark.parametrize("multi_class", ["ovr", "multinomial"]) @pytest.mark.parametrize( - "solver", ["liblinear", "newton-cg", "newton-cholesky", "saga"] + "solver", ["liblinear", "newton-cg", "newton-cholesky", "newton-lsmr", "saga"] ) @pytest.mark.parametrize("fit_intercept", [False, True]) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS)