WIP to be continued · scikit-learn/scikit-learn@fc0f0e7 · GitHub

Commit fc0f0e7

WIP to be continued
1 parent ee5d94e commit fc0f0e7

4 files changed: +115 -62 lines changed


sklearn/linear_model/_glm/glm.py

Lines changed: 16 additions & 13 deletions
@@ -207,10 +207,10 @@ def fit(self, X, y, sample_weight=None):
         loss_dtype = min(max(y.dtype, X.dtype), np.float64)
         y = check_array(y, dtype=loss_dtype, order="C", ensure_2d=False)
 
-        # TODO: We could support samples_weight=None as the losses support it.
-        # Note that _check_sample_weight calls check_array(order="C") required by
-        # losses.
-        sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype)
+        if sample_weight is not None:
+            # Note that _check_sample_weight calls check_array(order="C") required by
+            # losses.
+            sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype)
 
         n_samples, n_features = X.shape
         self._base_loss = self._get_loss()
@@ -228,17 +228,20 @@ def fit(self, X, y, sample_weight=None):
 
         # TODO: if alpha=0 check that X is not rank deficient
 
-        # IMPORTANT NOTE: Rescaling of sample_weight:
+        # NOTE: Rescaling of sample_weight:
         # We want to minimize
-        #     obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance)
+        #     obj = 1/(2 * sum(sample_weight)) * sum(sample_weight * deviance)
         #           + 1/2 * alpha * L2,
         # with
         #     deviance = 2 * loss.
         # The objective is invariant to multiplying sample_weight by a constant. We
-        # choose this constant such that sum(sample_weight) = 1. Thus, we end up with
+        # could choose this constant such that sum(sample_weight) = 1 in order to end
+        # up with
         #     obj = sum(sample_weight * loss) + 1/2 * alpha * L2.
-        # Note that LinearModelLoss.loss() computes sum(sample_weight * loss).
-        sample_weight = sample_weight / sample_weight.sum()
+        # But LinearModelLoss.loss() already computes
+        #     average(loss, weights=sample_weight)
+        # Thus, without rescaling, we have
+        #     obj = LinearModelLoss.loss(...)
 
         if self.warm_start and hasattr(self, "coef_"):
             if self.fit_intercept:
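Aside (not part of the commit): the explicit rescaling can be dropped because a weighted average already carries the 1 / sum(sample_weight) factor, and the objective is invariant under rescaling the weights. A minimal numpy sketch of that equivalence, with made-up values standing in for the pointwise losses and for the fixed L2 term at a given coef:

    import numpy as np

    rng = np.random.default_rng(0)
    per_sample_loss = rng.random(10)           # stand-in for pointwise losses
    sample_weight = rng.uniform(0.5, 2.0, 10)
    penalty = 0.3                              # stand-in for 1/2 * alpha * ||coef||^2

    # Old convention: normalize the weights to sum to 1, then take a weighted sum.
    sw_rescaled = sample_weight / sample_weight.sum()
    obj_old = np.sum(sw_rescaled * per_sample_loss) + penalty

    # New convention: weighted average of the raw losses, no rescaling step.
    obj_new = np.average(per_sample_loss, weights=sample_weight) + penalty

    assert np.isclose(obj_old, obj_new)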
@@ -415,10 +418,10 @@ def score(self, X, y, sample_weight=None):
                 f" {base_loss.__name__}."
             )
 
-        # Note that constant_to_optimal_zero is already multiplied by sample_weight.
-        constant = np.mean(base_loss.constant_to_optimal_zero(y_true=y))
-        if sample_weight is not None:
-            constant *= sample_weight.shape[0] / np.sum(sample_weight)
+        constant = np.average(
+            base_loss.constant_to_optimal_zero(y_true=y, sample_weight=None),
+            weights=sample_weight,
+        )
 
         # Missing factor of 2 in deviance cancels out.
         deviance = base_loss(
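Aside (not from the diff, just the numpy behavior the new score() code relies on): np.average with weights=None reduces to a plain mean, so the old two-step "mean, then rescale when weighted" pattern collapses into a single call.

    import numpy as np

    values = np.array([1.0, 2.0, 4.0])
    sample_weight = np.array([1.0, 1.0, 2.0])

    assert np.isclose(np.average(values, weights=None), np.mean(values))
    assert np.isclose(
        np.average(values, weights=sample_weight),
        np.sum(sample_weight * values) / np.sum(sample_weight),
    )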

sklearn/linear_model/_linear_loss.py

Lines changed: 28 additions & 15 deletions
@@ -12,18 +12,19 @@ class LinearModelLoss:
 
     Note that raw_prediction is also known as linear predictor.
 
-    The loss is the sum of per sample losses and includes a term for L2
+    The loss is the average of per sample losses and includes a term for L2
     regularization::
 
-        loss = sum_i s_i loss(y_i, X_i @ coef + intercept)
+        loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept)
                + 1/2 * l2_reg_strength * ||coef||_2^2
 
-    with sample weights s_i=1 if sample_weight=None.
+    with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i.
 
     Gradient and hessian, for simplicity without intercept, are::
 
-        gradient = X.T @ loss.gradient + l2_reg_strength * coef
-        hessian = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity
+        gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef
+        hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X
+                  + l2_reg_strength * identity
 
     Conventions:
         if fit_intercept:
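Aside (illustrative only, not the sklearn implementation): the documented convention for the binary, no-intercept case translates directly into numpy. The per_sample_* arrays below are made-up stand-ins for what base_loss would return at a given raw_prediction.

    import numpy as np

    rng = np.random.default_rng(0)
    n_samples, n_features = 8, 3
    X = rng.standard_normal((n_samples, n_features))
    coef = rng.standard_normal(n_features)
    sample_weight = rng.uniform(0.5, 2.0, n_samples)
    l2_reg_strength = 0.1
    s_sum = sample_weight.sum()

    per_sample_loss = rng.random(n_samples)
    per_sample_gradient = rng.standard_normal(n_samples)
    per_sample_hessian = rng.random(n_samples)

    # loss = 1 / s_sum * sum_i s_i loss_i + 1/2 * l2_reg_strength * ||coef||^2
    loss = (
        np.sum(sample_weight * per_sample_loss) / s_sum
        + 0.5 * l2_reg_strength * coef @ coef
    )
    # gradient = 1 / s_sum * X.T @ (s_i-weighted pointwise gradients) + l2_reg_strength * coef
    gradient = (
        X.T @ (sample_weight * per_sample_gradient) / s_sum + l2_reg_strength * coef
    )
    # hessian = 1 / s_sum * X.T @ diag(s_i-weighted pointwise hessians) @ X + l2_reg_strength * I
    hessian = (
        X.T @ (sample_weight[:, None] * per_sample_hessian[:, None] * X) / s_sum
        + l2_reg_strength * np.eye(n_features)
    )
    print(loss, gradient.shape, hessian.shape)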
@@ -182,7 +183,7 @@ def loss(
         n_threads=1,
         raw_prediction=None,
     ):
-        """Compute the loss as sum over point-wise losses.
+        """Compute the loss as weighted average over point-wise losses.
 
         Parameters
         ----------
@@ -209,7 +210,7 @@ def loss(
         Returns
         -------
         loss : float
-            Sum of losses per sample plus penalty.
+            Weighted average of losses per sample, plus penalty.
         """
         if raw_prediction is None:
             weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
@@ -219,10 +220,10 @@ def loss(
         loss = self.base_loss.loss(
             y_true=y,
             raw_prediction=raw_prediction,
-            sample_weight=sample_weight,
+            sample_weight=None,
             n_threads=n_threads,
         )
-        loss = loss.sum()
+        loss = np.average(loss, weights=sample_weight)
 
         return loss + self.l2_penalty(weights, l2_reg_strength)
 
@@ -263,12 +264,12 @@ def loss_gradient(
         Returns
         -------
         loss : float
-            Sum of losses per sample plus penalty.
+            Weighted average of losses per sample, plus penalty.
 
         gradient : ndarray of shape coef.shape
             The gradient of the loss.
         """
-        n_features, n_classes = X.shape[1], self.base_loss.n_classes
+        (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes
         n_dof = n_features + int(self.fit_intercept)
 
         if raw_prediction is None:
@@ -282,9 +283,12 @@ def loss_gradient(
             sample_weight=sample_weight,
             n_threads=n_threads,
         )
-        loss = loss.sum()
+        sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)
+        loss = loss.sum() / sw_sum
         loss += self.l2_penalty(weights, l2_reg_strength)
 
+        grad_pointwise /= sw_sum
+
         if not self.base_loss.is_multiclass:
             grad = np.empty_like(coef, dtype=weights.dtype)
             grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
@@ -340,7 +344,7 @@ def gradient(
         gradient : ndarray of shape coef.shape
             The gradient of the loss.
         """
-        n_features, n_classes = X.shape[1], self.base_loss.n_classes
+        (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes
         n_dof = n_features + int(self.fit_intercept)
 
         if raw_prediction is None:
@@ -354,6 +358,8 @@ def gradient(
             sample_weight=sample_weight,
             n_threads=n_threads,
         )
+        sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)
+        grad_pointwise /= sw_sum
 
         if not self.base_loss.is_multiclass:
             grad = np.empty_like(coef, dtype=weights.dtype)
@@ -439,6 +445,9 @@ def gradient_hessian(
             sample_weight=sample_weight,
             n_threads=n_threads,
         )
+        sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)
+        grad_pointwise /= sw_sum
+        hess_pointwise /= sw_sum
 
         # For non-canonical link functions and far away from the optimum, the pointwise
         # hessian can be negative. We take care that 75% of the hessian entries are
@@ -543,6 +552,7 @@ def gradient_hessian_product(
         (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes
         n_dof = n_features + int(self.fit_intercept)
         weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
+        sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)
 
         if not self.base_loss.is_multiclass:
             grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian(
@@ -551,6 +561,8 @@ def gradient_hessian_product(
                 sample_weight=sample_weight,
                 n_threads=n_threads,
             )
+            grad_pointwise /= sw_sum
+            hess_pointwise /= sw_sum
             grad = np.empty_like(coef, dtype=weights.dtype)
             grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
             if self.fit_intercept:
@@ -603,6 +615,7 @@ def hessp(s):
                 sample_weight=sample_weight,
                 n_threads=n_threads,
             )
+            grad_pointwise /= sw_sum
             grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
             grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
             if self.fit_intercept:
@@ -644,9 +657,9 @@ def hessp(s):
                 # hess_prod = empty_like(grad), but we ravel grad below and this
                 # function is run after that.
                 hess_prod = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
-                hess_prod[:, :n_features] = tmp.T @ X + l2_reg_strength * s
+                hess_prod[:, :n_features] = (tmp.T @ X) / sw_sum + l2_reg_strength * s
                 if self.fit_intercept:
-                    hess_prod[:, -1] = tmp.sum(axis=0)
+                    hess_prod[:, -1] = tmp.sum(axis=0) / sw_sum
                 if coef.ndim == 1:
                     return hess_prod.ravel(order="F")
                 else:
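Aside (a self-contained check using a toy half squared error, not sklearn's loss classes): one can verify numerically that dividing the accumulated pointwise gradient by sw_sum is exactly the gradient of the weighted-average objective above.

    import numpy as np
    from scipy.optimize import check_grad

    rng = np.random.default_rng(0)
    n_samples, n_features = 20, 4
    X = rng.standard_normal((n_samples, n_features))
    y = rng.standard_normal(n_samples)
    sample_weight = rng.uniform(0.5, 2.0, n_samples)
    l2_reg_strength = 0.1
    sw_sum = sample_weight.sum()

    def objective(coef):
        raw = X @ coef
        per_sample = 0.5 * (y - raw) ** 2
        penalty = 0.5 * l2_reg_strength * coef @ coef
        return np.average(per_sample, weights=sample_weight) + penalty

    def gradient(coef):
        raw = X @ coef
        grad_pointwise = sample_weight * (raw - y)  # weighted pointwise gradients
        grad_pointwise /= sw_sum                    # the 1 / sw_sum scaling added above
        return X.T @ grad_pointwise + l2_reg_strength * coef

    # check_grad returns the norm of the difference to a finite-difference gradient.
    print(check_grad(objective, gradient, rng.standard_normal(n_features)))  # ~1e-7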

sklearn/linear_model/_logistic.py

Lines changed: 25 additions & 28 deletions
@@ -292,33 +292,27 @@ def _logistic_regression_path(
     # np.unique(y) gives labels in sorted order.
     pos_class = classes[1]
 
-    # If sample weights exist, convert them to array (support for lists)
-    # and check length
-    # Otherwise set them to 1 for all examples
-    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True)
-
-    if solver == "newton-cholesky":
-        # IMPORTANT NOTE: Rescaling of sample_weight:
-        # Same as in _GeneralizedLinearRegressor.fit().
-        # We want to minimize
-        #     obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance)
-        #           + 1/2 * alpha * L2,
-        # with
-        #     deviance = 2 * log_loss.
-        # The objective is invariant to multiplying sample_weight by a constant. We
-        # choose this constant such that sum(sample_weight) = 1. Thus, we end up with
-        #     obj = sum(sample_weight * loss) + 1/2 * alpha * L2.
-        # Note that LinearModelLoss.loss() computes sum(sample_weight * loss).
-        #
-        # This rescaling has to be done before multiplying by class_weights.
-        sw_sum = sample_weight.sum()  # needed to rescale penalty, nasty matter!
-        sample_weight = sample_weight / sw_sum
+    if sample_weight is not None or class_weight is not None:
+        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True)
+    # IMPORTANT NOTE:
+    # All solvers relying on LinearModelLoss need to scale the penalty with n_samples
+    # or the sum of sample weights as the here implemented logistic regression
+    # objective is (unfortunately)
+    #     C * sum(pointwise_loss) + penalty
+    # instead of (as LinearModelLoss does)
+    #     mean(pointwise_loss) + 1/C * penalty
+    if solver in ["lbfgs", "newton-cg", "newton-cholesky"]:
+        # This needs to be calculated before sample_weight is multiplied by
+        # class_weight.
+        sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)
 
     # If class_weights is a dict (provided by the user), the weights
     # are assigned to the original labels. If it is "balanced", then
     # the class_weights are assigned after masking the labels with a OvR.
     le = LabelEncoder()
-    if isinstance(class_weight, dict) or multi_class == "multinomial":
+    if isinstance(class_weight, dict) or (
+        multi_class == "multinomial" and class_weight is not None
+    ):
         class_weight_ = compute_class_weight(class_weight, classes=classes, y=y)
         sample_weight *= class_weight_[le.fit_transform(y)]
 
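Aside (made-up numbers, not part of the commit): the comment above is why every LinearModelLoss-based solver now uses l2_reg_strength = 1.0 / (C * sw_sum). Dividing the legacy objective C * sum(sample_weight * loss) + 0.5 * ||w||^2 by the positive constant C * sw_sum leaves the minimizer unchanged and yields the mean-style objective that LinearModelLoss computes:

    import numpy as np

    rng = np.random.default_rng(0)
    pointwise_loss = rng.random(50)           # stand-in for per-sample log loss at some w
    sample_weight = rng.uniform(0.5, 2.0, 50)
    w = rng.standard_normal(3)
    C = 0.7
    sw_sum = sample_weight.sum()

    legacy = C * np.sum(sample_weight * pointwise_loss) + 0.5 * w @ w
    rescaled = (
        np.average(pointwise_loss, weights=sample_weight)
        + 1.0 / (C * sw_sum) * 0.5 * w @ w
    )

    assert np.isclose(legacy / (C * sw_sum), rescaled)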

@@ -445,7 +439,7 @@ def _logistic_regression_path(
     n_iter = np.zeros(len(Cs), dtype=np.int32)
     for i, C in enumerate(Cs):
         if solver == "lbfgs":
-            l2_reg_strength = 1.0 / C
+            l2_reg_strength = 1.0 / (C * sw_sum)
             iprint = [-1, 50, 1, 100, 101][
                 np.searchsorted(np.array([0, 1, 2, 3]), verbose)
             ]
@@ -455,7 +449,12 @@ def _logistic_regression_path(
                 method="L-BFGS-B",
                 jac=True,
                 args=(X, target, sample_weight, l2_reg_strength, n_threads),
-                options={"iprint": iprint, "gtol": tol, "maxiter": max_iter},
+                options={
+                    "iprint": iprint,
+                    "gtol": tol,
+                    "maxiter": max_iter,
+                    "ftol": 64 * np.finfo(float).eps,
+                },
             )
             n_iter_i = _check_optimize_result(
                 solver,
@@ -465,15 +464,13 @@ def _logistic_regression_path(
             )
             w0, loss = opt_res.x, opt_res.fun
         elif solver == "newton-cg":
-            l2_reg_strength = 1.0 / C
+            l2_reg_strength = 1.0 / (C * sw_sum)
             args = (X, target, sample_weight, l2_reg_strength, n_threads)
             w0, n_iter_i = _newton_cg(
                 hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol
             )
         elif solver == "newton-cholesky":
-            # The division by sw_sum is a consequence of the rescaling of
-            # sample_weight, see comment above.
-            l2_reg_strength = 1.0 / C / sw_sum
+            l2_reg_strength = 1.0 / (C * sw_sum)
             sol = NewtonCholeskySolver(
                 coef=w0,
                 linear_loss=loss,
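Aside on the new "ftol" entry (an assumption about intent, not stated in the commit): "ftol" is a standard option of scipy's L-BFGS-B method, and setting it to 64 * np.finfo(float).eps tightens the stopping criterion on relative objective decrease far below its default, so convergence is governed mainly by "gtol"/tol. A minimal sketch on a toy quadratic:

    import numpy as np
    from scipy.optimize import minimize

    res = minimize(
        lambda w: np.sum((w - 1.0) ** 2),
        x0=np.zeros(3),
        method="L-BFGS-B",
        options={"gtol": 1e-8, "maxiter": 100, "ftol": 64 * np.finfo(float).eps},
    )
    print(res.x)  # close to [1.0, 1.0, 1.0]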

sklearn/linear_model/tests/test_logistic.py

Lines changed: 46 additions & 6 deletions
@@ -702,14 +702,17 @@ def test_logistic_regression_solvers_multiclass():
     }
 
     for solver_1, solver_2 in itertools.combinations(regressors, r=2):
-        assert_array_almost_equal(
-            regressors[solver_1].coef_, regressors[solver_2].coef_, decimal=4
+        assert_allclose(
+            regressors[solver_1].coef_,
+            regressors[solver_2].coef_,
+            rtol=5e-3 if solver_2 == "saga" else 1e-3,
+            err_msg=f"{solver_1} vs {solver_2}",
         )
 
 
 @pytest.mark.parametrize("weight", [{0: 0.1, 1: 0.2}, {0: 0.1, 1: 0.2, 2: 0.5}])
 @pytest.mark.parametrize("class_weight", ["weight", "balanced"])
-def test_logistic_regressioncv_class_weights(weight, class_weight):
+def test_logistic_regressioncv_class_weights(weight, class_weight, global_random_seed):
     """Test class_weight for LogisticRegressionCV."""
     n_classes = len(weight)
     if class_weight == "weight":
@@ -722,23 +725,60 @@ def test_logistic_regressioncv_class_weights(weight, class_weight):
         n_informative=3,
         n_redundant=0,
         n_classes=n_classes,
-        random_state=0,
+        random_state=global_random_seed,
     )
     params = dict(
         Cs=1,
         fit_intercept=False,
         multi_class="ovr",
         class_weight=class_weight,
+        tol=1e-8,
     )
     clf_lbfgs = LogisticRegressionCV(solver="lbfgs", **params)
     clf_lbfgs.fit(X, y)
 
+    from sklearn.linear_model._linear_loss import LinearModelLoss
+    from sklearn._loss.loss import HalfMultinomialLoss, HalfBinomialLoss
+
+    if n_classes > 2:
+        loss = LinearModelLoss(
+            base_loss=HalfMultinomialLoss(n_classes=n_classes),
+            fit_intercept=False,
+        )
+    else:
+        loss = LinearModelLoss(
+            base_loss=HalfBinomialLoss(),
+            fit_intercept=False,
+        )
+    l_lbfgs = loss.loss(
+        coef=clf_lbfgs.coef_.squeeze(),
+        X=X,
+        y=LabelEncoder().fit_transform(y).astype(float),
+        sample_weight=None,
+        l2_reg_strength=1 / 20,
+    )
+    print(f"loss lbfgs = {l_lbfgs} C_={clf_lbfgs.C_}")
+
     for solver in set(SOLVERS) - set(["lbfgs"]):
         clf = LogisticRegressionCV(solver=solver, **params)
         if solver in ("sag", "saga"):
-            clf.set_params(tol=1e-5, max_iter=10000, random_state=0)
+            clf.set_params(
+                tol=1e-18, max_iter=10000, random_state=global_random_seed + 1
+            )
         clf.fit(X, y)
-        assert_allclose(clf.coef_, clf_lbfgs.coef_, rtol=1e-3)
+
+        l_solver = loss.loss(
+            coef=clf.coef_.squeeze(),
+            X=X,
+            y=LabelEncoder().fit_transform(y).astype(float),
+            sample_weight=None,
+            l2_reg_strength=1 / 20,
+        )
+        print(f"loss {solver} = {l_solver} C_={clf.C_}")
+
+        assert_allclose(
+            clf.coef_, clf_lbfgs.coef_, rtol=1e-3, err_msg=f"{solver} vs lbfgs"
        )
 
 
 def test_logistic_regression_sample_weights():
