FIX Sample weight in BayesianRidge by antoinebaker · Pull Request #30644 · scikit-learn/scikit-learn · GitHub

FIX Sample weight in BayesianRidge #30644

Merged
@@ -0,0 +1,3 @@
- The update and initialization of the hyperparameters now properly handle
sample weights in :class:`linear_model.BayesianRidge`.
By :user:`Antoine Baker <antoinebaker>`.
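
For context, here is a minimal standalone sketch (not part of the patch) of the weighted statistics this diff introduces at the start of `fit`: the weight total `sw_sum` replaces `n_samples`, and the variance used to initialize the noise precision `alpha_` becomes a weighted variance of `y`. Variable names mirror the diff; the toy data is purely illustrative.

import numpy as np

# toy data and weights (illustrative only)
rng = np.random.default_rng(0)
y = rng.normal(size=10)
sample_weight = rng.uniform(0.5, 2.0, size=10)

# unweighted defaults
sw_sum = y.shape[0]  # n_samples
y_var = y.var()

# weighted counterparts used when sample_weight is given
sw_sum = sample_weight.sum()
y_mean = np.average(y, weights=sample_weight)
y_var = np.average((y - y_mean) ** 2, weights=sample_weight)

# initial noise precision, guarded against a zero variance
eps = np.finfo(np.float64).eps
alpha_ = 1.0 / (y_var + eps)
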
62 changes: 40 additions & 22 deletions sklearn/linear_model/_bayes.py
@@ -244,9 +244,15 @@ def fit(self, X, y, sample_weight=None):
y_numeric=True,
)
dtype = X.dtype
n_samples, n_features = X.shape

sw_sum = n_samples
y_var = y.var()
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X, dtype=dtype)
sw_sum = sample_weight.sum()
y_mean = np.average(y, weights=sample_weight)
y_var = np.average((y - y_mean) ** 2, weights=sample_weight)

X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
X,
@@ -262,16 +268,14 @@ def fit(self, X, y, sample_weight=None):

self.X_offset_ = X_offset_
self.X_scale_ = X_scale_
n_samples, n_features = X.shape

# Initialization of the values of the parameters
eps = np.finfo(np.float64).eps
# Add `eps` in the denominator to omit division by zero if `np.var(y)`
# is zero
# Add `eps` in the denominator to omit division by zero
alpha_ = self.alpha_init
lambda_ = self.lambda_init
if alpha_ is None:
alpha_ = 1.0 / (np.var(y) + eps)
alpha_ = 1.0 / (y_var + eps)
if lambda_ is None:
lambda_ = 1.0

@@ -295,21 +299,28 @@ def fit(self, X, y, sample_weight=None):
# Convergence loop of the bayesian ridge regression
for iter_ in range(self.max_iter):
# update posterior mean coef_ based on alpha_ and lambda_ and
# compute corresponding rmse
coef_, rmse_ = self._update_coef_(
# compute corresponding sse (sum of squared errors)
coef_, sse_ = self._update_coef_(
X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
)
if self.compute_score:
# compute the log marginal likelihood
s = self._log_marginal_likelihood(
n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
n_samples,
n_features,
sw_sum,
eigen_vals_,
alpha_,
lambda_,
coef_,
sse_,
)
self.scores_.append(s)

# Update alpha and lambda according to (MacKay, 1992)
gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2)
alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)
alpha_ = (sw_sum - gamma_ + 2 * alpha_1) / (sse_ + 2 * alpha_2)

# Check for convergence
if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
@@ -324,13 +335,20 @@ def fit(self, X, y, sample_weight=None):
# log marginal likelihood and posterior covariance
self.alpha_ = alpha_
self.lambda_ = lambda_
self.coef_, rmse_ = self._update_coef_(
self.coef_, sse_ = self._update_coef_(
X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
)
if self.compute_score:
# compute the log marginal likelihood
s = self._log_marginal_likelihood(
n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
n_samples,
n_features,
sw_sum,
eigen_vals_,
alpha_,
lambda_,
coef_,
sse_,
)
self.scores_.append(s)
self.scores_ = np.array(self.scores_)
@@ -378,7 +396,7 @@ def predict(self, X, return_std=False):
def _update_coef_(
self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
):
"""Update posterior mean and compute corresponding rmse.
"""Update posterior mean and compute corresponding sse (sum of squared errors).

Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where
scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)
@@ -394,12 +412,14 @@ def _update_coef_(
[X.T, U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T, y]
)

rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
# Note: we do not need to explicitly use the weights in this sum because
# y and X were preprocessed by _rescale_data to handle the weights.
sse_ = np.sum((y - np.dot(X, coef_)) ** 2)

return coef_, rmse_
return coef_, sse_

def _log_marginal_likelihood(
self, n_samples, n_features, eigen_vals, alpha_, lambda_, coef, rmse
self, n_samples, n_features, sw_sum, eigen_vals, alpha_, lambda_, coef, sse
):
"""Log marginal likelihood."""
alpha_1 = self.alpha_1
@@ -421,11 +441,11 @@ def _log_marginal_likelihood(
score += alpha_1 * log(alpha_) - alpha_2 * alpha_
score += 0.5 * (
n_features * log(lambda_)
+ n_samples * log(alpha_)
- alpha_ * rmse
+ sw_sum * log(alpha_)
- alpha_ * sse
- lambda_ * np.sum(coef**2)
+ logdet_sigma
- n_samples * log(2 * np.pi)
- sw_sum * log(2 * np.pi)
)

return score
@@ -684,14 +704,12 @@ def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_):
coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)

# Update alpha and lambda
rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
sse_ = np.sum((y - np.dot(X, coef_)) ** 2)
gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_)
lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / (
(coef_[keep_lambda]) ** 2 + 2.0 * lambda_2
)
alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (
rmse_ + 2.0 * alpha_2
)
alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (sse_ + 2.0 * alpha_2)

# Prune the weights with a precision over a threshold
keep_lambda = lambda_ < self.threshold_lambda
@@ -706,7 +724,7 @@ def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_):
+ n_samples * log(alpha_)
+ np.sum(np.log(lambda_))
)
s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_**2).sum())
s -= 0.5 * (alpha_ * sse_ + (lambda_ * coef_**2).sum())
self.scores_.append(s)

# Check for convergence
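
To summarize the two ideas carried through the `_bayes.py` changes above, here is an illustrative sketch (not part of the changeset). It assumes, as the comment in `_update_coef_` indicates, that preprocessing rescales the rows of `X` and `y` by the square root of the sample weights, so a plain sum of squared errors on the rescaled data equals the weighted SSE. It also restates the MacKay (1992) style hyperparameter updates with `sw_sum` in place of `n_samples`; the same substitution appears in the log marginal likelihood, where `sw_sum * log(alpha_)` and `sw_sum * log(2 * pi)` replace the former `n_samples` terms. The numeric values below are placeholders.

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))
y = rng.normal(size=20)
w = rng.uniform(0.5, 2.0, size=20)
coef = rng.normal(size=3)

# Rescaling rows by sqrt(w) makes a plain sum of squared errors on the
# rescaled data equal to the weighted sum of squared errors.
Xw = X * np.sqrt(w)[:, None]
yw = y * np.sqrt(w)
sse_on_rescaled = np.sum((yw - Xw @ coef) ** 2)
sse_weighted = np.sum(w * (y - X @ coef) ** 2)
assert np.isclose(sse_on_rescaled, sse_weighted)

# MacKay-style updates with the weight total replacing n_samples
# (gamma_ and the alpha_*/lambda_* priors are placeholder values here).
gamma_ = 1.5
alpha_1 = alpha_2 = lambda_1 = lambda_2 = 1e-6
sw_sum = w.sum()
sse_ = sse_weighted
lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef**2) + 2 * lambda_2)
alpha_ = (sw_sum - gamma_ + 2 * alpha_1) / (sse_ + 2 * alpha_2)
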
9 changes: 0 additions & 9 deletions sklearn/utils/_test_common/instance_generator.py
@@ -836,15 +836,6 @@ def _yield_instances_for_check(check, estimator_orig):
"sample_weight is not equivalent to removing/repeating samples."
),
},
BayesianRidge: {
# TODO: fix sample_weight handling of this estimator, see meta-issue #16298
"check_sample_weight_equivalence_on_dense_data": (
"sample_weight is not equivalent to removing/repeating samples."
),
"check_sample_weight_equivalence_on_sparse_data": (
"sample_weight is not equivalent to removing/repeating samples."
),
},
BernoulliRBM: {
"check_methods_subset_invariance": ("fails for the decision_function method"),
"check_methods_sample_order_invariance": ("fails for the score_samples method"),
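Removing these entries means `BayesianRidge` is no longer exempt from the common sample-weight equivalence checks. A rough standalone illustration of the property those checks exercise is below; this is not the actual common test, the tolerances are chosen loosely, and the assertion is only expected to hold on versions that include this fix.

import numpy as np
from sklearn.linear_model import BayesianRidge

# Integer sample weights should behave like repeating the corresponding rows.
rng = np.random.default_rng(0)
X = rng.normal(size=(30, 4))
y = X @ np.array([1.0, -2.0, 0.5, 0.0]) + 0.1 * rng.normal(size=30)
w = rng.integers(1, 4, size=30)

est_weighted = BayesianRidge().fit(X, y, sample_weight=w)
est_repeated = BayesianRidge().fit(np.repeat(X, w, axis=0), np.repeat(y, w))

assert np.allclose(est_weighted.coef_, est_repeated.coef_, rtol=1e-4, atol=1e-6)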