scikit-learn
diff --git a/‎sklearn/linear_model/_base.py‎
Lines changed: 36 additions & 19 deletions b/‎sklearn/linear_model/_base.py‎
Lines changed: 36 additions & 19 deletions
diff --git a/‎sklearn/linear_model/tests/test_base.py‎
Lines changed: 56 additions & 0 deletions b/‎sklearn/linear_model/tests/test_base.py‎
Lines changed: 56 additions & 0 deletions
@@ -187,6 +187,7 @@ def _preprocess_data(
     fit_intercept,
     normalize=False,
     copy=True,
+    copy_y=True,
     sample_weight=None,
     check_input=True,
 ):
@@ -230,13 +231,14 @@ def _preprocess_data(
 
     if check_input:
         X = check_array(X, copy=copy, accept_sparse=["csr", "csc"], dtype=FLOAT_DTYPES)
-    elif copy:
-        if sp.issparse(X):
-            X = X.copy()
-        else:
-            X = X.copy(order="K")
-
-    y = np.asarray(y, dtype=X.dtype)
+        y = check_array(y, dtype=X.dtype, copy=copy_y, ensure_2d=False)
+    else:
+        y = y.astype(X.dtype, copy=copy_y)
+        if copy:
+            if sp.issparse(X):
+                X = X.copy()
+            else:
+                X = X.copy(order="K")
 
     if fit_intercept:
         if sp.issparse(X):
@@ -276,7 +278,7 @@ def _preprocess_data(
             X_scale = np.ones(X.shape[1], dtype=X.dtype)
 
         y_offset = np.average(y, axis=0, weights=sample_weight)
-        y = y - y_offset
+        y -= y_offset
     else:
         X_offset = np.zeros(X.shape[1], dtype=X.dtype)
         X_scale = np.ones(X.shape[1], dtype=X.dtype)
@@ -293,7 +295,7 @@ def _preprocess_data(
 # sample_weight makes the refactoring tricky.
 
 
-def _rescale_data(X, y, sample_weight):
+def _rescale_data(X, y, sample_weight, inplace=False):
     """Rescale data sample-wise by square root of sample_weight.
 
     For many linear models, this enables easy support for sample_weight because
@@ -328,18 +330,24 @@ def _rescale_data(X, y, sample_weight):
     if sp.issparse(X):
         X = safe_sparse_dot(sw_matrix, X)
     else:
-        # XXX: we do not do inplace multiplication on X for consistency
-        # with the sparse case and because the _rescale_data currently
-        # does not make it explicit if it's ok to do it or not.
-        X = X * sample_weight_sqrt[:, np.newaxis]
+        if inplace:
+            X *= sample_weight_sqrt[:, np.newaxis]
+        else:
+            X = X * sample_weight_sqrt[:, np.newaxis]
 
     if sp.issparse(y):
         y = safe_sparse_dot(sw_matrix, y)
     else:
-        if y.ndim == 1:
-            y = y * sample_weight_sqrt
+        if inplace:
+            if y.ndim == 1:
+                y *= sample_weight_sqrt
+            else:
+                y *= sample_weight_sqrt[:, np.newaxis]
         else:
-            y = y * sample_weight_sqrt[:, np.newaxis]
+            if y.ndim == 1:
+                y = y * sample_weight_sqrt
+            else:
+                y = y * sample_weight_sqrt[:, np.newaxis]
     return X, y, sample_weight_sqrt
 
 
@@ -674,17 +682,26 @@ def fit(self, X, y, sample_weight=None):
                 sample_weight, X, dtype=X.dtype, only_non_negative=True
             )
 
+        # Note that neither _rescale_data nor the rest of the fit method of
+        # LinearRegression can benefit from in-place operations when X is a
+        # sparse matrix. Therefore, let's not copy X when it is sparse.
+        copy_X_in_preprocess_data = self.copy_X and not sp.issparse(X)
+
         X, y, X_offset, y_offset, X_scale = _preprocess_data(
             X,
             y,
             fit_intercept=self.fit_intercept,
-            copy=self.copy_X,
+            copy=copy_X_in_preprocess_data,
             sample_weight=sample_weight,
         )
 
-        # Sample weight can be implemented via a simple rescaling.
         if has_sw:
-            X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight)
+            # Sample weight can be implemented via a simple rescaling. Note
+            # that we safely do inplace rescaling when _preprocess_data has
+            # already made a copy if requested.
+            X, y, sample_weight_sqrt = _rescale_data(
+                X, y, sample_weight, inplace=copy_X_in_preprocess_data
+            )
 
         if self.positive:
             if y.ndim < 2:
 
@@ -315,6 +315,62 @@ def test_linear_regression_positive_vs_nonpositive_when_positive(global_random_s
     assert np.mean((reg.coef_ - regn.coef_) ** 2) < 1e-6
 
 
+@pytest.mark.parametrize("sparse_X", [True, False])
+@pytest.mark.parametrize("use_sw", [True, False])
+def test_inplace_data_preprocessing(sparse_X, use_sw, global_random_seed):
+    # Check that the data is not modified inplace by the linear regression
+    # estimator.
+    rng = np.random.RandomState(global_random_seed)
+    original_X_data = rng.randn(10, 12)
+    original_y_data = rng.randn(10, 2)
+    orginal_sw_data = rng.rand(10)
+
+    if sparse_X:
+        X = sparse.csr_matrix(original_X_data)
+    else:
+        X = original_X_data.copy()
+    y = original_y_data.copy()
+    # XXX: Note hat y_sparse is not supported (broken?) in the current
+    # implementation of LinearRegression.
+
+    if use_sw:
+        sample_weight = orginal_sw_data.copy()
+    else:
+        sample_weight = None
+
+    # Do not allow inplace preprocessing of X and y:
+    reg = LinearRegression()
+    reg.fit(X, y, sample_weight=sample_weight)
+    if sparse_X:
+        assert_allclose(X.toarray(), original_X_data)
+    else:
+        assert_allclose(X, original_X_data)
+    assert_allclose(y, original_y_data)
+
+    if use_sw:
+        assert_allclose(sample_weight, orginal_sw_data)
+
+    # Allow inplace preprocessing of X and y
     reg = LinearRegression(copy_X=False)
+    reg.fit(X, y, sample_weight=sample_weight)
+    if sparse_X:
+        # No optimization relying on the inplace modification of sparse input
+        # data has been implemented at this time.
+        assert_allclose(X.toarray(), original_X_data)
+    else:
+        # X has been offset (and optionally rescaled by sample weights)
+        # inplace. The 0.42 threshold is arbitrary and has been found to be
+        # robust to any random seed in the admissible range.
+        assert np.linalg.norm(X - original_X_data) > 0.42
+
+    # y should not have been modified inplace by LinearRegression.fit.
+    assert_allclose(y, original_y_data)
+
+    if use_sw:
+        # Sample weights have no reason to ever be modified inplace.
+        assert_allclose(sample_weight, orginal_sw_data)
+
+
 def test_linear_regression_pd_sparse_dataframe_warning():
     pd = pytest.importorskip("pandas")