ENH Improves memory usage for standard scalar by thomasjpfan · Pull Request #20652 · scikit-learn/scikit-learn
ENH Improves memory usage for standard scalar #20652

Merged
3 changes: 3 additions & 0 deletions doc/whats_new/v1.0.rst
@@ -669,6 +669,9 @@ Changelog
 `n_features_in_` and will be removed in 1.2. :pr:`20240` by
 :user:`Jérémie du Boisberranger <jeremiedbb>`.
 
+- |Efficiency| `preprocessing.StandardScaler` is faster and more memory
+  efficient. :pr:`20652` by `Thomas Fan`_.
+
 :mod:`sklearn.tree`
 ...................

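The changelog entry above states the user-facing effect without numbers. As a rough, illustrative check (not part of this PR), one could trace peak allocations during `fit` with the standard library's `tracemalloc`; the array shape below is arbitrary, and any figure printed depends on the machine and the NumPy and scikit-learn versions:

```python
# Illustrative only: measure peak traced memory while fitting StandardScaler.
import tracemalloc

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.standard_normal((100_000, 50))  # ~40 MB of float64 data

tracemalloc.start()
StandardScaler().fit(X)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"peak traced memory during fit: {peak / 1e6:.1f} MB")
```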
38 changes: 23 additions & 15 deletions sklearn/utils/extmath.py
@@ -955,24 +955,30 @@ def _incremental_mean_and_var(
     # new = the current increment
     # updated = the aggregated stats
     last_sum = last_mean * last_sample_count
+    X_nan_mask = np.isnan(X)
+    if np.any(X_nan_mask):
+        sum_op = np.nansum
+    else:
+        sum_op = np.sum
     if sample_weight is not None:
         if np_version >= parse_version("1.16.6"):
             # equivalent to np.nansum(X * sample_weight, axis=0)
             # safer because np.float64(X*W) != np.float64(X)*np.float64(W)
             # dtype arg of np.matmul only exists since version 1.16
             new_sum = _safe_accumulator_op(
-                np.matmul, sample_weight, np.where(np.isnan(X), 0, X)
+                np.matmul, sample_weight, np.where(X_nan_mask, 0, X)
             )
         else:
             new_sum = _safe_accumulator_op(
                 np.nansum, X * sample_weight[:, None], axis=0
             )
         new_sample_count = _safe_accumulator_op(
-            np.sum, sample_weight[:, None] * (~np.isnan(X)), axis=0
+            np.sum, sample_weight[:, None] * (~X_nan_mask), axis=0
         )
     else:
-        new_sum = _safe_accumulator_op(np.nansum, X, axis=0)
-        new_sample_count = np.sum(~np.isnan(X), axis=0)
+        new_sum = _safe_accumulator_op(sum_op, X, axis=0)
+        n_samples = X.shape[0]
+        new_sample_count = n_samples - np.sum(X_nan_mask, axis=0)
 
     updated_sample_count = last_sample_count + new_sample_count

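The hunk above computes `np.isnan(X)` once as `X_nan_mask`, reuses it for the per-column counts, and only reaches for `np.nansum` when NaNs are actually present, since `np.nansum` works on a zero-filled copy of the input while `np.sum` does not. Below is a standalone sketch of that pattern, assuming nothing beyond NumPy (`column_sum_and_count` is an illustrative helper, not scikit-learn API):

```python
import numpy as np

def column_sum_and_count(X):
    # Illustrative sketch: build the NaN mask once and reuse it.
    X_nan_mask = np.isnan(X)
    # np.nansum copies the array with NaNs replaced by 0; skip it when possible.
    sum_op = np.nansum if X_nan_mask.any() else np.sum
    col_sum = sum_op(X, axis=0)
    # Non-NaN count per column, derived from the same mask.
    col_count = X.shape[0] - X_nan_mask.sum(axis=0)
    return col_sum, col_count

X = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 6.0]])
print(column_sum_and_count(X))  # (array([ 9., 10.]), array([3, 2]))
```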
@@ -982,29 +988,31 @@
         updated_variance = None
     else:
         T = new_sum / new_sample_count
+        temp = X - T
         if sample_weight is not None:
             if np_version >= parse_version("1.16.6"):
                 # equivalent to np.nansum((X-T)**2 * sample_weight, axis=0)
                 # safer because np.float64(X*W) != np.float64(X)*np.float64(W)
                 # dtype arg of np.matmul only exists since version 1.16
-                new_unnormalized_variance = _safe_accumulator_op(
-                    np.matmul, sample_weight, np.where(np.isnan(X), 0, (X - T) ** 2)
-                )
                 correction = _safe_accumulator_op(
-                    np.matmul, sample_weight, np.where(np.isnan(X), 0, X - T)
+                    np.matmul, sample_weight, np.where(X_nan_mask, 0, temp)
                 )
-            else:
+                temp **= 2
                 new_unnormalized_variance = _safe_accumulator_op(
-                    np.nansum, (X - T) ** 2 * sample_weight[:, None], axis=0
+                    np.matmul, sample_weight, np.where(X_nan_mask, 0, temp)
                 )
+            else:
                 correction = _safe_accumulator_op(
-                    np.nansum, (X - T) * sample_weight[:, None], axis=0
+                    sum_op, temp * sample_weight[:, None], axis=0
                 )
+                temp *= temp
+                new_unnormalized_variance = _safe_accumulator_op(
+                    sum_op, temp * sample_weight[:, None], axis=0
+                )
         else:
-            new_unnormalized_variance = _safe_accumulator_op(
-                np.nansum, (X - T) ** 2, axis=0
-            )
-            correction = _safe_accumulator_op(np.nansum, X - T, axis=0)
+            correction = _safe_accumulator_op(sum_op, temp, axis=0)
+            temp **= 2
+            new_unnormalized_variance = _safe_accumulator_op(sum_op, temp, axis=0)
 
     # correction term of the corrected 2 pass algorithm.
     # See "Algorithms for computing the sample variance: analysis