From fcb338e1c3e89fd280b04766305c746cc3f70486 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 1 Aug 2021 17:00:31 -0400 Subject: [PATCH 1/2] ENH Improves memory usage for standard scaler --- doc/whats_new/v1.0.rst | 3 +++ sklearn/utils/extmath.py | 38 +++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 7f44c62eb7329..f8d31c6197415 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -669,6 +669,9 @@ Changelog `n_features_in_` and will be removed in 1.2. :pr:`20240` by :user:`Jérémie du Boisberranger `. +- |Efficiency| `preprocessing.StandardScaler` is faster and more memory + efficient. :pr:`_____` by `Thomas Fan`_. + :mod:`sklearn.tree` ................... diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index b6a5f3f8a914a..565ce20a4ba2b 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -955,24 +955,30 @@ def _incremental_mean_and_var( # new = the current increment # updated = the aggregated stats last_sum = last_mean * last_sample_count + X_nan_mask = np.isnan(X) + if np.any(X_nan_mask): + sum_op = np.nansum + else: + sum_op = np.sum if sample_weight is not None: if np_version >= parse_version("1.16.6"): # equivalent to np.nansum(X * sample_weight, axis=0) # safer because np.float64(X*W) != np.float64(X)*np.float64(W) # dtype arg of np.matmul only exists since version 1.16 new_sum = _safe_accumulator_op( - np.matmul, sample_weight, np.where(np.isnan(X), 0, X) + np.matmul, sample_weight, np.where(X_nan_mask, 0, X) ) else: new_sum = _safe_accumulator_op( np.nansum, X * sample_weight[:, None], axis=0 ) new_sample_count = _safe_accumulator_op( - np.sum, sample_weight[:, None] * (~np.isnan(X)), axis=0 + np.sum, sample_weight[:, None] * (~X_nan_mask), axis=0 ) else: - new_sum = _safe_accumulator_op(np.nansum, X, axis=0) - new_sample_count = np.sum(~np.isnan(X), axis=0) + new_sum = 
_safe_accumulator_op(sum_op, X, axis=0) + n_samples = X.shape[0] + new_sample_count = n_samples - np.sum(X_nan_mask, axis=0) updated_sample_count = last_sample_count + new_sample_count @@ -982,29 +988,31 @@ def _incremental_mean_and_var( updated_variance = None else: T = new_sum / new_sample_count + temp = X - T if sample_weight is not None: if np_version >= parse_version("1.16.6"): # equivalent to np.nansum((X-T)**2 * sample_weight, axis=0) # safer because np.float64(X*W) != np.float64(X)*np.float64(W) # dtype arg of np.matmul only exists since version 1.16 - new_unnormalized_variance = _safe_accumulator_op( - np.matmul, sample_weight, np.where(np.isnan(X), 0, (X - T) ** 2) - ) correction = _safe_accumulator_op( - np.matmul, sample_weight, np.where(np.isnan(X), 0, X - T) + np.matmul, sample_weight, np.where(X_nan_mask, 0, temp) ) - else: + temp **= 2 new_unnormalized_variance = _safe_accumulator_op( - np.nansum, (X - T) ** 2 * sample_weight[:, None], axis=0 + np.matmul, sample_weight, np.where(X_nan_mask, 0, temp) ) + else: correction = _safe_accumulator_op( - np.nansum, (X - T) * sample_weight[:, None], axis=0 + sum_op, temp * sample_weight[:, None], axis=0 + ) + temp *= temp + new_unnormalized_variance = _safe_accumulator_op( + sum_op, temp * sample_weight[:, None], axis=0 ) else: - new_unnormalized_variance = _safe_accumulator_op( - np.nansum, (X - T) ** 2, axis=0 - ) - correction = _safe_accumulator_op(np.nansum, X - T, axis=0) + correction = _safe_accumulator_op(sum_op, temp, axis=0) + temp **= 2 + new_unnormalized_variance = _safe_accumulator_op(sum_op, temp, axis=0) # correction term of the corrected 2 pass algorithm. # See "Algorithms for computing the sample variance: analysis From 9e75fb0abb61dcc7e6de696feeff6a869b2b5f38 Mon Sep 17 00:00:00 2001 From: "Thomas J. 
Fan" Date: Sun, 1 Aug 2021 17:21:03 -0400 Subject: [PATCH 2/2] DOC Adds PR number --- doc/whats_new/v1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index f8d31c6197415..bc255a619a938 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -670,7 +670,7 @@ Changelog :user:`Jérémie du Boisberranger `. - |Efficiency| `preprocessing.StandardScaler` is faster and more memory - efficient. :pr:`_____` by `Thomas Fan`_. + efficient. :pr:`20652` by `Thomas Fan`_. :mod:`sklearn.tree` ...................