ENH Improves memory usage for standard scalar by thomasjpfan · Pull Request #20652 · scikit-learn/scikit-learn
ENH Improves memory usage for standard scalar #20652

Merged
3 changes: 3 additions & 0 deletions doc/whats_new/v1.0.rst
@@ -669,6 +669,9 @@ Changelog
 `n_features_in_` and will be removed in 1.2. :pr:`20240` by
 :user:`Jérémie du Boisberranger <jeremiedbb>`.
 
+- |Efficiency| `preprocessing.StandardScaler` is faster and more memory
+  efficient. :pr:`20652` by `Thomas Fan`_.
+
 :mod:`sklearn.tree`
 ...................

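The changelog entry above states the user-facing effect without numbers. As a rough, illustrative check (not part of this PR), one could trace peak allocations during `fit` with the standard library's `tracemalloc`; the array shape below is arbitrary, and any figure printed depends on the machine and the NumPy and scikit-learn versions:

```python
# Illustrative only: measure peak traced memory while fitting StandardScaler.
import tracemalloc

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.standard_normal((100_000, 50))  # ~40 MB of float64 data

tracemalloc.start()
StandardScaler().fit(X)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"peak traced memory during fit: {peak / 1e6:.1f} MB")
```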
38 changes: 23 additions & 15 deletions sklearn/utils/extmath.py
@@ -955,24 +955,30 @@ def _incremental_mean_and_var(
     # new = the current increment
     # updated = the aggregated stats
     last_sum = last_mean * last_sample_count
+    X_nan_mask = np.isnan(X)
+    if np.any(X_nan_mask):
+        sum_op = np.nansum
+    else:
+        sum_op = np.sum
     if sample_weight is not None:
         if np_version >= parse_version("1.16.6"):
             # equivalent to np.nansum(X * sample_weight, axis=0)
             # safer because np.float64(X*W) != np.float64(X)*np.float64(W)
             # dtype arg of np.matmul only exists since version 1.16
             new_sum = _safe_accumulator_op(
-                np.matmul, sample_weight, np.where(np.isnan(X), 0, X)
+                np.matmul, sample_weight, np.where(X_nan_mask, 0, X)
             )
         else:
             new_sum = _safe_accumulator_op(
                 np.nansum, X * sample_weight[:, None], axis=0
             )
         new_sample_count = _safe_accumulator_op(
-            np.sum, sample_weight[:, None] * (~np.isnan(X)), axis=0
+            np.sum, sample_weight[:, None] * (~X_nan_mask), axis=0
         )
     else:
-        new_sum = _safe_accumulator_op(np.nansum, X, axis=0)
-        new_sample_count = np.sum(~np.isnan(X), axis=0)
+        new_sum = _safe_accumulator_op(sum_op, X, axis=0)
+        n_samples = X.shape[0]
+        new_sample_count = n_samples - np.sum(X_nan_mask, axis=0)
 
     updated_sample_count = last_sample_count + new_sample_count

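The hunk above computes `np.isnan(X)` once as `X_nan_mask`, reuses it for the per-column counts, and only reaches for `np.nansum` when NaNs are actually present, since `np.nansum` works on a zero-filled copy of the input while `np.sum` does not. Below is a standalone sketch of that pattern, assuming nothing beyond NumPy (`column_sum_and_count` is an illustrative helper, not scikit-learn API):

```python
import numpy as np

def column_sum_and_count(X):
    # Illustrative sketch: build the NaN mask once and reuse it.
    X_nan_mask = np.isnan(X)
    # np.nansum copies the array with NaNs replaced by 0; skip it when possible.
    sum_op = np.nansum if X_nan_mask.any() else np.sum
    col_sum = sum_op(X, axis=0)
    # Non-NaN count per column, derived from the same mask.
    col_count = X.shape[0] - X_nan_mask.sum(axis=0)
    return col_sum, col_count

X = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 6.0]])
print(column_sum_and_count(X))  # (array([ 9., 10.]), array([3, 2]))
```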
@@ -982,29 +988,31 @@
         updated_variance = None
     else:
         T = new_sum / new_sample_count
+        temp = X - T
         if sample_weight is not None:
             if np_version >= parse_version("1.16.6"):
                 # equivalent to np.nansum((X-T)**2 * sample_weight, axis=0)
                 # safer because np.float64(X*W) != np.float64(X)*np.float64(W)
                 # dtype arg of np.matmul only exists since version 1.16
-                new_unnormalized_variance = _safe_accumulator_op(
-                    np.matmul, sample_weight, np.where(np.isnan(X), 0, (X - T) ** 2)
-                )
                 correction = _safe_accumulator_op(
-                    np.matmul, sample_weight, np.where(np.isnan(X), 0, X - T)
+                    np.matmul, sample_weight, np.where(X_nan_mask, 0, temp)
                 )
-            else:
+                temp **= 2
                 new_unnormalized_variance = _safe_accumulator_op(
-                    np.nansum, (X - T) ** 2 * sample_weight[:, None], axis=0
+                    np.matmul, sample_weight, np.where(X_nan_mask, 0, temp)
                 )
+            else:
                 correction = _safe_accumulator_op(
-                    np.nansum, (X - T) * sample_weight[:, None], axis=0
+                    sum_op, temp * sample_weight[:, None], axis=0
                 )
+                temp *= temp
+                new_unnormalized_variance = _safe_accumulator_op(
+                    sum_op, temp * sample_weight[:, None], axis=0
+                )
         else:
-            new_unnormalized_variance = _safe_accumulator_op(
-                np.nansum, (X - T) ** 2, axis=0
-            )
-            correction = _safe_accumulator_op(np.nansum, X - T, axis=0)
+            correction = _safe_accumulator_op(sum_op, temp, axis=0)
+            temp **= 2
+            new_unnormalized_variance = _safe_accumulator_op(sum_op, temp, axis=0)
 
     # correction term of the corrected 2 pass algorithm.
     # See "Algorithms for computing the sample variance: analysis