diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 2fb76ebd2f33d..763334e7628dd 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -215,6 +215,10 @@ Support for Python 3.4 and below has been officially dropped.
   in the dense case. Also added a new parameter ``order`` which controls output
   order for further speed performances. :issue:`12251` by `Tom Dupre la Tour`_.
 
+- |Fix| Fixed a calculation overflow when using a float16 dtype with
+  :class:`preprocessing.StandardScaler`. :issue:`13007` by
+  :user:`Raffaello Baluyot`.
+
 :mod:`sklearn.tree`
 ...................
 
 - |Feature| Decision Trees can now be plotted with matplotlib using
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 1a5ad20d32ef4..b387379be2cee 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -450,6 +450,31 @@ def test_scaler_2d_arrays():
     assert X_scaled is not X
 
 
+def test_scaler_float16_overflow():
+    # Test that the scaler does not overflow on float16 numpy arrays
+    rng = np.random.RandomState(0)
+    # float16 has a maximum of 65504.0. In the worst case 5 * 200000 is
+    # 1,000,000, which is more than enough to overflow the data type
+    X = rng.uniform(5, 10, [200000, 1]).astype(np.float16)
+
+    with np.errstate(over='raise'):
+        scaler = StandardScaler().fit(X)
+        X_scaled = scaler.transform(X)
+
+    # Calculate the float64 equivalent to verify result
+    X_scaled_f64 = StandardScaler().fit_transform(X.astype(np.float64))
+
+    # Overflow calculations may cause -inf, inf, or nan. Since there is no
+    # nan input, all of the outputs should be finite. This may be redundant
+    # since a FloatingPointError exception will be thrown on overflow above.
+    assert np.all(np.isfinite(X_scaled))
+
+    # The standardized values are very unlikely to exceed 4 in magnitude.
+    # Even at 4.0-8.0 the float16 spacing is 2**-8, about 0.004, so only
+    # 2 decimals are checked to account for precision differences.
+    assert_array_almost_equal(X_scaled, X_scaled_f64, decimal=2)
+
+
 def test_handle_zeros_in_scale():
     s1 = np.array([0, 1, 2, 3])
     s2 = _handle_zeros_in_scale(s1, copy=True)
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index fef2c7aff7971..44c6a392d9c6c 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -658,6 +658,38 @@ def make_nonnegative(X, min_value=0):
     return X
 
 
+# Use at least float64 for the accumulating functions to avoid precision
+# issues; see https://github.com/numpy/numpy/issues/9393. The float64 result
+# is kept as-is rather than cast back, in case the smaller float overflows.
+def _safe_accumulator_op(op, x, *args, **kwargs):
+    """
+    This function provides numpy accumulator functions with a float64 dtype
+    when used on a floating point input. This prevents accumulator overflow on
+    smaller floating point dtypes.
+
+    Parameters
+    ----------
+    op : function
+        A numpy accumulator function such as np.mean or np.sum
+    x : numpy array
+        A numpy array to apply the accumulator function on
+    *args : positional arguments
+        Positional arguments passed to the accumulator function after the
+        input x
+    **kwargs : keyword arguments
+        Keyword arguments passed to the accumulator function
+
+    Returns
+    -------
+    result : The output of the accumulator function passed to this function
+    """
+    if np.issubdtype(x.dtype, np.floating) and x.dtype.itemsize < 8:
+        result = op(x, *args, **kwargs, dtype=np.float64)
+    else:
+        result = op(x, *args, **kwargs)
+    return result
+
+
 def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
     """Calculate mean update and a Youngs and Cramer variance update.
 
@@ -708,12 +740,7 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
     # new = the current increment
     # updated = the aggregated stats
     last_sum = last_mean * last_sample_count
-    if np.issubdtype(X.dtype, np.floating) and X.dtype.itemsize < 8:
-        # Use at least float64 for the accumulator to avoid precision issues;
-        # see https://github.com/numpy/numpy/issues/9393
-        new_sum = np.nansum(X, axis=0, dtype=np.float64).astype(X.dtype)
-    else:
-        new_sum = np.nansum(X, axis=0)
+    new_sum = _safe_accumulator_op(np.nansum, X, axis=0)
 
     new_sample_count = np.sum(~np.isnan(X), axis=0)
     updated_sample_count = last_sample_count + new_sample_count
@@ -723,7 +750,8 @@
     if last_variance is None:
         updated_variance = None
     else:
-        new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count
+        new_unnormalized_variance = (
+            _safe_accumulator_op(np.nanvar, X, axis=0) * new_sample_count)
         last_unnormalized_variance = last_variance * last_sample_count
 
         with np.errstate(divide='ignore', invalid='ignore'):
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index fc882e0719a8d..379aa738f7124 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -34,14 +34,18 @@
 
 def _assert_all_finite(X, allow_nan=False):
     """Like assert_all_finite, but only for ndarray."""
+    # validation is also imported in extmath; import locally to avoid a cycle
+    from .extmath import _safe_accumulator_op
+
     if _get_config()['assume_finite']:
         return
     X = np.asanyarray(X)
     # First try an O(n) time, O(1) space solution for the common case that
     # everything is finite; fall back to O(n) space np.isfinite to prevent
-    # false positives from overflow in sum method.
+    # false positives from overflow in sum method. The sum is also computed
+    # safely to reduce dtype-induced overflows.
     is_float = X.dtype.kind in 'fc'
-    if is_float and np.isfinite(X.sum()):
+    if is_float and np.isfinite(_safe_accumulator_op(np.sum, X)):
         pass
     elif is_float:
         msg_err = "Input contains {} or a value too large for {!r}."
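For reviewers: a minimal standalone sketch (not part of the patch) of the failure mode that `_safe_accumulator_op` guards against. The `safe_sum` wrapper below is a hypothetical stand-in that mirrors the helper's dtype dispatch; float16 tops out around 65504, so a float16 accumulator overflows long before it can sum 200000 values near 7.5.

```python
import numpy as np

# A float16 column whose true sum (1.5e6) far exceeds the float16 maximum.
x = np.full(200000, 7.5, dtype=np.float16)


def safe_sum(a):
    # Hypothetical stand-in for _safe_accumulator_op(np.sum, a): upcast the
    # accumulator to float64 for floating dtypes narrower than 8 bytes.
    if np.issubdtype(a.dtype, np.floating) and a.dtype.itemsize < 8:
        return np.sum(a, dtype=np.float64)
    return np.sum(a)


print(np.sum(x))    # inf -- the accumulator stays float16 and overflows
print(safe_sum(x))  # 1500000.0 -- float64 accumulation stays finite
```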