 from sklearn.utils._testing import assert_warns_message
 from sklearn.utils._testing import skip_if_32bit
 
-from sklearn.utils.extmath import density
+from sklearn.utils.extmath import density, _safe_accumulator_op
 from sklearn.utils.extmath import randomized_svd
 from sklearn.utils.extmath import row_norms
 from sklearn.utils.extmath import weighted_mode
@@ -470,32 +470,70 @@ def test_incremental_weighted_mean_and_variance_simple():
 
 
 def test_incremental_weighted_mean_and_variance():
-    rng = np.random.RandomState(42)
-    mult = 10
-    X = rng.rand(1000, 20)*mult
-    sample_weight = rng.rand(X.shape[0]) * mult
 
-    n = X.shape[0]
-    last_mean, last_weight_sum, last_var = 0, 0, 0
-    mean_exp = np.average(X, weights=sample_weight, axis=0)
-    var_exp = np.average(X ** 2, weights=sample_weight, axis=0) - mean_exp ** 2
-    for chunk_size in [1, 2, 50, n, n + 42]:
-        for batch in gen_batches(n, chunk_size):
-            last_mean, last_var, last_weight_sum = \
-                _incremental_weighted_mean_and_var(X[batch],
-                                                   sample_weight[batch],
-                                                   last_mean,
-                                                   last_var,
-                                                   last_weight_sum)
-        assert_almost_equal(last_mean, mean_exp)
-        assert_almost_equal(last_var, var_exp)
+    # Test correctness and numerical stability
+    def test(X, sample_weight, mean_exp=None, var_exp=None):
+        n = X.shape[0]
+        if mean_exp is None:
+            mean_exp = \
+                _safe_accumulator_op(
+                    np.average, X, weights=sample_weight, axis=0)
+        if var_exp is None:
+            var_exp = \
+                _safe_accumulator_op(
+                    np.average, (X-mean_exp)**2, weights=sample_weight, axis=0)
+        for chunk_size in [1, n//10 + 1, n//4 + 1, n//2 + 1, n]:
+            last_mean, last_weight_sum, last_var = 0, 0, 0
+            for batch in gen_batches(n, chunk_size):
+                last_mean, last_var, last_weight_sum = \
+                    _incremental_weighted_mean_and_var(X[batch],
+                                                       sample_weight[batch],
+                                                       last_mean,
+                                                       last_var,
+                                                       last_weight_sum)
+            assert_almost_equal(last_mean, mean_exp)
+            assert_almost_equal(last_var, var_exp, 6)
+
+    HIGH_MEAN = 10e6
+    LOW_VAR = 10e-7
+    NORMAL_MEAN = 0.0
+    NORMAL_VAR = 1.0
+    SIZE = (100, 20)
+
+    rng = np.random.RandomState(42)
+    NORMAL_WEIGHT = \
+        rng.normal(loc=NORMAL_MEAN, scale=NORMAL_VAR, size=(SIZE[0],))
+    ALMOST_ZERO_WEIGHT = \
+        rng.normal(loc=NORMAL_MEAN, scale=LOW_VAR, size=(SIZE[0],))
+    ALMOST_ONES_WEIGHT = rng.normal(loc=1.0, scale=LOW_VAR, size=(SIZE[0],))
+    JUST_WEIGHT = rng.normal(loc=10.0, scale=NORMAL_VAR, size=(SIZE[0],))
+    ONES_WEIGHT = np.ones(SIZE[0])
+
+    means = [NORMAL_MEAN, HIGH_MEAN]
+    vars = [NORMAL_VAR, LOW_VAR]
+    weights = \
+        [NORMAL_WEIGHT, ALMOST_ONES_WEIGHT, JUST_WEIGHT, ALMOST_ZERO_WEIGHT]
+    # Use a list, not a generator: both comparison loops iterate over it
+    means_vars = [(m, v) for m in means for v in vars]
+
+    # Compare with weighted np.average
+    for mean, var in means_vars:
+        X = rng.normal(loc=mean, scale=var, size=SIZE)
+        for weight in weights:
+            test(X, weight)
+
+    # Compare with unweighted np.average
+    for mean, var in means_vars:
+        X = rng.normal(loc=mean, scale=var, size=SIZE)
+        mean_exp = _safe_accumulator_op(np.mean, X, axis=0)
+        var_exp = _safe_accumulator_op(np.var, X, axis=0)
+        test(X, ONES_WEIGHT, mean_exp, var_exp)
 
 
 def test_incremental_weighted_mean_and_variance_ignore_nan():
     old_means = np.array([535., 535., 535., 535.])
     old_variances = np.array([4225., 4225., 4225., 4225.])
     old_weight_sum = np.array([2, 2, 2, 2], dtype=np.int32)
-
     sample_weights_X = np.ones(3)
     sample_weights_X_nan = np.ones(4)
 
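
As a point of reference, the chunk update exercised above can be sketched as follows. This is a minimal illustration assuming the standard pooled weighted-moments recurrence; the name weighted_mean_var_update is illustrative only and not the sklearn implementation, and the explicit float64 cast merely mimics the upcasting that _safe_accumulator_op provides for the reference values in the test.

import numpy as np


def weighted_mean_var_update(X, sample_weight, last_mean, last_var,
                             last_weight_sum):
    """Fold one chunk into running weighted mean/variance (pooled moments)."""
    # Accumulate in float64, in the spirit of _safe_accumulator_op.
    X = np.asarray(X, dtype=np.float64)
    sample_weight = np.asarray(sample_weight, dtype=np.float64)

    new_weight_sum = sample_weight.sum()
    new_mean = np.average(X, weights=sample_weight, axis=0)
    new_var = np.average((X - new_mean) ** 2, weights=sample_weight, axis=0)

    total_weight_sum = last_weight_sum + new_weight_sum
    total_mean = (last_weight_sum * last_mean
                  + new_weight_sum * new_mean) / total_weight_sum
    # Each block contributes its own variance plus the squared offset of its
    # mean from the combined mean, weighted by its share of the total weight.
    total_var = (last_weight_sum * (last_var + (last_mean - total_mean) ** 2)
                 + new_weight_sum * (new_var + (new_mean - total_mean) ** 2)
                 ) / total_weight_sum
    return total_mean, total_var, total_weight_sum


rng = np.random.RandomState(0)
X = rng.normal(loc=1e6, scale=1e-2, size=(100, 3))  # high mean, low spread
w = rng.uniform(0.5, 2.0, size=100)

mean, var, weight_sum = 0.0, 0.0, 0.0
for start in range(0, X.shape[0], 7):               # feed the data in chunks
    batch = slice(start, start + 7)
    mean, var, weight_sum = weighted_mean_var_update(
        X[batch], w[batch], mean, var, weight_sum)

np.testing.assert_allclose(mean, np.average(X, weights=w, axis=0))
np.testing.assert_allclose(
    var, np.average((X - mean) ** 2, weights=w, axis=0), rtol=1e-5)

Feeding the chunks through such an update reproduces the weighted np.average mean and variance up to rounding, which is what the asserts in the new test check across several chunk sizes, weight profiles, and mean/variance scales.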