thomasjpfan
diff --git a/‎doc/whats_new/v0.24.rst
Lines changed: 5 additions & 0 deletions b/‎doc/whats_new/v0.24.rst
Lines changed: 5 additions & 0 deletions
diff --git a/‎sklearn/utils/sparsefuncs.py
Lines changed: 28 additions & 18 deletions b/‎sklearn/utils/sparsefuncs.py
Lines changed: 28 additions & 18 deletions
diff --git a/‎sklearn/utils/sparsefuncs_fast.pyx
Lines changed: 102 additions & 70 deletions b/‎sklearn/utils/sparsefuncs_fast.pyx
Lines changed: 102 additions & 70 deletions
@@ -703,6 +703,11 @@ Changelog
   By :user:`Alex Gramfort <agramfort>`.
 
 
+- |Enhancement| Add support for weights in
+  :func:`utils.sparsefuncs.incr_mean_variance_axis`.
+  By :user:`Maria Telenczuk <maikia>` and :user:`Alex Gramfort <agramfort>`.
+
+
 Miscellaneous
 .............
 
 
@@ -11,6 +11,7 @@
     csr_mean_variance_axis0 as _csr_mean_var_axis0,
     csc_mean_variance_axis0 as _csc_mean_var_axis0,
     incr_mean_variance_axis0 as _incr_mean_var_axis0)
+from ..utils.validation import _check_sample_weight
 
 
 def _raise_typeerror(X):
@@ -100,7 +101,8 @@ def mean_variance_axis(X, axis):
 
 
 @_deprecate_positional_args
-def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n):
+def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n,
+                            weights=None):
     """Compute incremental mean and variance along an axix on a CSR or
     CSC matrix.
 
@@ -125,9 +127,17 @@ def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n):
         Array of variances to update with the new data X.
         Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.
 
-    last_n : ndarray of shape (n_features,) or (n_samples,), dtype=integral
+    last_n : float or ndarray of shape (n_features,) or (n_samples,), \
+            dtype=floating
         Sum of the weights seen so far, excluding the current weights
-        Should be of shape (n_samples,) if axis=0 or (n_features,) if axis=1.
+        If not float, it should be of shape (n_features,) if
+        axis=0 or (n_samples,) if axis=1. If float it corresponds to
+        having same weights for all samples (or features).
+
+    weights : ndarray of shape (n_samples,) or (n_features,), default=None
+        If axis is set to 0 shape is (n_samples,) or
+        if axis is set to 1 shape is (n_features,).
+        If it is set to None, then samples are equally weighted.
 
     Returns
     -------
@@ -143,16 +153,22 @@ def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n):
         Updated number of seen samples per feature if axis=0
         or number of seen features per sample if axis=1.
 
+        If weights is not None, n is a sum of the weights of the seen
+        samples or features instead of the actual number of seen
+        samples or features.
+
     Notes
     -----
     NaNs are ignored in the algorithm.
-
     """
     _raise_error_wrong_axis(axis)
 
     if not isinstance(X, (sp.csr_matrix, sp.csc_matrix)):
         _raise_typeerror(X)
 
+    if np.size(last_n) == 1:
+        last_n = np.full(last_mean.shape, last_n, dtype=last_mean.dtype)
+
     if not (np.size(last_mean) == np.size(last_var) == np.size(last_n)):
         raise ValueError(
             "last_mean, last_var, last_n do not have the same shapes."
@@ -171,20 +187,14 @@ def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n):
                 f"size n_features {X.shape[1]} (Got {np.size(last_mean)})."
             )
 
-    if isinstance(X, sp.csr_matrix):
-        if axis == 0:
-            return _incr_mean_var_axis0(X, last_mean=last_mean,
-                                        last_var=last_var, last_n=last_n)
-        else:
-            return _incr_mean_var_axis0(X.T, last_mean=last_mean,
-                                        last_var=last_var, last_n=last_n)
-    elif isinstance(X, sp.csc_matrix):
-        if axis == 0:
-            return _incr_mean_var_axis0(X, last_mean=last_mean,
-                                        last_var=last_var, last_n=last_n)
-        else:
-            return _incr_mean_var_axis0(X.T, last_mean=last_mean,
-                                        last_var=last_var, last_n=last_n)
+    X = X.T if axis == 1 else X
+
+    if weights is not None:
+        weights = _check_sample_weight(weights, X, dtype=X.dtype)
+
+    return _incr_mean_var_axis0(X, last_mean=last_mean,
+                                last_var=last_var, last_n=last_n,
+                                weights=weights)
 
 
 def inplace_column_scale(X, scale):
 
@@ -12,7 +12,6 @@
 from libc.math cimport fabs, sqrt, pow
 cimport numpy as np
 import numpy as np
-import scipy.sparse as sp
 cimport cython
 from cython cimport floating
 from numpy.math cimport isnan
@@ -74,21 +73,27 @@ def csr_mean_variance_axis0(X):
     """
     if X.dtype not in [np.float32, np.float64]:
         X = X.astype(np.float64)
-    means, variances, _ =  _csr_mean_variance_axis0(X.data, X.shape[0],
-                                                    X.shape[1], X.indices)
+
+    weights = np.ones(X.shape[0], dtype=X.dtype)
+
+    means, variances, _ = _csr_mean_variance_axis0(
+        X.data, X.shape[0], X.shape[1], X.indices, X.indptr, weights)
+
     return means, variances
 
 
 def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
                              unsigned long long n_samples,
                              unsigned long long n_features,
-                             np.ndarray[integral, ndim=1] X_indices):
+                             np.ndarray[integral, ndim=1] X_indices,
+                             np.ndarray[integral, ndim=1] X_indptr,
+                             np.ndarray[floating, ndim=1] weights):
     # Implement the function here since variables using fused types
     # cannot be declared directly and can only be passed as function arguments
     cdef:
         np.npy_intp i
-        unsigned long long non_zero = X_indices.shape[0]
-        np.npy_intp col_ind
+        unsigned long long row_ind
+        integral col_ind
         floating diff
         # means[j] contains the mean of feature j
         np.ndarray[floating, ndim=1] means
@@ -104,29 +109,29 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
     variances = np.zeros_like(means, dtype=dtype)
 
     cdef:
-        # counts[j] contains the number of samples where feature j is non-zero
-        np.ndarray[np.int64_t, ndim=1] counts = np.zeros(n_features,
-                                                         dtype=np.int64)
-        # counts_nan[j] contains the number of NaNs for feature j
-        np.ndarray[np.int64_t, ndim=1] counts_nan = np.zeros(n_features,
-                                                             dtype=np.int64)
-
-    for i in range(non_zero):
-        col_ind = X_indices[i]
-        if not isnan(X_data[i]):
-            means[col_ind] += X_data[i]
-        else:
-            counts_nan[col_ind] += 1
+        np.ndarray[floating, ndim=1] counts = np.zeros(
+            n_features, dtype=dtype)
+        np.ndarray[floating, ndim=1] counts_nan = np.zeros(
+            n_features, dtype=dtype)
+
+    for row_ind in range(len(X_indptr) - 1):
+        for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
+            col_ind = X_indices[i]
+            if not isnan(X_data[i]):
+                means[col_ind] += (X_data[i] * weights[row_ind])
+            else:
+                counts_nan[col_ind] += weights[row_ind]
 
     for i in range(n_features):
         means[i] /= (n_samples - counts_nan[i])
 
-    for i in range(non_zero):
-        col_ind = X_indices[i]
-        if not isnan(X_data[i]):
-            diff = X_data[i] - means[col_ind]
-            variances[col_ind] += diff * diff
-            counts[col_ind] += 1
+    for row_ind in range(len(X_indptr) - 1):
+        for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
+            col_ind = X_indices[i]
+            if not isnan(X_data[i]):
+                diff = X_data[i] - means[col_ind]
+                variances[col_ind] += diff * diff * weights[row_ind]
+                counts[col_ind] += weights[row_ind]
 
     for i in range(n_features):
         variances[i] += (n_samples - counts_nan[i] - counts[i]) * means[i]**2
@@ -154,23 +159,25 @@ def csc_mean_variance_axis0(X):
     """
     if X.dtype not in [np.float32, np.float64]:
         X = X.astype(np.float64)
-    means, variances, _ = _csc_mean_variance_axis0(X.data, X.shape[0],
-                                                   X.shape[1], X.indices,
-                                                  X.indptr)
+
+    weights = np.ones(X.shape[0], dtype=X.dtype)
+    means, variances, _ = _csc_mean_variance_axis0(
+        X.data, X.shape[0], X.shape[1], X.indices, X.indptr, weights)
     return means, variances
 
 
-def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
+def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
                              unsigned long long n_samples,
                              unsigned long long n_features,
                              np.ndarray[integral, ndim=1] X_indices,
-                             np.ndarray[integral, ndim=1] X_indptr):
+                             np.ndarray[integral, ndim=1] X_indptr,
+                             np.ndarray[floating, ndim=1] weights):
     # Implement the function here since variables using fused types
     # cannot be declared directly and can only be passed as function arguments
     cdef:
-        np.npy_intp i, j
-        unsigned long long counts
-        unsigned long long startptr
+        np.npy_intp i
+        unsigned long long col_ind
+        integral row_ind
         floating diff
         # means[j] contains the mean of feature j
         np.ndarray[floating, ndim=1] means
@@ -185,35 +192,39 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
     means = np.zeros(n_features, dtype=dtype)
     variances = np.zeros_like(means, dtype=dtype)
 
-    cdef np.ndarray[np.int64_t, ndim=1] counts_nan = np.zeros(n_features,
-                                                              dtype=np.int64)
+    cdef:
+        np.ndarray[floating, ndim=1] counts = \
+            np.zeros(n_features, dtype=dtype)
+        np.ndarray[floating, ndim=1] counts_nan = \
+            np.zeros(n_features, dtype=dtype)
+
+    for col_ind in range(n_features):
+        for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
+            row_ind = X_indices[i]
+            if not isnan(X_data[i]):
+                means[col_ind] += (X_data[i] * weights[row_ind])
+            else:
+                counts_nan[col_ind] += weights[row_ind]
 
     for i in range(n_features):
-
-        startptr = X_indptr[i]
-        endptr = X_indptr[i + 1]
-        counts = endptr - startptr
-
-        for j in range(startptr, endptr):
-            if not isnan(X_data[j]):
-                means[i] += X_data[j]
-            else:
-                counts_nan[i] += 1
-        counts -= counts_nan[i]
         means[i] /= (n_samples - counts_nan[i])
 
-        for j in range(startptr, endptr):
-            if not isnan(X_data[j]):
-                diff = X_data[j] - means[i]
-                variances[i] += diff * diff
+    for col_ind in range(n_features):
+        for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
+            row_ind = X_indices[i]
+            if not isnan(X_data[i]):
+                diff = X_data[i] - means[col_ind]
+                variances[col_ind] += diff * diff * weights[row_ind]
+                counts[col_ind] += weights[row_ind]
 
-        variances[i] += (n_samples - counts_nan[i] - counts) * means[i]**2
+    for i in range(n_features):
+        variances[i] += (n_samples - counts_nan[i] - counts[i]) * means[i]**2
         variances[i] /= (n_samples - counts_nan[i])
 
     return means, variances, counts_nan
 
 
-def incr_mean_variance_axis0(X, last_mean, last_var, last_n):
+def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None):
     """Compute mean and variance along axis 0 on a CSR or CSC matrix.
 
     last_mean, last_var are the statistics computed at the last step by this
@@ -231,8 +242,12 @@ def incr_mean_variance_axis0(X, last_mean, last_var, last_n):
     last_var : float array with shape (n_features,)
       Array of feature-wise var to update with the new data X.
 
-    last_n : int array with shape (n_features,)
-      Number of samples seen so far, before X.
+    last_n : float array with shape (n_features,)
+      Sum of the weights seen so far (if weights are all set to 1
+      this will be the same as number of samples seen so far, before X).
+
+    weights : float array with shape (n_samples,) or None
+      If it is set to None, samples will be equally weighted.
 
     Returns
     -------
@@ -261,20 +276,38 @@ def incr_mean_variance_axis0(X, last_mean, last_var, last_n):
     """
     if X.dtype not in [np.float32, np.float64]:
         X = X.astype(np.float64)
-    return _incr_mean_variance_axis0(X.data, X.shape[0], X.shape[1], X.indices,
-                                     X.indptr, X.format, last_mean, last_var,
-                                     last_n)
+    X_dtype = X.dtype
+    if weights is None:
+        weights = np.ones(X.shape[0], dtype=X_dtype)
+    elif weights.dtype not in [np.float32, np.float64]:
+        weights = weights.astype(np.float64, copy=False)
+    if last_n.dtype not in [np.float32, np.float64]:
+        last_n = last_n.astype(np.float64, copy=False)
+
+    return _incr_mean_variance_axis0(X.data,
+                                     np.sum(weights),
+                                     X.shape[1],
+                                     X.indices,
+                                     X.indptr,
+                                     X.format,
+                                     last_mean.astype(X_dtype, copy=False),
+                                     last_var.astype(X_dtype, copy=False),
+                                     last_n.astype(X_dtype, copy=False),
+                                     weights.astype(X_dtype, copy=False))
 
 
 def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
-                              unsigned long long n_samples,
+                              floating n_samples,
                               unsigned long long n_features,
-                              np.ndarray[integral, ndim=1] X_indices,
+                              np.ndarray[int, ndim=1] X_indices,
+                              # X_indptr might be either int32 or int64
                               np.ndarray[integral, ndim=1] X_indptr,
                               str X_format,
                               np.ndarray[floating, ndim=1] last_mean,
                               np.ndarray[floating, ndim=1] last_var,
-                              np.ndarray[np.int64_t, ndim=1] last_n):
+                              np.ndarray[floating, ndim=1] last_n,
+                              # last_n: previous sum of the weights (i.e. float)
+                              np.ndarray[floating, ndim=1] weights):
     # Implement the function here since variables using fused types
     # cannot be declared directly and can only be passed as function arguments
     cdef:
@@ -301,24 +334,23 @@ def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
     updated_var = np.zeros_like(new_mean, dtype=dtype)
 
     cdef:
-        np.ndarray[np.int64_t, ndim=1] new_n
-        np.ndarray[np.int64_t, ndim=1] updated_n
+        np.ndarray[floating, ndim=1] new_n
+        np.ndarray[floating, ndim=1] updated_n
         np.ndarray[floating, ndim=1] last_over_new_n
-        np.ndarray[np.int64_t, ndim=1] counts_nan
+        np.ndarray[floating, ndim=1] counts_nan
 
     # Obtain new stats first
-    new_n = np.full(n_features, n_samples, dtype=np.int64)
-    updated_n = np.zeros_like(new_n, dtype=np.int64)
+    new_n = np.full(n_features, n_samples, dtype=dtype)
+    updated_n = np.zeros_like(new_n, dtype=dtype)
     last_over_new_n = np.zeros_like(new_n, dtype=dtype)
 
+    # X can be a CSR or CSC matrix
     if X_format == 'csr':
-        # X is a CSR matrix
         new_mean, new_var, counts_nan = _csr_mean_variance_axis0(
-            X_data, n_samples, n_features, X_indices)
-    else:
-        # X is a CSC matrix
+            X_data, n_samples, n_features, X_indices, X_indptr, weights)
+    else:  # X_format == 'csc'
         new_mean, new_var, counts_nan = _csc_mean_variance_axis0(
-            X_data, n_samples, n_features, X_indices, X_indptr)
+            X_data, n_samples, n_features, X_indices, X_indptr, weights)
 
     for i in range(n_features):
         new_n[i] -= counts_nan[i]