scikit-learn
diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx
Lines changed: 76 additions & 33 deletions
@@ -67,24 +67,34 @@ def csr_mean_variance_axis0(X):
         Feature-wise variances
 
     """
-    cdef unsigned int n_samples = X.shape[0]
-    cdef unsigned int n_features = X.shape[1]
+    if X.dtype == np.int32 or X.dtype == np.int64:
+        X = X.astype(np.float64)
+    return _csr_mean_variance_axis0(X.data, X.shape, X.indices)
 
-    cdef np.ndarray[DOUBLE, ndim=1, mode="c"] X_data
-    X_data = np.asarray(X.data, dtype=np.float64)     # might copy!
-    cdef np.ndarray[int, ndim=1] X_indices = X.indices
+
+def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
+                             shape,
+                             np.ndarray[int, ndim=1] X_indices):
+    cdef unsigned int n_samples = shape[0]
+    cdef unsigned int n_features = shape[1]
 
     cdef unsigned int i
     cdef unsigned int non_zero = X_indices.shape[0]
     cdef unsigned int col_ind
-    cdef double diff
+    cdef floating diff
 
     # means[j] contains the mean of feature j
-    cdef np.ndarray[DOUBLE, ndim=1] means = np.zeros(n_features,
-                                                     dtype=np.float64)
-
+    cdef np.ndarray[floating, ndim=1] means
     # variances[j] contains the variance of feature j
-    cdef np.ndarray[DOUBLE, ndim=1] variances = np.zeros_like(means)
+    cdef np.ndarray[floating, ndim=1] variances
+
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    means = np.zeros(n_features, dtype=dtype)
+    variances = np.zeros_like(means, dtype=dtype)
 
     # counts[j] contains the number of samples where feature j is non-zero
     cdef np.ndarray[int, ndim=1] counts = np.zeros(n_features,
@@ -130,27 +140,36 @@ def csc_mean_variance_axis0(X):
         Feature-wise variances
 
     """
-    cdef unsigned int n_samples = X.shape[0]
-    cdef unsigned int n_features = X.shape[1]
+    if X.dtype == np.int32 or X.dtype == np.int64:
+        X = X.astype(np.float64)
+    return _csc_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr)
 
-    cdef np.ndarray[DOUBLE, ndim=1] X_data
-    X_data = np.asarray(X.data, dtype=np.float64)     # might copy!
-    cdef np.ndarray[int, ndim=1] X_indices = X.indices
-    cdef np.ndarray[int, ndim=1] X_indptr = X.indptr
+
+def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
+                             shape,
+                             np.ndarray[int, ndim=1] X_indices,
+                             np.ndarray[int, ndim=1] X_indptr):
+    cdef unsigned int n_samples = shape[0]
+    cdef unsigned int n_features =shape[1]
 
     cdef unsigned int i
     cdef unsigned int j
     cdef unsigned int counts
     cdef unsigned int startptr
     cdef unsigned int endptr
-    cdef double diff
+    cdef floating diff
 
     # means[j] contains the mean of feature j
-    cdef np.ndarray[DOUBLE, ndim=1] means = np.zeros(n_features,
-                                                     dtype=np.float64)
-
+    cdef np.ndarray[floating, ndim=1] means
     # variances[j] contains the variance of feature j
-    cdef np.ndarray[DOUBLE, ndim=1] variances = np.zeros_like(means)
+    cdef np.ndarray[floating, ndim=1] variances
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    means = np.zeros(n_features, dtype=dtype)
+    variances = np.zeros_like(means, dtype=dtype)
 
     for i in xrange(n_features):
 
@@ -219,29 +238,53 @@ def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n):
     `utils.extmath._batch_mean_variance_update`.
 
     """
-    cdef unsigned long n_samples = X.shape[0]
-    cdef unsigned int n_features = X.shape[1]
+    if X.dtype == np.int32 or X.dtype == np.int64:
+        X = X.astype(np.float64)
+    return _incr_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr,
+                                     last_mean, last_var, last_n)
+
+
+def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
+                              shape,
+                              np.ndarray[int, ndim=1] X_indices,
+                              np.ndarray[int, ndim=1] X_indptr,
+                              last_mean, last_var, unsigned long last_n):
+    cdef unsigned long n_samples = shape[0]
+    cdef unsigned int n_features = shape[1]
     cdef unsigned int i
 
     # last = stats until now
     # new = the current increment
     # updated = the aggregated stats
     # when arrays, they are indexed by i per-feature
-    cdef np.ndarray[DOUBLE, ndim=1] new_mean = np.zeros(n_features,
-                                                      dtype=np.float64)
-    cdef np.ndarray[DOUBLE, ndim=1] new_var = np.zeros_like(new_mean)
+    cdef np.ndarray[floating, ndim=1] new_mean
+    cdef np.ndarray[floating, ndim=1] new_var
+    cdef np.ndarray[floating, ndim=1] updated_var
+    if floating is float:
+        dtype = np.float32
+    else:
+        dtype = np.float64
+
+    new_mean = np.zeros(n_features, dtype=dtype)
+    new_var = np.zeros_like(new_mean, dtype=dtype)
+    updated_mean = np.zeros_like(new_mean, dtype=dtype)
+    updated_var = np.zeros_like(new_mean, dtype=dtype)
+
     cdef unsigned long new_n
-    cdef np.ndarray[DOUBLE, ndim=1] updated_mean = np.zeros_like(new_mean)
-    cdef np.ndarray[DOUBLE, ndim=1] updated_var = np.zeros_like(new_mean)
     cdef unsigned long updated_n
-    cdef DOUBLE last_over_new_n
+    cdef floating last_over_new_n
 
     # Obtain new stats first
     new_n = n_samples
-    if isinstance(X, sp.csr_matrix):
-        new_mean, new_var = csr_mean_variance_axis0(X)
-    elif isinstance(X, sp.csc_matrix):
-        new_mean, new_var = csc_mean_variance_axis0(X)
+
+    if len(X_indptr) == shape[0] + 1:
+        # X is a CSR matrix
+        new_mean, new_var = _csr_mean_variance_axis0(X_data, shape, X_indices)
+    else:
+        # X is a CSC matrix
+        new_mean, new_var = _csc_mean_variance_axis0(X_data, shape, X_indices,
+                                                    X_indptr)
 
     # First pass
     if last_n == 0: