12
12
from libc.math cimport fabs, sqrt, pow
13
13
cimport numpy as np
14
14
import numpy as np
15
- import scipy.sparse as sp
16
15
cimport cython
17
16
from cython cimport floating
18
17
from numpy.math cimport isnan
@@ -74,21 +73,27 @@ def csr_mean_variance_axis0(X):
74
73
"""
75
74
if X.dtype not in [np.float32, np.float64]:
76
75
X = X.astype(np.float64)
77
- means, variances, _ = _csr_mean_variance_axis0(X.data, X.shape[0 ],
78
- X.shape[1 ], X.indices)
76
+
77
+ weights = np.ones(X.shape[0 ], dtype = X.dtype)
78
+
79
+ means, variances, _ = _csr_mean_variance_axis0(
80
+ X.data, X.shape[0 ], X.shape[1 ], X.indices, X.indptr, weights)
81
+
79
82
return means, variances
80
83
81
84
82
85
def _csr_mean_variance_axis0 (np.ndarray[floating , ndim = 1 , mode = " c" ] X_data,
83
86
unsigned long long n_samples ,
84
87
unsigned long long n_features ,
85
- np.ndarray[integral , ndim = 1 ] X_indices):
88
+ np.ndarray[integral , ndim = 1 ] X_indices,
89
+ np.ndarray[integral , ndim = 1 ] X_indptr,
90
+ np.ndarray[floating , ndim = 1 ] weights):
86
91
# Implement the function here since variables using fused types
87
92
# cannot be declared directly and can only be passed as function arguments
88
93
cdef:
89
94
np.npy_intp i
90
- unsigned long long non_zero = X_indices.shape[ 0 ]
91
- np.npy_intp col_ind
95
+ unsigned long long row_ind
96
+ integral col_ind
92
97
floating diff
93
98
# means[j] contains the mean of feature j
94
99
np.ndarray[floating, ndim= 1 ] means
@@ -104,29 +109,29 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
104
109
variances = np.zeros_like(means, dtype = dtype)
105
110
106
111
cdef:
107
- # counts[j] contains the number of samples where feature j is non-zero
108
- np.ndarray[np.int64_t, ndim= 1 ] counts = np.zeros(n_features,
109
- dtype = np.int64)
110
- # counts_nan[j] contains the number of NaNs for feature j
111
- np.ndarray[np.int64_t, ndim= 1 ] counts_nan = np.zeros(n_features,
112
- dtype = np.int64)
113
-
114
- for i in range (non_zero):
115
- col_ind = X_indices[i]
116
- if not isnan(X_data[i]):
117
- means[col_ind] += X_data[i]
118
- else :
119
- counts_nan[col_ind] += 1
112
+ np.ndarray[floating, ndim= 1 ] counts = np.zeros(
113
+ n_features, dtype = dtype)
114
+ np.ndarray[floating, ndim= 1 ] counts_nan = np.zeros(
115
+ n_features, dtype = dtype)
116
+
117
+ for row_ind in range (len (X_indptr) - 1 ):
118
+ for i in range (X_indptr[row_ind], X_indptr[row_ind + 1 ]):
119
+ col_ind = X_indices[i]
120
+ if not isnan(X_data[i]):
121
+ means[col_ind] += (X_data[i] * weights[row_ind])
122
+ else :
123
+ counts_nan[col_ind] += weights[row_ind]
120
124
121
125
for i in range (n_features):
122
126
means[i] /= (n_samples - counts_nan[i])
123
127
124
- for i in range (non_zero):
125
- col_ind = X_indices[i]
126
- if not isnan(X_data[i]):
127
- diff = X_data[i] - means[col_ind]
128
- variances[col_ind] += diff * diff
129
- counts[col_ind] += 1
128
+ for row_ind in range (len (X_indptr) - 1 ):
129
+ for i in range (X_indptr[row_ind], X_indptr[row_ind + 1 ]):
130
+ col_ind = X_indices[i]
131
+ if not isnan(X_data[i]):
132
+ diff = X_data[i] - means[col_ind]
133
+ variances[col_ind] += diff * diff * weights[row_ind]
134
+ counts[col_ind] += weights[row_ind]
130
135
131
136
for i in range (n_features):
132
137
variances[i] += (n_samples - counts_nan[i] - counts[i]) * means[i]** 2
@@ -154,23 +159,25 @@ def csc_mean_variance_axis0(X):
154
159
"""
155
160
if X.dtype not in [np.float32, np.float64]:
156
161
X = X.astype(np.float64)
157
- means, variances, _ = _csc_mean_variance_axis0(X.data, X.shape[0 ],
158
- X.shape[1 ], X.indices,
159
- X.indptr)
162
+
163
+ weights = np.ones(X.shape[0 ], dtype = X.dtype)
164
+ means, variances, _ = _csc_mean_variance_axis0(
165
+ X.data, X.shape[0 ], X.shape[1 ], X.indices, X.indptr, weights)
160
166
return means, variances
161
167
162
168
163
- def _csc_mean_variance_axis0 (np.ndarray[floating , ndim = 1 ] X_data,
169
+ def _csc_mean_variance_axis0 (np.ndarray[floating , ndim = 1 , mode = " c " ] X_data,
164
170
unsigned long long n_samples ,
165
171
unsigned long long n_features ,
166
172
np.ndarray[integral , ndim = 1 ] X_indices,
167
- np.ndarray[integral , ndim = 1 ] X_indptr):
173
+ np.ndarray[integral , ndim = 1 ] X_indptr,
174
+ np.ndarray[floating , ndim = 1 ] weights):
168
175
# Implement the function here since variables using fused types
169
176
# cannot be declared directly and can only be passed as function arguments
170
177
cdef:
171
- np.npy_intp i, j
172
- unsigned long long counts
173
- unsigned long long startptr
178
+ np.npy_intp i
179
+ unsigned long long col_ind
180
+ integral row_ind
174
181
floating diff
175
182
# means[j] contains the mean of feature j
176
183
np.ndarray[floating, ndim= 1 ] means
@@ -185,35 +192,39 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
185
192
means = np.zeros(n_features, dtype = dtype)
186
193
variances = np.zeros_like(means, dtype = dtype)
187
194
188
- cdef np.ndarray[np.int64_t, ndim= 1 ] counts_nan = np.zeros(n_features,
189
- dtype = np.int64)
195
+ cdef:
196
+ np.ndarray[floating, ndim= 1 ] counts = \
197
+ np.zeros(n_features, dtype = dtype)
198
+ np.ndarray[floating, ndim= 1 ] counts_nan = \
199
+ np.zeros(n_features, dtype = dtype)
200
+
201
+ for col_ind in range (n_features):
202
+ for i in range (X_indptr[col_ind], X_indptr[col_ind + 1 ]):
203
+ row_ind = X_indices[i]
204
+ if not isnan(X_data[i]):
205
+ means[col_ind] += (X_data[i] * weights[row_ind])
206
+ else :
207
+ counts_nan[col_ind] += weights[row_ind]
190
208
191
209
for i in range (n_features):
192
-
193
- startptr = X_indptr[i]
194
- endptr = X_indptr[i + 1 ]
195
- counts = endptr - startptr
196
-
197
- for j in range (startptr, endptr):
198
- if not isnan(X_data[j]):
199
- means[i] += X_data[j]
200
- else :
201
- counts_nan[i] += 1
202
- counts -= counts_nan[i]
203
210
means[i] /= (n_samples - counts_nan[i])
204
211
205
- for j in range (startptr, endptr):
206
- if not isnan(X_data[j]):
207
- diff = X_data[j] - means[i]
208
- variances[i] += diff * diff
212
+ for col_ind in range (n_features):
213
+ for i in range (X_indptr[col_ind], X_indptr[col_ind + 1 ]):
214
+ row_ind = X_indices[i]
215
+ if not isnan(X_data[i]):
216
+ diff = X_data[i] - means[col_ind]
217
+ variances[col_ind] += diff * diff * weights[row_ind]
218
+ counts[col_ind] += weights[row_ind]
209
219
210
- variances[i] += (n_samples - counts_nan[i] - counts) * means[i]** 2
220
+ for i in range (n_features):
221
+ variances[i] += (n_samples - counts_nan[i] - counts[i]) * means[i]** 2
211
222
variances[i] /= (n_samples - counts_nan[i])
212
223
213
224
return means, variances, counts_nan
214
225
215
226
216
- def incr_mean_variance_axis0 (X , last_mean , last_var , last_n ):
227
+ def incr_mean_variance_axis0 (X , last_mean , last_var , last_n , weights = None ):
217
228
""" Compute mean and variance along axis 0 on a CSR or CSC matrix.
218
229
219
230
last_mean, last_var are the statistics computed at the last step by this
@@ -231,8 +242,12 @@ def incr_mean_variance_axis0(X, last_mean, last_var, last_n):
231
242
last_var : float array with shape (n_features,)
232
243
Array of feature-wise var to update with the new data X.
233
244
234
- last_n : int array with shape (n_features,)
235
- Number of samples seen so far, before X.
245
+ last_n : float array with shape (n_features,)
246
+ Sum of the weights seen so far (if weights are all set to 1
247
+ this will be the same as number of samples seen so far, before X).
248
+
249
+ weights : float array with shape (n_samples,) or None. If it is set
250
+ to None samples will be equally weighted.
236
251
237
252
Returns
238
253
-------
@@ -261,20 +276,38 @@ def incr_mean_variance_axis0(X, last_mean, last_var, last_n):
261
276
"""
262
277
if X.dtype not in [np.float32, np.float64]:
263
278
X = X.astype(np.float64)
264
- return _incr_mean_variance_axis0(X.data, X.shape[0 ], X.shape[1 ], X.indices,
265
- X.indptr, X.format, last_mean, last_var,
266
- last_n)
279
+ X_dtype = X.dtype
280
+ if weights is None :
281
+ weights = np.ones(X.shape[0 ], dtype = X_dtype)
282
+ elif weights.dtype not in [np.float32, np.float64]:
283
+ weights = weights.astype(np.float64, copy = False )
284
+ if last_n.dtype not in [np.float32, np.float64]:
285
+ last_n = last_n.astype(np.float64, copy = False )
286
+
287
+ return _incr_mean_variance_axis0(X.data,
288
+ np.sum(weights),
289
+ X.shape[1 ],
290
+ X.indices,
291
+ X.indptr,
292
+ X.format,
293
+ last_mean.astype(X_dtype, copy = False ),
294
+ last_var.astype(X_dtype, copy = False ),
295
+ last_n.astype(X_dtype, copy = False ),
296
+ weights.astype(X_dtype, copy = False ))
267
297
268
298
269
299
def _incr_mean_variance_axis0 (np.ndarray[floating , ndim = 1 ] X_data,
270
- unsigned long long n_samples ,
300
+ floating n_samples ,
271
301
unsigned long long n_features ,
272
- np.ndarray[integral , ndim = 1 ] X_indices,
302
+ np.ndarray[int , ndim = 1 ] X_indices,
303
+ # X_indptr might be either in32 or int64
273
304
np.ndarray[integral , ndim = 1 ] X_indptr,
274
305
str X_format ,
275
306
np.ndarray[floating , ndim = 1 ] last_mean,
276
307
np.ndarray[floating , ndim = 1 ] last_var,
277
- np.ndarray[np.int64_t , ndim = 1 ] last_n):
308
+ np.ndarray[floating , ndim = 1 ] last_n,
309
+ # previous sum of the weights (ie float )
310
+ np.ndarray[floating , ndim = 1 ] weights):
278
311
# Implement the function here since variables using fused types
279
312
# cannot be declared directly and can only be passed as function arguments
280
313
cdef:
@@ -301,24 +334,23 @@ def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
301
334
updated_var = np.zeros_like(new_mean, dtype = dtype)
302
335
303
336
cdef:
304
- np.ndarray[np.int64_t , ndim= 1 ] new_n
305
- np.ndarray[np.int64_t , ndim= 1 ] updated_n
337
+ np.ndarray[floating , ndim= 1 ] new_n
338
+ np.ndarray[floating , ndim= 1 ] updated_n
306
339
np.ndarray[floating, ndim= 1 ] last_over_new_n
307
- np.ndarray[np.int64_t , ndim= 1 ] counts_nan
340
+ np.ndarray[floating , ndim= 1 ] counts_nan
308
341
309
342
# Obtain new stats first
310
- new_n = np.full(n_features, n_samples, dtype = np.int64 )
311
- updated_n = np.zeros_like(new_n, dtype = np.int64 )
343
+ new_n = np.full(n_features, n_samples, dtype = dtype )
344
+ updated_n = np.zeros_like(new_n, dtype = dtype )
312
345
last_over_new_n = np.zeros_like(new_n, dtype = dtype)
313
346
347
+ # X can be a CSR or CSC matrix
314
348
if X_format == ' csr' :
315
- # X is a CSR matrix
316
349
new_mean, new_var, counts_nan = _csr_mean_variance_axis0(
317
- X_data, n_samples, n_features, X_indices)
318
- else :
319
- # X is a CSC matrix
350
+ X_data, n_samples, n_features, X_indices, X_indptr, weights)
351
+ else : # X_format == 'csc'
320
352
new_mean, new_var, counts_nan = _csc_mean_variance_axis0(
321
- X_data, n_samples, n_features, X_indices, X_indptr)
353
+ X_data, n_samples, n_features, X_indices, X_indptr, weights )
322
354
323
355
for i in range (n_features):
324
356
new_n[i] -= counts_nan[i]
0 commit comments