[MRG+2] Use fused types in sparse mean variance functions (#6593) · mannby/scikit-learn@22c3af3 · GitHub
[go: up one dir, main page]

Skip to content

Commit 22c3af3

Browse files
yenchenlin authored and Claes-Fredrik Mannby committed
[MRG+2] Use fused types in sparse mean variance functions (scikit-learn#6593)
* Use fused types in mean variance functions * Add test for mean-variance functions using fused types * Add whats_new
1 parent 167b0e5 commit 22c3af3

File tree

3 files changed

+135
-95
lines changed

3 files changed

+135
-95
lines changed

doc/whats_new.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ Enhancements
131131
- Add option to show ``indicator features`` in the output of Imputer.
132132
By `Mani Teja`_.
133133

134+
- Reduce the memory usage for 32-bit float input arrays of :func:`utils.mean_variance_axis` and
135+
:func:`utils.incr_mean_variance_axis` by supporting cython fused types. By `YenChen Lin`_.
136+
134137
Bug fixes
135138
.........
136139

sklearn/utils/sparsefuncs_fast.pyx

Lines changed: 85 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -64,24 +64,36 @@ def csr_mean_variance_axis0(X):
6464
Feature-wise variances
6565
6666
"""
67-
cdef unsigned int n_samples = X.shape[0]
68-
cdef unsigned int n_features = X.shape[1]
67+
if X.dtype != np.float32:
68+
X = X.astype(np.float64)
69+
return _csr_mean_variance_axis0(X.data, X.shape, X.indices)
6970

70-
cdef np.ndarray[DOUBLE, ndim=1, mode="c"] X_data
71-
X_data = np.asarray(X.data, dtype=np.float64) # might copy!
72-
cdef np.ndarray[int, ndim=1] X_indices = X.indices
71+
72+
def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
73+
shape,
74+
np.ndarray[int, ndim=1] X_indices):
75+
# Implement the function here since variables using fused types
76+
# cannot be declared directly and can only be passed as function arguments
77+
cdef unsigned int n_samples = shape[0]
78+
cdef unsigned int n_features = shape[1]
7379

7480
cdef unsigned int i
7581
cdef unsigned int non_zero = X_indices.shape[0]
7682
cdef unsigned int col_ind
77-
cdef double diff
83+
cdef floating diff
7884

7985
# means[j] contains the mean of feature j
80-
cdef np.ndarray[DOUBLE, ndim=1] means = np.zeros(n_features,
81-
dtype=np.float64)
82-
86+
cdef np.ndarray[floating, ndim=1] means
8387
# variances[j] contains the variance of feature j
84-
cdef np.ndarray[DOUBLE, ndim=1] variances = np.zeros_like(means)
88+
cdef np.ndarray[floating, ndim=1] variances
89+
90+
if floating is float:
91+
dtype = np.float32
92+
else:
93+
dtype = np.float64
94+
95+
means = np.zeros(n_features, dtype=dtype)
96+
variances = np.zeros_like(means, dtype=dtype)
8597

8698
# counts[j] contains the number of samples where feature j is non-zero
8799
cdef np.ndarray[int, ndim=1] counts = np.zeros(n_features,
@@ -124,27 +136,38 @@ def csc_mean_variance_axis0(X):
124136
Feature-wise variances
125137
126138
"""
127-
cdef unsigned int n_samples = X.shape[0]
128-
cdef unsigned int n_features = X.shape[1]
139+
if X.dtype != np.float32:
140+
X = X.astype(np.float64)
141+
return _csc_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr)
142+
129143

130-
cdef np.ndarray[DOUBLE, ndim=1] X_data
131-
X_data = np.asarray(X.data, dtype=np.float64) # might copy!
132-
cdef np.ndarray[int, ndim=1] X_indices = X.indices
133-
cdef np.ndarray[int, ndim=1] X_indptr = X.indptr
144+
def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
145+
shape,
146+
np.ndarray[int, ndim=1] X_indices,
147+
np.ndarray[int, ndim=1] X_indptr):
148+
# Implement the function here since variables using fused types
149+
# cannot be declared directly and can only be passed as function arguments
150+
cdef unsigned int n_samples = shape[0]
151+
cdef unsigned int n_features = shape[1]
134152

135153
cdef unsigned int i
136154
cdef unsigned int j
137155
cdef unsigned int counts
138156
cdef unsigned int startptr
139157
cdef unsigned int endptr
140-
cdef double diff
158+
cdef floating diff
141159

142160
# means[j] contains the mean of feature j
143-
cdef np.ndarray[DOUBLE, ndim=1] means = np.zeros(n_features,
144-
dtype=np.float64)
145-
161+
cdef np.ndarray[floating, ndim=1] means
146162
# variances[j] contains the variance of feature j
147-
cdef np.ndarray[DOUBLE, ndim=1] variances = np.zeros_like(means)
163+
cdef np.ndarray[floating, ndim=1] variances
164+
if floating is float:
165+
dtype = np.float32
166+
else:
167+
dtype = np.float64
168+
169+
means = np.zeros(n_features, dtype=dtype)
170+
variances = np.zeros_like(means, dtype=dtype)
148171

149172
for i in xrange(n_features):
150173

@@ -210,29 +233,58 @@ def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n):
210233
`utils.extmath._batch_mean_variance_update`.
211234
212235
"""
213-
cdef unsigned long n_samples = X.shape[0]
214-
cdef unsigned int n_features = X.shape[1]
236+
if X.dtype != np.float32:
237+
X = X.astype(np.float64)
238+
return _incr_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr,
239+
X.format, last_mean, last_var, last_n)
240+
241+
242+
def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
243+
shape,
244+
np.ndarray[int, ndim=1] X_indices,
245+
np.ndarray[int, ndim=1] X_indptr,
246+
X_format,
247+
last_mean,
248+
last_var,
249+
unsigned long last_n):
250+
# Implement the function here since variables using fused types
251+
# cannot be declared directly and can only be passed as function arguments
252+
cdef unsigned long n_samples = shape[0]
253+
cdef unsigned int n_features = shape[1]
215254
cdef unsigned int i
216255

217256
# last = stats until now
218257
# new = the current increment
219258
# updated = the aggregated stats
220259
# when arrays, they are indexed by i per-feature
221-
cdef np.ndarray[DOUBLE, ndim=1] new_mean = np.zeros(n_features,
222-
dtype=np.float64)
223-
cdef np.ndarray[DOUBLE, ndim=1] new_var = np.zeros_like(new_mean)
260+
cdef np.ndarray[floating, ndim=1] new_mean
261+
cdef np.ndarray[floating, ndim=1] new_var
262+
cdef np.ndarray[floating, ndim=1] updated_mean
263+
cdef np.ndarray[floating, ndim=1] updated_var
264+
if floating is float:
265+
dtype = np.float32
266+
else:
267+
dtype = np.float64
268+
269+
new_mean = np.zeros(n_features, dtype=dtype)
270+
new_var = np.zeros_like(new_mean, dtype=dtype)
271+
updated_mean = np.zeros_like(new_mean, dtype=dtype)
272+
updated_var = np.zeros_like(new_mean, dtype=dtype)
273+
224274
cdef unsigned long new_n
225-
cdef np.ndarray[DOUBLE, ndim=1] updated_mean = np.zeros_like(new_mean)
226-
cdef np.ndarray[DOUBLE, ndim=1] updated_var = np.zeros_like(new_mean)
227275
cdef unsigned long updated_n
228-
cdef DOUBLE last_over_new_n
276+
cdef floating last_over_new_n
229277

230278
# Obtain new stats first
231279
new_n = n_samples
232-
if isinstance(X, sp.csr_matrix):
233-
new_mean, new_var = csr_mean_variance_axis0(X)
234-
elif isinstance(X, sp.csc_matrix):
235-
new_mean, new_var = csc_mean_variance_axis0(X)
280+
281+
if X_format == 'csr':
282+
# X is a CSR matrix
283+
new_mean, new_var = _csr_mean_variance_axis0(X_data, shape, X_indices)
284+
else:
285+
# X is a CSC matrix
286+
new_mean, new_var = _csc_mean_variance_axis0(X_data, shape, X_indices,
287+
X_indptr)
236288

237289
# First pass
238290
if last_n == 0:

sklearn/utils/tests/test_sparsefuncs.py

Lines changed: 47 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -30,29 +30,26 @@ def test_mean_variance_axis0():
3030
X_lil = sp.lil_matrix(X)
3131
X_lil[1, 0] = 0
3232
X[1, 0] = 0
33-
X_csr = sp.csr_matrix(X_lil)
3433

35-
X_means, X_vars = mean_variance_axis(X_csr, axis=0)
36-
assert_array_almost_equal(X_means, np.mean(X, axis=0))
37-
assert_array_almost_equal(X_vars, np.var(X, axis=0))
34+
assert_raises(TypeError, mean_variance_axis, X_lil, axis=0)
3835

36+
X_csr = sp.csr_matrix(X_lil)
3937
X_csc = sp.csc_matrix(X_lil)
40-
X_means, X_vars = mean_variance_axis(X_csc, axis=0)
4138

42-
assert_array_almost_equal(X_means, np.mean(X, axis=0))
43-
assert_array_almost_equal(X_vars, np.var(X, axis=0))
44-
assert_raises(TypeError, mean_variance_axis, X_lil, axis=0)
39+
expected_dtypes = [(np.float32, np.float32),
40+
(np.float64, np.float64),
41+
(np.int32, np.float64),
42+
(np.int64, np.float64)]
4543

46-
X = X.astype(np.float32)
47-
X_csr = X_csr.astype(np.float32)
48-
X_csc = X_csr.astype(np.float32)
49-
X_means, X_vars = mean_variance_axis(X_csr, axis=0)
50-
assert_array_almost_equal(X_means, np.mean(X, axis=0))
51-
assert_array_almost_equal(X_vars, np.var(X, axis=0))
52-
X_means, X_vars = mean_variance_axis(X_csc, axis=0)
53-
assert_array_almost_equal(X_means, np.mean(X, axis=0))
54-
assert_array_almost_equal(X_vars, np.var(X, axis=0))
55-
assert_raises(TypeError, mean_variance_axis, X_lil, axis=0)
44+
for input_dtype, output_dtype in expected_dtypes:
45+
X_test = X.astype(input_dtype)
46+
for X_sparse in (X_csr, X_csc):
47+
X_sparse = X_sparse.astype(input_dtype)
48+
X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
49+
assert_equal(X_means.dtype, output_dtype)
50+
assert_equal(X_vars.dtype, output_dtype)
51+
assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
52+
assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
5653

5754

5855
def test_mean_variance_axis1():
@@ -64,29 +61,26 @@ def test_mean_variance_axis1():
6461
X_lil = sp.lil_matrix(X)
6562
X_lil[1, 0] = 0
6663
X[1, 0] = 0
67-
X_csr = sp.csr_matrix(X_lil)
6864

69-
X_means, X_vars = mean_variance_axis(X_csr, axis=1)
70-
assert_array_almost_equal(X_means, np.mean(X, axis=1))
71-
assert_array_almost_equal(X_vars, np.var(X, axis=1))
65+
assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)
7266

67+
X_csr = sp.csr_matrix(X_lil)
7368
X_csc = sp.csc_matrix(X_lil)
74-
X_means, X_vars = mean_variance_axis(X_csc, axis=1)
7569

76-
assert_array_almost_equal(X_means, np.mean(X, axis=1))
77-
assert_array_almost_equal(X_vars, np.var(X, axis=1))
78-
assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)
70+
expected_dtypes = [(np.float32, np.float32),
71+
(np.float64, np.float64),
72+
(np.int32, np.float64),
73+
(np.int64, np.float64)]
7974

80-
X = X.astype(np.float32)
81-
X_csr = X_csr.astype(np.float32)
82-
X_csc = X_csr.astype(np.float32)
83-
X_means, X_vars = mean_variance_axis(X_csr, axis=1)
84-
assert_array_almost_equal(X_means, np.mean(X, axis=1))
85-
assert_array_almost_equal(X_vars, np.var(X, axis=1))
86-
X_means, X_vars = mean_variance_axis(X_csc, axis=1)
87-
assert_array_almost_equal(X_means, np.mean(X, axis=1))
88-
assert_array_almost_equal(X_vars, np.var(X, axis=1))
89-
assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)
75+
for input_dtype, output_dtype in expected_dtypes:
76+
X_test = X.astype(input_dtype)
77+
for X_sparse in (X_csr, X_csc):
78+
X_sparse = X_sparse.astype(input_dtype)
79+
X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
80+
assert_equal(X_means.dtype, output_dtype)
81+
assert_equal(X_vars.dtype, output_dtype)
82+
assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
83+
assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
9084

9185

9286
def test_incr_mean_variance_axis():
@@ -132,34 +126,25 @@ def test_incr_mean_variance_axis():
132126
X = np.vstack(data_chunks)
133127
X_lil = sp.lil_matrix(X)
134128
X_csr = sp.csr_matrix(X_lil)
135-
X_means, X_vars = mean_variance_axis(X_csr, axis)
136-
X_means_incr, X_vars_incr, n_incr = \
137-
incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
138-
assert_array_almost_equal(X_means, X_means_incr)
139-
assert_array_almost_equal(X_vars, X_vars_incr)
140-
assert_equal(X.shape[axis], n_incr)
141-
142129
X_csc = sp.csc_matrix(X_lil)
143-
X_means, X_vars = mean_variance_axis(X_csc, axis)
144-
assert_array_almost_equal(X_means, X_means_incr)
145-
assert_array_almost_equal(X_vars, X_vars_incr)
146-
assert_equal(X.shape[axis], n_incr)
147130

148-
# All data but as float
149-
X = X.astype(np.float32)
150-
X_csr = X_csr.astype(np.float32)
151-
X_means, X_vars = mean_variance_axis(X_csr, axis)
152-
X_means_incr, X_vars_incr, n_incr = \
153-
incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n)
154-
assert_array_almost_equal(X_means, X_means_incr)
155-
assert_array_almost_equal(X_vars, X_vars_incr)
156-
assert_equal(X.shape[axis], n_incr)
157-
158-
X_csc = X_csr.astype(np.float32)
159-
X_means, X_vars = mean_variance_axis(X_csc, axis)
160-
assert_array_almost_equal(X_means, X_means_incr)
161-
assert_array_almost_equal(X_vars, X_vars_incr)
162-
assert_equal(X.shape[axis], n_incr)
131+
expected_dtypes = [(np.float32, np.float32),
132+
(np.float64, np.float64),
133+
(np.int32, np.float64),
134+
(np.int64, np.float64)]
135+
136+
for input_dtype, output_dtype in expected_dtypes:
137+
for X_sparse in (X_csr, X_csc):
138+
X_sparse = X_sparse.astype(input_dtype)
139+
X_means, X_vars = mean_variance_axis(X_sparse, axis)
140+
X_means_incr, X_vars_incr, n_incr = \
141+
incr_mean_variance_axis(X_sparse, axis, last_mean,
142+
last_var, last_n)
143+
assert_equal(X_means_incr.dtype, output_dtype)
144+
assert_equal(X_vars_incr.dtype, output_dtype)
145+
assert_array_almost_equal(X_means, X_means_incr)
146+
assert_array_almost_equal(X_vars, X_vars_incr)
147+
assert_equal(X.shape[axis], n_incr)
163148

164149

165150
def test_mean_variance_illegal_axis():

0 commit comments

Comments
 (0)
0