Add ddof parameter to sparsefuncs variance · scikit-learn/scikit-learn@633e3ca · GitHub
[go: up one dir, main page]

Skip to content

Commit 633e3ca

Browse files
Add ddof parameter to sparsefuncs variance
1 parent 4a7075a commit 633e3ca

File tree

3 files changed

+74
-17
lines changed

3 files changed

+74
-17
lines changed

sklearn/utils/sparsefuncs.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ def inplace_csr_row_scale(X, scale):
6161
X.data *= np.repeat(scale, np.diff(X.indptr))
6262

6363

64-
def mean_variance_axis(X, axis):
65-
"""Compute mean and variance along an axix on a CSR or CSC matrix
64+
def mean_variance_axis(X, axis, ddof=0):
65+
"""Compute mean and variance along an axis on a CSR or CSC matrix
6666
6767
Parameters
6868
----------
@@ -72,6 +72,11 @@ def mean_variance_axis(X, axis):
7272
axis : int (either 0 or 1)
7373
Axis along which the mean and variance should be computed.
7474
75+
ddof : int, optional
76+
“Delta Degrees of Freedom”: the divisor used in the calculation is
77+
``N - ddof``, where ``N`` represents the number of elements. By default
78+
ddof is zero.
79+
7580
Returns
7681
-------
7782
@@ -82,18 +87,24 @@ def mean_variance_axis(X, axis):
8287
Feature-wise variances
8388
8489
"""
90+
if ddof < 0:
91+
raise ValueError('ddof cannot be <0')
92+
93+
if ddof >= X.shape[axis]:
94+
raise ValueError('ddof cannot be <N')
95+
8596
_raise_error_wrong_axis(axis)
8697

8798
if isinstance(X, sp.csr_matrix):
8899
if axis == 0:
89-
return _csr_mean_var_axis0(X)
100+
return _csr_mean_var_axis0(X, ddof=ddof)
90101
else:
91-
return _csc_mean_var_axis0(X.T)
102+
return _csc_mean_var_axis0(X.T, ddof=ddof)
92103
elif isinstance(X, sp.csc_matrix):
93104
if axis == 0:
94-
return _csc_mean_var_axis0(X)
105+
return _csc_mean_var_axis0(X, ddof=ddof)
95106
else:
96-
return _csr_mean_var_axis0(X.T)
107+
return _csr_mean_var_axis0(X.T, ddof=ddof)
97108
else:
98109
_raise_typeerror(X)
99110

sklearn/utils/sparsefuncs_fast.pyx

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,19 @@ def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data,
5555
return norms
5656

5757

58-
def csr_mean_variance_axis0(X):
58+
def csr_mean_variance_axis0(X, ddof=0):
5959
"""Compute mean and variance along axis 0 on a CSR matrix
6060
6161
Parameters
6262
----------
6363
X : CSR sparse matrix, shape (n_samples, n_features)
6464
Input data.
6565
66+
ddof : int, optional
67+
“Delta Degrees of Freedom”: the divisor used in the calculation is
68+
``N - ddof``, where ``N`` represents the number of elements. By default
69+
ddof is zero.
70+
6671
Returns
6772
-------
6873
means : float array with shape (n_features,)
@@ -74,15 +79,17 @@ def csr_mean_variance_axis0(X):
7479
"""
7580
if X.dtype not in [np.float32, np.float64]:
7681
X = X.astype(np.float64)
77-
means, variances, _ = _csr_mean_variance_axis0(X.data, X.shape[0],
78-
X.shape[1], X.indices)
82+
means, variances, _ = _csr_mean_variance_axis0(
83+
X.data, X.shape[0], X.shape[1], X.indices, ddof
84+
)
7985
return means, variances
8086

8187

8288
def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
8389
unsigned long long n_samples,
8490
unsigned long long n_features,
85-
np.ndarray[integral, ndim=1] X_indices):
91+
np.ndarray[integral, ndim=1] X_indices,
92+
unsigned long long ddof=0):
8693
# Implement the function here since variables using fused types
8794
# cannot be declared directly and can only be passed as function arguments
8895
cdef:
@@ -130,19 +137,24 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
130137

131138
for i in xrange(n_features):
132139
variances[i] += (n_samples - counts_nan[i] - counts[i]) * means[i]**2
133-
variances[i] /= (n_samples - counts_nan[i])
140+
variances[i] /= (n_samples - ddof - counts_nan[i])
134141

135142
return means, variances, counts_nan
136143

137144

138-
def csc_mean_variance_axis0(X):
145+
def csc_mean_variance_axis0(X, ddof=0):
139146
"""Compute mean and variance along axis 0 on a CSC matrix
140147
141148
Parameters
142149
----------
143150
X : CSC sparse matrix, shape (n_samples, n_features)
144151
Input data.
145152
153+
ddof : int, optional
154+
“Delta Degrees of Freedom”: the divisor used in the calculation is
155+
``N - ddof``, where ``N`` represents the number of elements. By default
156+
ddof is zero.
157+
146158
Returns
147159
-------
148160
means : float array with shape (n_features,)
@@ -154,17 +166,18 @@ def csc_mean_variance_axis0(X):
154166
"""
155167
if X.dtype not in [np.float32, np.float64]:
156168
X = X.astype(np.float64)
157-
means, variances, _ = _csc_mean_variance_axis0(X.data, X.shape[0],
158-
X.shape[1], X.indices,
159-
X.indptr)
169+
means, variances, _ = _csc_mean_variance_axis0(
170+
X.data, X.shape[0], X.shape[1], X.indices, X.indptr, ddof
171+
)
160172
return means, variances
161173

162174

163175
def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
164176
unsigned long long n_samples,
165177
unsigned long long n_features,
166178
np.ndarray[integral, ndim=1] X_indices,
167-
np.ndarray[integral, ndim=1] X_indptr):
179+
np.ndarray[integral, ndim=1] X_indptr,
180+
unsigned long long ddof=0):
168181
# Implement the function here since variables using fused types
169182
# cannot be declared directly and can only be passed as function arguments
170183
cdef:
@@ -209,7 +222,7 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
209222
variances[i] += diff * diff
210223

211224
variances[i] += (n_samples - counts_nan[i] - counts) * means[i]**2
212-
variances[i] /= (n_samples - counts_nan[i])
225+
variances[i] /= (n_samples - ddof - counts_nan[i])
213226

214227
return means, variances, counts_nan
215228

sklearn/utils/tests/test_sparsefuncs.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,39 @@ def test_mean_variance_axis1():
8585
assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
8686

8787

88+
@pytest.mark.parametrize('axis', (0, 1))
89+
def test_mean_variance_with_ddof(axis):
90+
X, _ = make_classification(5, 4, random_state=0)
91+
# Sparsify the array a little bit
92+
X[[0, 2, 4], [0, 1, 3]] = 0
93+
94+
# Test on csr matrices
95+
X_csr = sp.csr_matrix(X)
96+
mean, variance = mean_variance_axis(X_csr, axis=axis, ddof=1)
97+
assert_allclose(mean, np.mean(X, axis=axis))
98+
assert_allclose(variance, np.var(X, axis=axis, ddof=1))
99+
100+
# Test on csc matrices
101+
X_csc = sp.csc_matrix(X)
102+
mean, variance = mean_variance_axis(X_csc, axis=axis, ddof=1)
103+
assert_allclose(mean, np.mean(X, axis=axis))
104+
assert_allclose(variance, np.var(X, axis=axis, ddof=1))
105+
106+
107+
def test_negative_ddof():
108+
X, _ = make_classification(5, 4, random_state=0)
109+
X = sp.csr_matrix(X)
110+
111+
assert_raises(ValueError, mean_variance_axis, X, axis=0, ddof=-5)
112+
113+
114+
def test_too_large_ddof():
115+
X, _ = make_classification(5, 4, random_state=0)
116+
X = sp.csr_matrix(X)
117+
118+
assert_raises(ValueError, mean_variance_axis, X, axis=0, ddof=10)
119+
120+
88121
def test_incr_mean_variance_axis():
89122
for axis in [0, 1]:
90123
rng = np.random.RandomState(0)

0 commit comments

Comments
 (0)
0