@@ -33,42 +33,32 @@ def _get_mask(X, value_to_mask):
33
33
return X == value_to_mask
34
34
35
35
36
def _get_median(data, n_zeros):
    """Compute the median of data with n_zeros additional zeros.

    This function is used to support sparse matrices; it modifies data
    in-place (data is sorted in ascending order as a side effect).

    Parameters
    ----------
    data : array
        Explicitly stored values. Must support ``data < 0`` and an
        in-place ``data.sort()`` (a NumPy array at the visible call site).
    n_zeros : int
        Number of implicit zeros to count in addition to ``data``. It
        should be an integer so that the computed ranks are valid indices.

    Returns
    -------
    median : scalar
        ``np.nan`` when ``data`` is empty and ``n_zeros`` is 0; otherwise
        the median of ``data`` augmented with ``n_zeros`` zeros.
    """
    n_elems = len(data) + n_zeros
    if not n_elems:
        return np.nan
    n_negative = np.count_nonzero(data < 0)
    middle, is_odd = divmod(n_elems, 2)
    # _get_elem_at_rank indexes into data by rank, so data must be sorted.
    data.sort()

    if is_odd:
        return _get_elem_at_rank(middle, data, n_negative, n_zeros)

    # Even number of elements: average the two central values.
    return (_get_elem_at_rank(middle - 1, data, n_negative, n_zeros) +
            _get_elem_at_rank(middle, data, n_negative, n_zeros)) / 2.


def _get_elem_at_rank(rank, data, n_negative, n_zeros):
    """Find the value in data augmented with n_zeros for the given rank.

    ``data`` is expected to be already sorted in ascending order and
    ``n_negative`` to be the number of its strictly negative entries. The
    ``n_zeros`` extra zeros are treated as sitting right after the negative
    entries, without ever being materialised.

    Parameters
    ----------
    rank : int
        0-based position in the augmented, sorted sequence.
    data : array
        Sorted explicit values.
    n_negative : int
        Number of entries of ``data`` that are < 0.
    n_zeros : int
        Number of additional zeros.

    Returns
    -------
    value : scalar
        ``data[rank]`` for ranks among the negatives, ``0`` for ranks that
        fall on one of the additional zeros, ``data[rank - n_zeros]``
        otherwise.
    """
    if rank < n_negative:
        return data[rank]
    if rank - n_negative < n_zeros:
        return 0
    # Past the block of additional zeros: shift the rank back into data.
    return data[rank - n_zeros]
72
62
73
63
74
64
def _most_frequent (array , extra_value , n_repeat ):
@@ -137,8 +127,8 @@ class Imputer(BaseEstimator, TransformerMixin):
137
127
138
128
Attributes
139
129
----------
140
- `statistics_` : array of shape (n_features,) or (n_samples,)
141
- The statistics along the imputation axis.
130
+ `statistics_` : array of shape (n_features,)
131
+ The imputation fill value for each feature if axis == 0.
142
132
143
133
Notes
144
134
-----
@@ -211,7 +201,7 @@ def _sparse_fit(self, X, strategy, missing_values, axis):
211
201
212
202
# Count the zeros
213
203
if missing_values == 0 :
214
- n_zeros_axis = np .zeros (X .shape [not axis ])
204
+ n_zeros_axis = np .zeros (X .shape [not axis ], dtype = int )
215
205
else :
216
206
n_zeros_axis = X .shape [axis ] - np .diff (X .indptr )
217
207
@@ -257,19 +247,15 @@ def _sparse_fit(self, X, strategy, missing_values, axis):
257
247
mask_valids = np .hsplit (np .logical_not (mask_missing_values ),
258
248
X .indptr [1 :- 1 ])
259
249
260
- columns = [col [mask .astype (np .bool )]
250
+ # astype necessary for bug in numpy.hsplit before v1.9
251
+ columns = [col [mask .astype (bool , copy = False )]
261
252
for col , mask in zip (columns_all , mask_valids )]
262
253
263
254
# Median
264
255
if strategy == "median" :
265
256
median = np .empty (len (columns ))
266
257
for i , column in enumerate (columns ):
267
-
268
- negatives = column [column < 0 ]
269
- positives = column [column > 0 ]
270
- median [i ] = _get_median (negatives ,
271
- n_zeros_axis [i ],
272
- positives )
258
+ median [i ] = _get_median (column , n_zeros_axis [i ])
273
259
274
260
return median
275
261
0 commit comments