FIX: Modify the indicator features functionality · maniteja123/scikit-learn@e015eca · GitHub
[go: up one dir, main page]

Skip to content

Commit e015eca

Browse files
committed
FIX: Modify the indicator features functionality
1 parent 8f199fe commit e015eca

File tree

2 files changed

+77
-30
lines changed

2 files changed

+77
-30
lines changed

sklearn/preprocessing/imputation.py

Lines changed: 19 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,13 @@ def fit(self, X, y=None):
177177
self.strategy,
178178
self.missing_values,
179179
self.axis)
180+
statistics = self.statistics_
181+
invalid_mask = np.isnan(statistics)
182+
valid_mask = np.logical_not(invalid_mask)
183+
valid_idx = np.where(valid_mask)[0]
184+
# print X, self.axis,valid_idx, self.features_with_missing_values_
185+
self.imputed_features_ = valid_idx
186+
180187

181188
return self
182189

@@ -189,6 +196,13 @@ def _sparse_fit(self, X, strategy, missing_values, axis):
189196
else:
190197
X = X.tocsc()
191198

199+
mask = _get_mask(X.data, self.missing_values)
200+
mask_matrix = X.__class__((mask, X.indices.copy(),
201+
X.indptr.copy()), shape=X.shape,
202+
dtype=X.dtype)
203+
mask_matrix.eliminate_zeros() # removes explicit False entries
204+
self.features_with_missing_values_ = mask_matrix.sum(axis=0).A.nonzero()[1]
205+
self.imputed_mask_ = mask_matrix
192206
# Count the zeros
193207
if missing_values == 0:
194208
n_zeros_axis = np.zeros(X.shape[not axis], dtype=int)
@@ -265,7 +279,9 @@ def _dense_fit(self, X, strategy, missing_values, axis):
265279
X = check_array(X, force_all_finite=False)
266280
mask = _get_mask(X, missing_values)
267281
masked_X = ma.masked_array(X, mask=mask)
268-
282+
self.features_with_missing_values_ = np.where(np.any
283+
(mask, axis=0))[0]
284+
self.imputed_mask_ = mask
269285
# Mean
270286
if strategy == "mean":
271287
mean_masked = np.ma.mean(masked_X, axis=axis)
@@ -320,20 +336,8 @@ def _sparse_transform(self, X, valid_stats, valid_idx):
320336
X.data[mask] = astype(valid_stats[indexes], X.dtype,
321337
copy=False)
322338

323-
mask_matrix = X.__class__((mask, X.indices.copy(),
324-
X.indptr.copy()), shape=X.shape,
325-
dtype=X.dtype)
326-
mask_matrix.eliminate_zeros() # removes explicit False entries
327-
features_with_missing_values = mask_matrix.sum(axis=0).A.nonzero()[1]
328-
features_mask = safe_mask(mask_matrix, features_with_missing_values)
329-
imputed_mask = mask_matrix[:, features_mask]
330-
if self.axis == 0:
331-
self.imputed_features_ = valid_idx[features_with_missing_values]
332-
else:
333-
self.imputed_features_ = features_with_missing_values
334-
335339
if self.add_indicator_features:
336-
X = sparse.hstack((X, imputed_mask))
340+
X = sparse.hstack((X, self.imputed_mask_))
337341

338342
return X
339343

@@ -350,16 +354,8 @@ def _dense_transform(self, X, valid_stats, valid_idx):
350354

351355
X[coordinates] = values
352356

353-
features_with_missing_values = np.where(np.any
354-
(mask, axis=0))[0]
355-
imputed_mask = mask[:, features_with_missing_values]
356-
if self.axis == 0:
357-
self.imputed_features_ = valid_idx[features_with_missing_values]
358-
else:
359-
self.imputed_features_ = features_with_missing_values
360-
361357
if self.add_indicator_features:
362-
X = np.hstack((X, imputed_mask))
358+
X = np.hstack((X, self.imputed_mask_))
363359

364360
return X
365361

sklearn/preprocessing/tests/test_imputation.py

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -367,14 +367,13 @@ def check_indicator(X, expected_imputed_features, axis):
367367
imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
368368
Xt = imputer.fit_transform(X)
369369
Xt_with_in = imputer_with_in.fit_transform(X)
370-
imputed_features_mask = X[:, expected_imputed_features] == -1
370+
imputed_features_mask = X == -1
371371
n_features_new = Xt.shape[1]
372-
n_imputed_features = len(imputer_with_in.imputed_features_)
373-
assert_array_equal(imputer.imputed_features_, expected_imputed_features)
374-
assert_array_equal(imputer_with_in.imputed_features_,
375-
expected_imputed_features)
376-
assert_equal(Xt_with_in.shape,
377-
(n_samples, n_features_new + n_imputed_features))
372+
if axis==0:
373+
assert_array_equal(imputer.imputed_features_, expected_imputed_features)
374+
assert_array_equal(imputer_with_in.imputed_features_,
375+
expected_imputed_features)
376+
assert_equal(Xt_with_in.shape, (n_samples, n_features + n_features_new))
378377
assert_array_equal(Xt_with_in, np.hstack((Xt, imputed_features_mask)))
379378
imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
380379
assert_array_equal(Xt_with_in,
@@ -406,3 +405,55 @@ def test_indicator_features():
406405
])
407406
check_indicator(X, np.array([0, 3]), axis=0)
408407
check_indicator(X, np.array([0, 1, 3]), axis=1)
408+
409+
def test_indicator_features_shape():
410+
411+
X1 = np.array([
412+
[-1, -1, 2, 3],
413+
[4, -1, 6, -1],
414+
[8, -1, 10, 11],
415+
[12, -1, -1, 15],
416+
[16, -1, 18, 19]
417+
])
418+
419+
X2 = np.array([
420+
[-1, -1, 1, 3],
421+
[4, -1, 0, -1],
422+
[8, -1, 1, 0],
423+
[0, -1, 0, 15],
424+
[16, -1, 1, 19]
425+
])
426+
427+
n_samples, n_features = X1.shape
428+
imputer = Imputer(missing_values=-1, strategy='mean', axis=0)
429+
imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
430+
imputer.fit(X1)
431+
imputer_with_in.fit(X1)
432+
Xt = imputer.transform(X2)
433+
Xt_with_in = imputer_with_in.transform(X2)
434+
imputed_features_mask = X1 == -1
435+
n_features_new = Xt.shape[1]
436+
# assert_array_equal(imputer.imputed_features_, expected_imputed_features)
437+
# assert_array_equal(imputer_with_in.imputed_features_,
438+
# expected_imputed_features)
439+
assert_equal(Xt_with_in.shape, (n_samples, n_features + n_features_new))
440+
assert_array_equal(Xt_with_in, np.hstack((Xt, imputed_features_mask)))
441+
assert_array_equal(Xt_with_in,
442+
imputer_with_in.transform(sparse.csc_matrix(X2)).A)
443+
assert_array_equal(Xt_with_in,
444+
imputer_with_in.transform(sparse.csr_matrix(X2)).A)
445+
446+
imputer = Imputer(missing_values=-1, strategy='mean', axis=1)
447+
imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
448+
imputer.fit(X1)
449+
imputer_with_in.fit(X1)
450+
Xt = imputer.transform(X2)
451+
Xt_with_in = imputer_with_in.transform(X2)
452+
imputed_features_mask = X2 == -1
453+
n_features_new = Xt.shape[1]
454+
assert_equal(Xt_with_in.shape, (n_samples, n_features + n_features_new))
455+
assert_array_equal(Xt_with_in, np.hstack((Xt, imputed_features_mask)))
456+
assert_array_equal(Xt_with_in,
457+
imputer_with_in.transform(sparse.csc_matrix(X2)).A)
458+
assert_array_equal(Xt_with_in,
459+
imputer_with_in.transform(sparse.csr_matrix(X2)).A)

0 commit comments

Comments (0)
0