Add comprehensive tests · scikit-learn/scikit-learn@5357a1b · GitHub
[go: up one dir, main page]

Skip to content

Commit 5357a1b

Browse files
committed
Add comprehensive tests
1 parent cf7ad5d commit 5357a1b

File tree

3 files changed

+62
-49
lines changed

3 files changed

+62
-49
lines changed

doc/modules/preprocessing.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,7 @@ estimator that supports imputation. See :ref:`sphx_glr_auto_examples_missing_val
485485
Transformer indicating missing values
486486
=====================================
487487

488-
MissingIndicator transformer is useful to transform a dataset into corresponding
488+
:class:`MissingIndicator` transformer is useful to transform a dataset into a corresponding
489489
binary matrix indicating the presence of missing values in the dataset.
490490
The knowledge of which features were imputed can be exploited by a downstream
491491
estimator by adding features that indicate which elements have been imputed.
@@ -508,9 +508,9 @@ estimator by adding features that indicate which elements have been imputed.
508508
MissingIndicator(features='train', missing_values=-1, sparse='auto')
509509
>>> X2_tr = MI.transform(X2)
510510
>>> X2_tr
511-
array([[False, False, True],
512-
[ True, True, False],
513-
[False, False, False]], dtype=bool)
511+
array([[0, 0, 1],
512+
[1, 1, 0],
513+
[0, 0, 0]], dtype=int32)
514514

515515

516516
.. _polynomial_features:

sklearn/preprocessing/imputation.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -425,9 +425,9 @@ class MissingIndicator(BaseEstimator, TransformerMixin):
425425
MissingIndicator(features='train', missing_values=-1, sparse='auto')
426426
>>> X2_tr = MI.transform(X2)
427427
>>> X2_tr
428-
array([[False, True],
429-
[ True, False],
430-
[False, False]], dtype=bool)
428+
array([[0, 1],
429+
[1, 0],
430+
[0, 0]], dtype=int32)
431431
432432
"""
433433

@@ -438,11 +438,13 @@ def __init__(self, missing_values="NaN", features="train", sparse="auto"):
438438

439439
def fit(self, X):
440440
"""Fit the transformer on X.
441+
441442
Parameters
442443
----------
443444
X : {array-like, sparse matrix}, shape (n_samples, n_features)
444445
Input data, where ``n_samples`` is the number of samples and
445446
``n_features`` is the number of features.
447+
446448
Returns
447449
-------
448450
self : object
@@ -470,32 +472,33 @@ def fit(self, X):
470472
return self
471473

472474
def transform(self, X):
473-
"""Impute all missing values in X.
475+
"""Generate missing values indicator for X.
476+
474477
Parameters
475478
----------
476479
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
477480
The input data to complete.
481+
478482
Returns
479483
-------
480-
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
481-
The transformerwith missing indicator.
484+
Xt : {array-like, sparse matrix}, shape = [n_samples, n_features]
485+
The missing indicator for input data.
482486
483487
"""
484488
if self.features == "train":
485489
check_is_fitted(self, "feat_with_missing_")
486490

487491
X = check_array(X, accept_sparse=('csc', 'csr'), dtype=np.float64,
488492
force_all_finite=False)
489-
490493
imputer_mask, feat_with_missing = self._get_missing_features_info(X)
491494

492495
if self.features == "train":
493496
features = np.setdiff1d(feat_with_missing,
494497
self.feat_with_missing_)
495498
if features.size:
496-
warnings.warn("The features %s have missing "
497-
"values in transform but have no missing values"
498-
" in fit " % features, RuntimeWarning,
499+
warnings.warn("The features %s have missing values "
500+
"in transform but have no missing values "
501+
"in fit " % features, RuntimeWarning,
499502
stacklevel=1)
500503
imputer_mask = imputer_mask[:, self.feat_with_missing_]
501504

@@ -522,6 +525,7 @@ def _get_missing_features_info(self, X):
522525
if sparse.issparse(X):
523526
X = X.toarray()
524527
imputer_mask = _get_mask(X, self.missing_values)
528+
imputer_mask = imputer_mask.astype(np.int32, copy=False)
525529
feat_with_missing = np.where(np.any(imputer_mask, axis=0))[0]
526530

527531
if self.sparse is True:
@@ -531,5 +535,8 @@ def _get_missing_features_info(self, X):
531535
imputer_mask = sparse.csc_matrix(imputer_mask)
532536
elif self.sparse is False and sparse.issparse(imputer_mask):
533537
imputer_mask = imputer_mask.toarray()
538+
elif self.sparse == 'auto' and self.missing_values != 0:
539+
if sparse.issparse(imputer_mask):
540+
imputer_mask = imputer_mask.tocsc()
534541

535542
return imputer_mask, feat_with_missing

sklearn/preprocessing/tests/test_imputation.py

Lines changed: 41 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -380,55 +380,61 @@ def test_missing_indicator():
380380
[11, -1, 1, 1]
381381
])
382382

383-
def assert_type(actual, expect, sp, missing_values):
384-
if sp is True and missing_values != 0:
383+
def assert_type(actual, is_sparse, sp, missing_values):
384+
if sp is True :
385385
assert_equal(actual, sparse.csc_matrix)
386-
elif (sp is True and missing_values == 0) or \
387-
sp is False:
386+
elif (sp is "auto" and missing_values == 0 ) \
387+
or sp is False:
388388
assert_equal(actual, np.ndarray)
389389
else:
390-
print type(retype(X2)), sp, missing_values, type(X2_tr)
391-
assert_equal(actual, expect)
390+
if is_sparse:
391+
assert_equal(actual, sparse.csc_matrix)
392+
else:
393+
assert_equal(actual, np.ndarray)
392394

393395
def assert_mask(actual, expected, features):
394396
if hasattr(actual, 'toarray'):
395397
assert_array_equal(actual.toarray(), expected[:, features])
396398
else:
397399
assert_array_equal(actual, expected[:, features])
398400

399-
for X1, X2, missing_values in [(X1_orig, X2_orig, -1),
400-
(X1_orig + 1, X2_orig + 1, 0)]:
401+
def _check_missing_indicator(X1, X2, retype, sp, missing_values):
401402
mask = X2 == missing_values
402403
expect_feat_missing = np.where(np.any(X1 == missing_values, axis=0))[0]
403-
for retype in [np.array, sparse.csr_matrix,
404+
405+
X1_in = retype(X1)
406+
X2_in = retype(X2)
407+
# features = "train":
408+
MI = MissingIndicator(missing_values=missing_values,
409+
sparse = sp)
410+
411+
MI.fit(X1_in)
412+
X2_tr = MI.transform(X2_in)
413+
features = MI.feat_with_missing_
414+
assert_array_equal(expect_feat_missing, features)
415+
assert_type(type(X2_tr),sparse.issparse(X2_in), sp, missing_values)
416+
assert_mask(X2_tr, mask, features)
417+
418+
# features = "all"
419+
MI = clone(MI).set_params(features="all")
420+
MI.fit(X1_in)
421+
X2_tr = MI.transform(X2_in)
422+
features = np.arange(X2.shape[1])
423+
assert_mask(X2_tr, mask, features)
424+
425+
# features = [1, 2]
426+
features = [1, 2]
427+
MI = clone(MI).set_params(features=features)
428+
MI.fit(X1_in)
429+
X2_tr = MI.transform(X2_in)
430+
assert_mask(X2_tr, mask, features)
431+
432+
for X1, X2, missing_values in [(X1_orig, X2_orig, -1),
433+
(X1_orig + 1, X2_orig + 1, 0)]:
434+
for retype in [lambda x: x.tolist(), np.array, sparse.csr_matrix,
404435
sparse.csc_matrix, sparse.lil_matrix]:
405436
for sp in [True, False, 'auto']:
406-
X1_ft = retype(X1)
407-
X2_t = retype(X2)
408-
# features = "train":
409-
MI = MissingIndicator(missing_values=missing_values,
410-
sparse = sp)
411-
412-
MI.fit(X1_ft)
413-
X2_tr = MI.transform(X2_t)
414-
features = MI.feat_with_missing_
415-
assert_array_equal(expect_feat_missing, features)
416-
assert_type(type(X2_tr), type(X2_t), sp, missing_values)
417-
assert_mask(X2_tr, mask, features)
418-
419-
# features = "all"
420-
MI = clone(MI).set_params(features="all")
421-
MI.fit(X1_ft)
422-
X2_tr = MI.transform(retype(X2))
423-
features = np.arange(X2.shape[1])
424-
assert_mask(X2_tr, mask, features)
425-
426-
# features = [1, 2]
427-
features = [1, 2]
428-
MI = clone(MI).set_params(features=features)
429-
MI.fit(X1_ft)
430-
X2_tr = MI.transform(X2_t)
431-
assert_mask(X2_tr, mask, features)
437+
_check_missing_indicator(X1, X2, retype, sp, missing_values)
432438

433439

434440
def test_missing_indicator_warning():

0 commit comments

Comments
 (0)
0