inverse_tranform for 'all-zero'; force_all_finite as class variable · scikit-learn/scikit-learn@ac0d240
Author: Katrina Ni (committed)
Commit message: inverse_tranform for 'all-zero'; force_all_finite as class variable
1 parent: 8d847cf · commit: ac0d240

File tree: 2 files changed, +19 -52 lines

sklearn/preprocessing/_encoders.py

Lines changed: 15 additions & 52 deletions
@@ -27,7 +27,7 @@ class _BaseEncoder(TransformerMixin, BaseEstimator):
 
     """
 
-    def _check_X(self, X, force_all_finite=False):
+    def _check_X(self, X):
         """
         Perform custom check_array:
         - convert list of strings to object dtype
@@ -42,7 +42,7 @@ def _check_X(self, X, force_all_finite=False):
         if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
             # if not a dataframe, do normal check_array validation
             X_temp = check_array(
-                X, dtype=None, force_all_finite=force_all_finite)
+                X, dtype=None, force_all_finite=self.force_all_finite)
             if (not hasattr(X, 'dtype')
                     and np.issubdtype(X_temp.dtype, np.str_)):
                 X = check_array(X, dtype=np.object)
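For context on what the forwarded flag does: scikit-learn's check_array rejects NaN by default (force_all_finite=True) and lets NaN through when given 'allow-nan'. A minimal standalone sketch of that behavior, assuming only a stock scikit-learn install (the array below is made up for the example):

import numpy as np
from sklearn.utils import check_array

X = np.array([['a', 'b'], [np.nan, 'b']], dtype=object)

# 'allow-nan' validates shape and dtype but tolerates missing values.
check_array(X, dtype=None, force_all_finite='allow-nan')

# The default (True) raises ValueError because of the NaN entry.
try:
    check_array(X, dtype=None, force_all_finite=True)
except ValueError as exc:
    print(exc)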
@@ -58,7 +58,7 @@ def _check_X(self, X, force_all_finite=False):
         for i in range(n_features):
             Xi = self._get_feature(X, feature_idx=i)
             Xi = check_array(Xi, ensure_2d=False, dtype=None,
-                             force_all_finite=force_all_finite)
+                             force_all_finite=self.force_all_finite)
             X_columns.append(Xi)
 
         return X_columns, n_samples, n_features
@@ -71,8 +71,7 @@ def _get_feature(self, X, feature_idx):
         return X[:, feature_idx]
 
     def _fit(self, X, handle_unknown='error'):
-        # ignore NaNs during fit
-        X_list, n_samples, n_features = self._check_X(X, force_all_finite=False)
+        X_list, n_samples, n_features = self._check_X(X)
 
         if self.categories != 'auto':
             if len(self.categories) != n_features:
@@ -84,6 +83,7 @@ def _fit(self, X, handle_unknown='error'):
         for i in range(n_features):
             Xi = X_list[i]
             if self.categories == 'auto':
+                # NaNs don't count as categories during fit
                 cats = _encode(Xi[~_object_dtype_isnan(Xi)])
             else:
                 cats = np.array(self.categories[i], dtype=Xi.dtype)
@@ -92,6 +92,7 @@ def _fit(self, X, handle_unknown='error'):
                         raise ValueError("Unsorted categories are not "
                                          "supported for numerical categories")
             if handle_unknown == 'error':
+                # NaNs don't count as categories during fit
                 diff = _encode_check_unknown(Xi[~_object_dtype_isnan(Xi)], cats)
                 if diff:
                     msg = ("Found unknown categories {0} in column {1}"
@@ -100,12 +101,8 @@ def _fit(self, X, handle_unknown='error'):
             self.categories_.append(cats)
 
     def _transform(self, X, handle_unknown='error', handle_missing=None):
-        if handle_missing is None:
-            force_all_finite = True
-        else:
-            force_all_finite = False
-
-        X_list, n_samples, n_features = self._check_X(X, force_all_finite)
+        X_list, n_samples, n_features = self._check_X(
+            X)
         # from now on, either X is without NaNs, or X has NaNs and handle_missing is not None.
         # in the latter case, since we'll handle NaNs separately,
         # NaNs don't count as unknown categories
@@ -125,7 +122,7 @@ def _transform(self, X, handle_unknown='error', handle_missing=None):
             diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
                                                      return_mask=True)
             # NaNs don't count as unknown categories
-            na_valid_mask = valid_mask | pd.isna(Xi)
+            na_valid_mask = valid_mask | _object_dtype_isnan(Xi)
 
             if not np.all(valid_mask):
                 if not np.all(na_valid_mask) and handle_unknown == 'error':
@@ -318,8 +315,9 @@ def __init__(self, categories='auto', drop=None, sparse=True,
         self.sparse = sparse
         self.dtype = dtype
         self.handle_unknown = handle_unknown
-        self.handle_missing = handle_missing
         self.drop = drop
+        self.handle_missing = handle_missing
+        self.force_all_finite = True if handle_missing is None else 'allow-nan'
 
     def _validate_keywords(self):
         if self.handle_unknown not in ('error', 'ignore'):
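Reduced to its skeleton, the pattern introduced here is: the constructor derives a validation flag from handle_missing once, and _check_X reads it from the instance instead of receiving it as an argument. A minimal sketch of that wiring (the class name and data below are illustrative, not the encoder itself):

import numpy as np
from sklearn.utils import check_array


class _MissingAwareValidator:
    """Toy stand-in showing how handle_missing drives input validation."""

    def __init__(self, handle_missing=None):
        self.handle_missing = handle_missing
        # None  -> reject NaN (force_all_finite=True)
        # other -> let NaN through so it can be handled later ('allow-nan')
        self.force_all_finite = True if handle_missing is None else 'allow-nan'

    def _check_X(self, X):
        return check_array(X, dtype=None,
                           force_all_finite=self.force_all_finite)


X = np.array([['a'], [np.nan]], dtype=object)
print(_MissingAwareValidator(handle_missing='all-zero')._check_X(X))  # passes
# _MissingAwareValidator()._check_X(X) would raise ValueError on the NaN.

One side effect of deriving the flag in __init__ rather than in fit is that changing handle_missing later via set_params would not refresh force_all_finite.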
@@ -488,43 +486,6 @@ def transform(self, X):
         else:
             return out
 
-    # def transform(self, X):
-    #     """Transform X using one-hot encoding.
-
-    #     Parameters
-    #     ----------
-    #     X : array-like, shape [n_samples, n_features]
-    #         The data to encode.
-
-    #     Returns
-    #     -------
-    #     X_out : sparse matrix if sparse=True else a 2-d array
-    #         Transformed input.
-    #     """
-
-    #     if not self.handle_missing or self.handle_missing not in ["all-missing",
-    #                                                               "all-zero", "category"]:
-    #         raise ValueError("Wrong 'handle_missing' value specified. "
-    #                          "'handle_missing' should be one of either "
-    #                          "['all-missing', 'all-zero', 'category']. "
-    #                          "Getting {0}".format(self.handle_missing))
-    #     missing_indices = np.argwhere(np.isnan(X)) if self.missing_values == "NaN" else \
-    #         np.argwhere(X == self.missing_values)
-    #     if self.handle_missing == "all-missing":
-    #         for i in missing_indices:
-    #             X[i] = np.nan
-    #     if self.handle_missing == "all-zero":
-    #         for i in missing_indices:
-    #             X[i] = 0
-    #     else:
-    #         # Replace with a seperate one-hot column
-    #         pass
-
-    #     if self._legacy_mode:
-    #         return _transform_selected(X, self._legacy_transform,
-    #                                    self.dtype,
-    #                                    self._categorical_features, copy=True)
-    #     return self._transform_new(X)
 
     def inverse_transform(self, X):
         """
@@ -588,7 +549,7 @@ def inverse_transform(self, X):
             # for sparse X argmax returns 2D matrix, ensure 1D array
             labels = np.asarray(sub.argmax(axis=1)).flatten()
             X_tr[:, i] = cats[labels]
-            if self.handle_unknown == 'ignore':
+            if self.handle_unknown == 'ignore' or self.handle_missing == 'all-zero':
                 unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
                 # ignored unknown categories: we have a row of all zero
                 if unknown.any():
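The widened condition means an all-zero block is now mapped back to a missing value not only for ignored unknown categories but also under the 'all-zero' missing-value strategy. The core of that recovery step, replayed in simplified form with plain NumPy on a made-up one-hot block:

import numpy as np

cats = np.array(['blue', 'red'], dtype=object)
# One-hot block for a single feature; the last row is all zeros (missing/unknown).
sub = np.array([[1, 0],
                [0, 1],
                [0, 0]])

labels = np.asarray(sub.argmax(axis=1)).flatten()
column = cats[labels].astype(object)            # argmax alone maps the zero row to 'blue'
unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
column[unknown] = None                          # the zero row is recovered as None instead
print(column)                                   # ['blue' 'red' None]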
@@ -713,9 +674,11 @@ class OrdinalEncoder(_BaseEncoder):
            ['Female', 2]], dtype=object)
     """
 
-    def __init__(self, categories='auto', dtype=np.float64):
+    def __init__(self, categories='auto', dtype=np.float64, handle_missing=None):
         self.categories = categories
         self.dtype = dtype
+        self.handle_missing = handle_missing
+        self.force_all_finite = True if handle_missing is None else 'allow-nan'
 
     def fit(self, X, y=None):
         """

sklearn/preprocessing/tests/test_encoders.py

Lines changed: 4 additions & 0 deletions
@@ -507,6 +507,10 @@ def test_one_hot_encoder_handle_missing(X, as_data_frame, handle_unknown):
                          [0, 1]], dtype='int64')
     assert_array_equal(enc_zero.fit_transform(X), exp_zero.astype('float64'))
 
+    X_inv = np.array(X, dtype=object)
+    X_inv[2, 0] = None
+    assert_array_equal(enc_zero.inverse_transform(exp_zero), X_inv)
+
 
 
 @pytest.mark.parametrize("X", [
     [['abc', 2, 55], ['def', 1, 55]],
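Read end to end, the new assertion expects a full round trip: a missing entry is encoded as an all-zero block and decoded back as None. A rough usage sketch of that behavior as this branch defines it; the constructor arguments and the data are guesses modeled on the test (handle_missing is not part of any released scikit-learn, so this only runs against the branch itself):

import numpy as np
from sklearn.preprocessing import OneHotEncoder  # the branch's encoder, not a release

# Hypothetical input: the third row is missing its first feature.
X = np.array([['a', 'x'], ['b', 'y'], [np.nan, 'x']], dtype=object)

enc_zero = OneHotEncoder(handle_missing='all-zero', sparse=False)  # branch-only keyword
encoded = enc_zero.fit_transform(X)      # NaN row -> all-zero block for the first feature
decoded = enc_zero.inverse_transform(encoded)
# Expected on this branch: the missing cell comes back as None.
print(decoded[2, 0])  # None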

0 commit comments
