@@ -27,7 +27,7 @@ class _BaseEncoder(TransformerMixin, BaseEstimator):
    """

-    def _check_X(self, X, force_all_finite=False):
+    def _check_X(self, X):
        """
        Perform custom check_array:
        - convert list of strings to object dtype
@@ -42,7 +42,7 @@ def _check_X(self, X, force_all_finite=False):
        if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
            # if not a dataframe, do normal check_array validation
            X_temp = check_array(
-                X, dtype=None, force_all_finite=force_all_finite)
+                X, dtype=None, force_all_finite=self.force_all_finite)
            if (not hasattr(X, 'dtype')
                    and np.issubdtype(X_temp.dtype, np.str_)):
                X = check_array(X, dtype=np.object)
@@ -58,7 +58,7 @@ def _check_X(self, X, force_all_finite=False):
        for i in range(n_features):
            Xi = self._get_feature(X, feature_idx=i)
            Xi = check_array(Xi, ensure_2d=False, dtype=None,
-                             force_all_finite=force_all_finite)
+                             force_all_finite=self.force_all_finite)
            X_columns.append(Xi)

        return X_columns, n_samples, n_features
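For context on what that validation flag changes (not part of the patch): with scikit-learn's public check_array, force_all_finite=True rejects NaNs in object-dtype input, while 'allow-nan' lets them through so the encoder can deal with them later. A minimal sketch, assuming a scikit-learn version from this era where check_array still takes the force_all_finite keyword:

import numpy as np
from sklearn.utils import check_array

X = np.array([['a', np.nan], ['b', 'c']], dtype=object)

# NaNs pass validation and are kept for later handling
check_array(X, dtype=None, force_all_finite='allow-nan')

# the strict default rejects the same input
try:
    check_array(X, dtype=None, force_all_finite=True)
except ValueError as exc:
    print(exc)  # e.g. "Input contains NaN"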
@@ -71,8 +71,7 @@ def _get_feature(self, X, feature_idx):
        return X[:, feature_idx]

    def _fit(self, X, handle_unknown='error'):
-        # ignore NaNs during fit
-        X_list, n_samples, n_features = self._check_X(X, force_all_finite=False)
+        X_list, n_samples, n_features = self._check_X(X)

        if self.categories != 'auto':
            if len(self.categories) != n_features:
@@ -84,6 +83,7 @@ def _fit(self, X, handle_unknown='error'):
        for i in range(n_features):
            Xi = X_list[i]
            if self.categories == 'auto':
+                # NaNs don't count as categories during fit
                cats = _encode(Xi[~_object_dtype_isnan(Xi)])
            else:
                cats = np.array(self.categories[i], dtype=Xi.dtype)
@@ -92,6 +92,7 @@ def _fit(self, X, handle_unknown='error'):
                        raise ValueError("Unsorted categories are not "
                                         "supported for numerical categories")
            if handle_unknown == 'error':
+                # NaNs don't count as categories during fit
                diff = _encode_check_unknown(Xi[~_object_dtype_isnan(Xi)], cats)
                if diff:
                    msg = ("Found unknown categories {0} in column {1}"
@@ -100,12 +101,8 @@ def _fit(self, X, handle_unknown='error'):
            self.categories_.append(cats)

    def _transform(self, X, handle_unknown='error', handle_missing=None):
-        if handle_missing is None:
-            force_all_finite = True
-        else:
-            force_all_finite = False
-
-        X_list, n_samples, n_features = self._check_X(X, force_all_finite)
+        X_list, n_samples, n_features = self._check_X(
+            X)

        # from now on, either X is without NaNs, or X has NaNs but
        # handle_missing is not None. In the latter case, since we'll handle
        # NaNs separately, NaNs don't count as unknown categories.
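As a reading aid (not part of the patch), the fit-time masking above boils down to this: NaN entries of an object column are filtered out before the categories are computed, so NaN can never be learned as a category or reported as unknown. A self-contained sketch, with a stand-in for the private _object_dtype_isnan helper:

import numpy as np

def object_isnan(values):
    # stand-in for sklearn's private _object_dtype_isnan helper:
    # NaN is the only value that is not equal to itself
    return values != values

Xi = np.array(['a', np.nan, 'b', 'a'], dtype=object)
cats = np.unique(Xi[~object_isnan(Xi)])
print(cats)  # ['a' 'b'] -- NaN is excluded from the learned categories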
@@ -125,7 +122,7 @@ def _transform(self, X, handle_unknown='error', handle_missing=None):
            diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
                                                     return_mask=True)
            # NaNs don't count as unknown categories
-            na_valid_mask = valid_mask | pd.isna(Xi)
+            na_valid_mask = valid_mask | _object_dtype_isnan(Xi)

            if not np.all(valid_mask):
                if not np.all(na_valid_mask) and handle_unknown == 'error':
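Replacing pd.isna with _object_dtype_isnan keeps the transform path free of a pandas dependency; the masking idea itself, sketched with plain NumPy (illustrative names, not the patch's code):

import numpy as np

Xi = np.array(['a', 'z', np.nan], dtype=object)
known = {'a', 'b'}

valid_mask = np.array([x in known for x in Xi])   # [True, False, False]
nan_mask = Xi != Xi                               # NaN check for object dtype
na_valid_mask = valid_mask | nan_mask             # [True, False, True]
# only 'z' stays invalid, so only genuine unknown categories can trigger the
# handle_unknown='error' path; NaNs are left to the missing-value handling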
@@ -318,8 +315,9 @@ def __init__(self, categories='auto', drop=None, sparse=True,
        self.sparse = sparse
        self.dtype = dtype
        self.handle_unknown = handle_unknown
-        self.handle_missing = handle_missing
        self.drop = drop
+        self.handle_missing = handle_missing
+        self.force_all_finite = True if handle_missing is None else 'allow-nan'

    def _validate_keywords(self):
        if self.handle_unknown not in ('error', 'ignore'):
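With this wiring, the NaN-validation mode is decided once in __init__ and read back by _check_X through self.force_all_finite, rather than being threaded through every call. A minimal sketch of the same pattern on a toy class (not the real estimator):

class Toy:
    def __init__(self, handle_missing=None):
        self.handle_missing = handle_missing
        # strict validation unless a missing-value strategy was requested
        self.force_all_finite = True if handle_missing is None else 'allow-nan'

print(Toy().force_all_finite)             # True -> _check_X rejects NaNs
print(Toy('all-zero').force_all_finite)   # 'allow-nan' -> NaNs pass validation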
@@ -488,43 +486,6 @@ def transform(self, X):
        else:
            return out

-    # def transform(self, X):
-    #     """Transform X using one-hot encoding.
-
-    #     Parameters
-    #     ----------
-    #     X : array-like, shape [n_samples, n_features]
-    #         The data to encode.
-
-    #     Returns
-    #     -------
-    #     X_out : sparse matrix if sparse=True else a 2-d array
-    #         Transformed input.
-    #     """
-
-    #     if not self.handle_missing or self.handle_missing not in ["all-missing",
-    #                                                               "all-zero", "category"]:
-    #         raise ValueError("Wrong 'handle_missing' value specified. "
-    #                          "'handle_missing' should be one of either "
-    #                          "['all-missing', 'all-zero', 'category']. "
-    #                          "Getting {0}".format(self.handle_missing))
-    #     missing_indices = np.argwhere(np.isnan(X)) if self.missing_values == "NaN" else \
-    #         np.argwhere(X == self.missing_values)
-    #     if self.handle_missing == "all-missing":
-    #         for i in missing_indices:
-    #             X[i] = np.nan
-    #     if self.handle_missing == "all-zero":
-    #         for i in missing_indices:
-    #             X[i] = 0
-    #     else:
-    #         # Replace with a seperate one-hot column
-    #         pass
-
-    #     if self._legacy_mode:
-    #         return _transform_selected(X, self._legacy_transform,
-    #                                    self.dtype,
-    #                                    self._categorical_features, copy=True)
-    #     return self._transform_new(X)

    def inverse_transform(self, X):
        """
@@ -588,7 +549,7 @@ def inverse_transform(self, X):
            # for sparse X argmax returns 2D matrix, ensure 1D array
            labels = np.asarray(sub.argmax(axis=1)).flatten()
            X_tr[:, i] = cats[labels]
-            if self.handle_unknown == 'ignore':
+            if self.handle_unknown == 'ignore' or self.handle_missing == 'all-zero':
                unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
                # ignored unknown categories: we have a row of all zero
                if unknown.any():
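Why inverse_transform also checks handle_missing == 'all-zero': under that option a missing value is encoded as an all-zero row, which is indistinguishable from an ignored unknown category, so both cases map back to None. A small illustration of the row test used above:

import numpy as np

sub = np.array([[1, 0, 0],
                [0, 0, 0]])   # second row: no category fired (missing or unknown)
unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
print(unknown)                # [False  True] -> that row maps back to None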
@@ -713,9 +674,11 @@ class OrdinalEncoder(_BaseEncoder):
           ['Female', 2]], dtype=object)
    """

-    def __init__(self, categories='auto', dtype=np.float64):
+    def __init__(self, categories='auto', dtype=np.float64, handle_missing=None):
        self.categories = categories
        self.dtype = dtype
+        self.handle_missing = handle_missing
+        self.force_all_finite = True if handle_missing is None else 'allow-nan'

    def fit(self, X, y=None):
        """