2
2
# Joris Van den Bossche <jorisvandenbossche@gmail.com>
3
3
# License: BSD 3 clause
4
4
5
- import warnings
6
-
7
5
import numpy as np
8
6
from scipy import sparse
9
7
10
8
from ..base import BaseEstimator , TransformerMixin
11
9
from ..utils import check_array
12
- from ..utils .validation import check_is_fitted
13
10
from ..utils .fixes import _object_dtype_isnan
11
+ from ..utils .validation import check_is_fitted
14
12
from ._label import _encode , _encode_check_unknown
15
13
16
-
17
14
__all__ = [
18
15
'OneHotEncoder' ,
19
16
'OrdinalEncoder'
@@ -93,19 +90,20 @@ def _fit(self, X, handle_unknown='error'):
93
90
"supported for numerical categories" )
94
91
if handle_unknown == 'error' :
95
92
# NaNs don't count as categories during fit
96
- diff = _encode_check_unknown (Xi [~ _object_dtype_isnan (Xi )], cats )
93
+ diff = _encode_check_unknown (
94
+ Xi [~ _object_dtype_isnan (Xi )], cats )
97
95
if diff :
98
96
msg = ("Found unknown categories {0} in column {1}"
99
97
" during fit" .format (diff , i ))
100
98
raise ValueError (msg )
101
99
self .categories_ .append (cats )
102
100
103
101
def _transform (self , X , handle_unknown = 'error' , handle_missing = None ):
104
- X_list , n_samples , n_features = self ._check_X (
105
- X )
106
- # from now on, either X is w.o. NaNs or w. NaNs yet handle_missing != None.
107
- # in the later case, since we'll handle NaNs separately,
108
- # NaNs don 't count as unknown categories
102
+ X_list , n_samples , n_features = self ._check_X (X )
103
+ # from now on, either X is w.o. NaNs
104
+ # or w. NaNs yet handle_missing != None.
105
+ # since we'll handle NaNs separately so that it does not interfere
106
+ # with handle_unknown, we won't count NaNs as unknown categories
109
107
X_int = np .zeros ((n_samples , n_features ), dtype = np .int )
110
108
X_mask = np .ones ((n_samples , n_features ), dtype = np .bool )
111
109
@@ -137,7 +135,7 @@ def _transform(self, X, handle_unknown='error', handle_missing=None):
137
135
Xi = Xi .astype (self .categories_ [i ].dtype )
138
136
else :
139
137
Xi = Xi .copy ()
140
-
138
+
141
139
if handle_missing == 'indicator' :
142
140
valid_mask = na_valid_mask
143
141
Xi [~ valid_mask ] = self .categories_ [i ][0 ]
@@ -151,6 +149,11 @@ def _transform(self, X, handle_unknown='error', handle_missing=None):
151
149
check_unknown = False )
152
150
X_int [:, i ] = encoded
153
151
152
+ if (self .handle_missing == 'indicator' and
153
+ _object_dtype_isnan (Xi ).sum () > 0 ):
154
+ self .categories_ [i ] = np .append (
155
+ np .array (self .categories_ [i ], dtype = object ), None )
156
+
154
157
return X_int , X_mask
155
158
156
159
def _more_tags (self ):
@@ -230,7 +233,8 @@ class OneHotEncoder(_BaseEncoder):
230
233
will be denoted as None.
231
234
232
235
handle_missing : {'indicator', 'all-zero'}, default=None
233
- Specify how to handle missing categorical features (NaN) in the training data
236
+ Specify how to handle missing categorical features (NaN) in the
237
+ training data
234
238
235
239
- None : Raise an error in the presence of NaN (the default).
236
240
- 'indicator': Represent with a separate one-hot column.
@@ -310,7 +314,8 @@ class OneHotEncoder(_BaseEncoder):
310
314
"""
311
315
312
316
def __init__ (self , categories = 'auto' , drop = None , sparse = True ,
313
- dtype = np .float64 , handle_unknown = 'error' , handle_missing = None ):
317
+ dtype = np .float64 , handle_unknown = 'error' ,
318
+ handle_missing = None ):
314
319
self .categories = categories
315
320
self .sparse = sparse
316
321
self .dtype = dtype
@@ -340,8 +345,9 @@ def _compute_drop_idx(self):
340
345
if self .drop == 'first' :
341
346
return np .zeros (len (self .categories_ ), dtype = np .object )
342
347
elif self .drop == 'if_binary' :
343
- return np .array ([0 if len (cats ) == 2 else None
344
- for cats in self .categories_ ], dtype = np .object )
348
+ return np .array (
349
+ [0 if len (cats ) == 2 else None
350
+ for cats in self .categories_ ], dtype = np .object )
345
351
else :
346
352
msg = (
347
353
"Wrong input for parameter `drop`. Expected "
@@ -441,7 +447,8 @@ def transform(self, X):
441
447
check_is_fitted (self )
442
448
# validation of X happens in _check_X called by _transform
443
449
X_int , X_mask = self ._transform (
444
- X , handle_unknown = self .handle_unknown , handle_missing = self .handle_missing )
450
+ X , handle_unknown = self .handle_unknown ,
451
+ handle_missing = self .handle_missing )
445
452
446
453
n_samples , n_features = X_int .shape
447
454
@@ -486,7 +493,6 @@ def transform(self, X):
486
493
else :
487
494
return out
488
495
489
-
490
496
def inverse_transform (self , X ):
491
497
"""
492
498
Convert the data back to the original representation.
@@ -549,7 +555,8 @@ def inverse_transform(self, X):
549
555
# for sparse X argmax returns 2D matrix, ensure 1D array
550
556
labels = np .asarray (sub .argmax (axis = 1 )).flatten ()
551
557
X_tr [:, i ] = cats [labels ]
552
- if self .handle_unknown == 'ignore' or self .handle_missing == 'all-zero' :
558
+ if (self .handle_unknown == 'ignore' or
559
+ self .handle_missing == 'all-zero' ):
553
560
unknown = np .asarray (sub .sum (axis = 1 ) == 0 ).flatten ()
554
561
# ignored unknown categories: we have a row of all zero
555
562
if unknown .any ():
@@ -674,7 +681,8 @@ class OrdinalEncoder(_BaseEncoder):
674
681
['Female', 2]], dtype=object)
675
682
"""
676
683
677
- def __init__ (self , categories = 'auto' , dtype = np .float64 , handle_missing = None ):
684
+ def __init__ (self , categories = 'auto' , dtype = np .float64 ,
685
+ handle_missing = None ):
678
686
self .categories = categories
679
687
self .dtype = dtype
680
688
self .handle_missing = handle_missing
0 commit comments