2
2
# Joris Van den Bossche <jorisvandenbossche@gmail.com>
3
3
# License: BSD 3 clause
4
4
5
- import warnings
6
-
7
5
import numpy as np
8
6
from scipy import sparse
9
7
10
8
from ..base import BaseEstimator , TransformerMixin
11
9
from ..utils import check_array
12
- from ..utils .validation import check_is_fitted
13
10
from ..utils .fixes import _object_dtype_isnan
11
+ from ..utils .validation import check_is_fitted
14
12
from ._label import _encode , _encode_check_unknown
15
13
16
-
17
14
__all__ = [
18
15
'OneHotEncoder' ,
19
16
'OrdinalEncoder'
@@ -93,19 +90,20 @@ def _fit(self, X, handle_unknown='error'):
93
90
"supported for numerical categories" )
94
91
if handle_unknown == 'error' :
95
92
# NaNs don't count as categories during fit
96
- diff = _encode_check_unknown (Xi [~ _object_dtype_isnan (Xi )], cats )
93
+ diff = _encode_check_unknown (
94
+ Xi [~ _object_dtype_isnan (Xi )], cats )
97
95
if diff :
98
96
msg = ("Found unknown categories {0} in column {1}"
99
97
" during fit" .format (diff , i ))
100
98
raise ValueError (msg )
101
99
self .categories_ .append (cats )
102
100
103
101
def _transform (self , X , handle_unknown = 'error' , handle_missing = None ):
104
- X_list , n_samples , n_features = self ._check_X (
105
- X )
106
- # from now on, either X is w.o. NaNs or w. NaNs yet handle_missing != None.
107
- # in the later case, since we'll handle NaNs separately,
108
- # NaNs don 't count as unknown categories
102
+ X_list , n_samples , n_features = self ._check_X (X )
103
+ # from now on, either X is w.o. NaNs
104
+ # or w. NaNs yet handle_missing != None.
105
+ # since we'll handle NaNs separately so that it does not interfere
106
+ # with handle_unknown, we won't count NaNs as unknown categories
109
107
X_int = np .zeros ((n_samples , n_features ), dtype = np .int )
110
108
X_mask = np .ones ((n_samples , n_features ), dtype = np .bool )
111
109
@@ -137,7 +135,7 @@ def _transform(self, X, handle_unknown='error', handle_missing=None):
137
135
Xi = Xi .astype (self .categories_ [i ].dtype )
138
136
else :
139
137
Xi = Xi .copy ()
140
-
138
+
141
139
if handle_missing == 'indicator' :
142
140
valid_mask = na_valid_mask
143
141
Xi [~ valid_mask ] = self .categories_ [i ][0 ]
@@ -151,6 +149,11 @@ def _transform(self, X, handle_unknown='error', handle_missing=None):
151
149
check_unknown = False )
152
150
X_int [:, i ] = encoded
153
151
152
+ if (self .handle_missing
628C
== 'indicator' and
153
+ _object_dtype_isnan (Xi ).sum () > 0 ):
154
+ self .categories_ [i ] = np .append (
155
+ np .array (self .categories_ [i ], dtype = object ), None )
156
+
154
157
return X_int , X_mask
155
158
156
159
def _more_tags (self ):
@@ -230,7 +233,8 @@ class OneHotEncoder(_BaseEncoder):
230
233
will be denoted as None.
231
234
232
235
handle_missing : {'indicator', 'all-zero'}, default=None
233
- Specify how to handle missing categorical features (NaN) in the training data
236
+ Specify how to handle missing categorical features (NaN) in the
237
+ training data
234
238
235
239
- None : Raise an error in the presence of NaN (the default).
236
240
- 'indicator': Represent with a separate one-hot column.
@@ -310,7 +314,8 @@ class OneHotEncoder(_BaseEncoder):
310
314
"""
311
315
312
316
def __init__ (self , categories = 'auto' , drop = None , sparse = True ,
313
- dtype = np .float64 , handle_unknown = 'error' , handle_missing = None ):
317
+ dtype = np .float64 , handle_unknown = 'error' ,
318
+ handle_missing = None ):
314
319
self .categories = categories
315
320
self .sparse = sparse
316
321
self .dtype = dtype
@@ -441,7 +446,8 @@ def transform(self, X):
441
446
check_is_fitted (self )
442
447
# validation of X happens in _check_X called by _transform
443
448
X_int , X_mask = self ._transform (
444
- X , handle_unknown = self .handle_unknown , handle_missing = self .handle_missing )
449
+ X , handle_unknown = self .handle_unknown ,
450
+ handle_missing = self .handle_missing )
445
451
446
452
n_samples , n_features = X_int .shape
447
453
@@ -486,7 +492,6 @@ def transform(self, X):
486
492
else :
487
493
return out
488
494
489
-
490
495
def inverse_transform (self , X ):
491
496
"""
492
497
Convert the data back to the original representation.
@@ -549,7 +554,8 @@ def inverse_transform(self, X):
549
554
# for sparse X argmax returns 2D matrix, ensure 1D array
550
555
labels = np .asarray (sub .argmax (axis = 1 )).flatten ()
551
556
X_tr [:, i ] = cats [labels ]
552
- if self .handle_unknown == 'ignore' or self .handle_missing == 'all-zero' :
557
+ if (self .handle_unknown == 'ignore' or
558
+ self .handle_missing == 'all-zero' ):
553
559
unknown = np .asarray (sub .sum (axis = 1 ) == 0 ).flatten ()
554
560
# ignored unknown categories: we have a row of all zero
555
561
if unknown .any ():
0 commit comments