@@ -925,6 +925,10 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
925
925
sparse : boolean, default=True
926
926
Will return sparse matrix if set True else will return an array.
927
927
928
+ handle_unknown : str, 'error' or 'ignore'
929
+ Whether to raise an error or ignore if an unknown categorical feature is
930
+ present during transform.
931
+
928
932
Attributes
929
933
----------
930
934
active_features_ : array
@@ -951,7 +955,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
951
955
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
952
956
[1, 0, 2]]) # doctest: +ELLIPSIS
953
957
OneHotEncoder(categorical_features='all', dtype=<... 'float'>,
954
- n_values='auto', sparse=True)
958
+ handle_unknown='error', n_values='auto', sparse=True)
955
959
>>> enc.n_values_
956
960
array([2, 3, 4])
957
961
>>> enc.feature_indices_
@@ -967,11 +971,12 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
967
971
encoding of dictionary items or strings.
968
972
"""
969
973
def __init__ (self , n_values = "auto" , categorical_features = "all" ,
970
- dtype = np .float , sparse = True ):
974
+ dtype = np .float , sparse = True , handle_unknown = 'error' ):
971
975
self .n_values = n_values
972
976
self .categorical_features = categorical_features
973
977
self .dtype = dtype
974
978
self .sparse = sparse
979
+ self .handle_unknown = handle_unknown
975
980
976
981
def fit (self , X , y = None ):
977
982
"""Fit OneHotEncoder to X.
@@ -1056,13 +1061,23 @@ def _transform(self, X):
1056
1061
" Expected %d, got %d."
1057
1062
% (indices .shape [0 ] - 1 , n_features ))
1058
1063
1059
- if (np .max (X , axis = 0 ) >= self .n_values_ ).any ():
1060
- raise ValueError ("Feature out of bounds. Try setting n_values." )
1061
-
1062
- column_indices = (X + indices [:- 1 ]).ravel ()
1064
+ # We use only those categorical features of X that are known using fit.
1065
+ # i.e. those less than n_values_, selected using mask.
1066
+ # This means, if self.handle_unknown is "ignore", the row_indices and
1067
+ # column_indices corresponding to the unknown categorical feature are ignored.
1068
+ mask = (X < self .n_values_ ).ravel ()
1069
+ if np .any (~ mask ):
1070
+ if self .handle_unknown not in ['error' , 'ignore' ]:
1071
+ raise ValueError ("handle_unknown should be either error or "
1072
+ "unknown got %s" % self .handle_unknown )
1073
+ if self .handle_unknown == 'error' :
1074
+ raise ValueError ("unknown categorical feature present %s "
1075
+ "during transform." % X [~ mask ])
1076
+
1077
+ column_indices = (X + indices [:- 1 ]).ravel ()[mask ]
1063
1078
row_indices = np .repeat (np .arange (n_samples , dtype = np .int32 ),
1064
- n_features )
1065
- data = np .ones (n_samples * n_features )
1079
+ n_features )[ mask ]
1080
+ data = np .ones (np . sum ( mask ) )
1066
1081
out = sparse .coo_matrix ((data , (row_indices , column_indices )),
1067
1082
shape = (n_samples , indices [- 1 ]),
1068
1083
dtype = self .dtype ).tocsr ()
0 commit comments