@@ -1745,12 +1745,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
1745
1745
Parameters
1746
1746
----------
1747
1747
values : 'auto', 'seen', int, list of ints, or list of lists of objects
1748
- - 'auto' : determine set of values from training data. If the input
1749
- is an int array, values are determined from range in
1750
- training data. For all other inputs, only values observed
1751
- during `fit` are considered valid values for each feature.
1752
- - 'seen': Only values observed during `fit` are considered valid
1753
- values for each feature.
1748
+ - 'auto' : determine set of values from training data. See the
1749
+ documentation of `handle_unknown` for which values are considered
1750
+ acceptable.
1754
1751
- int : values are in ``range(values)`` for all features
1755
1752
- list of ints : values for feature ``i`` are in ``range(values[i])``
1756
1753
- list of lists : values for feature ``i`` are in ``values[i]``
@@ -1771,8 +1768,12 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
1771
1768
Will return sparse matrix if set True else will return an array.
1772
1769
1773
1770
handle_unknown : str, 'error', 'error-strict' or 'ignore'
1774
- Whether to raise an error or ignore if a unknown categorical feature is
1775
- present during transform.
1771
+
1772
+ - 'ignore': Ignore all unknown feature values.
1773
+ - 'error': Raise an error when the value of a feature is unseen during
1774
+ `fit` and out of range of values seen during `fit`.
1775
+ - 'error-strict': Raise an error when the value of a feature is unseen
1776
+ during `fit`.
1776
1777
1777
1778
copy : bool, default=True
1778
1779
If unset, `X` may be modified in place.
@@ -1850,6 +1851,8 @@ def _fit(self, X):
1850
1851
1851
1852
self ._n_features = n_features
1852
1853
self .label_encoders_ = [LabelEncoder () for i in range (n_features )]
1854
+ # Maximum value for each feature
1855
+ self ._max_values = [None for i in range (n_features )]
1853
1856
1854
1857
if self .n_values is not None :
1855
1858
warnings .warn ('The parameter `n_values` is deprecated, use the'
@@ -1878,9 +1881,9 @@ def _fit(self, X):
1878
1881
1879
1882
for i in range (n_features ):
1880
1883
le = self .label_encoders_ [i ]
1884
+
1885
+ self ._max_values [i ] = np .max (X [:, i ])
1881
1886
if self .values == 'auto' :
1882
- le .fit (np .arange (1 + np .max (X [:, i ])))
1883
- elif self .values == 'seen' :
1884
1887
le .fit (X [:, i ])
1885
1888
elif isinstance (self .values , numbers .Integral ):
1886
1889
if (np .max (X , axis = 0 ) >= self .values ).any ():
@@ -1931,14 +1934,27 @@ def _transform(self, X):
1931
1934
valid_mask = in1d (X [:, i ], self .label_encoders_ [i ].classes_ )
1932
1935
1933
1936
if not np .all (valid_mask ):
1934
-
1935
- if self .handle_unknown == 'error' :
1937
+ if self .handle_unknown in ['error' , 'error-strict' ]:
1936
1938
diff = setdiff1d (X [:, i ], self .label_encoders_ [i ].classes_ )
1937
- msg = 'Unknown feature(s) %s in column %d' % (diff , i )
1938
- raise ValueError (msg )
1939
+ if self .handle_unknown == 'error-strict' :
1940
+ msg = 'Unknown feature(s) %s in column %d' % (diff , i )
1941
+ raise ValueError (msg )
1942
+ else :
1943
+ if np .all (diff <= self ._max_values [i ]):
1944
+ msg = ('Values %s for feature %d are unknown but '
1945
+ 'in range. This will raise an error in '
1946
+ 'future versions.' % (str (diff ), i ))
1947
+ warnings .warn (FutureWarning (msg ))
1948
+ X_mask [:, i ] = valid_mask
1949
+ le = self .label_encoders_ [i ]
1950
+ X [:, i ][~ valid_mask ] = le .classes_ [0 ]
1951
+ else :
1952
+ msg = ('Unknown feature(s) %s in column %d' %
1953
+ (diff , i ))
1954
+ raise ValueError (msg )
1939
1955
elif self .handle_unknown == 'ignore' :
1940
1956
# Set the problematic rows to an acceptable value and
1941
- # continue ` The rows are marked in `X_mask` and will be
1957
+ # continue. The rows are marked in `X_mask` and will be
1942
1958
# removed later.
1943
1959
X_mask [:, i ] = valid_mask
1944
1960
X [:, i ][~ valid_mask ] = self .label_encoders_ [i ].classes_ [0 ]
0 commit comments