Merge pull request #3800 from MechCoder/unknown_transform · scikit-learn/scikit-learn@9c1c811 · GitHub
[go: up one dir, main page]

Skip to content

Commit 9c1c811

Browse files
committed
Merge pull request #3800 from MechCoder/unknown_transform
Handle_unknown option to OneHotEncoder
2 parents a383729 + 72db6ae commit 9c1c811

File tree

4 files changed

+52
-9
lines changed

4 files changed

+52
-9
lines changed

doc/modules/preprocessing.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,7 @@ Continuing the example above::
343343
>>> enc = preprocessing.OneHotEncoder()
344344
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS
345345
OneHotEncoder(categorical_features='all', dtype=<... 'float'>,
346-
n_values='auto', sparse=True)
346+
handle_unknown='error', n_values='auto', sparse=True)
347347
>>> enc.transform([[0, 1, 3]]).toarray()
348348
array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])
349349

doc/whats_new.rst

+4
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ Enhancements
8787
- Add a quantile prediction strategy to the :class:`dummy.DummyRegressor`.
8888
By `Aaron Staple`_.
8989

90+
- Add ``handle_unknown`` option to :class:`preprocessing.OneHotEncoder` to
91+
handle unknown categorical features more gracefully during transform.
92+
By `Manoj Kumar`_.
93+
9094
Documentation improvements
9195
..........................
9296

sklearn/preprocessing/data.py

+23-8
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,10 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
925925
sparse : boolean, default=True
926926
Will return sparse matrix if set True else will return an array.
927927
928+
handle_unknown : str, 'error' or 'ignore'
929+
Whether to raise an error or ignore if an unknown categorical feature is
930+
present during transform.
931+
928932
Attributes
929933
----------
930934
active_features_ : array
@@ -951,7 +955,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
951955
>>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \
952956
[1, 0, 2]]) # doctest: +ELLIPSIS
953957
OneHotEncoder(categorical_features='all', dtype=<... 'float'>,
954-
n_values='auto', sparse=True)
958+
handle_unknown='error', n_values='auto', sparse=True)
955959
>>> enc.n_values_
956960
array([2, 3, 4])
957961
>>> enc.feature_indices_
@@ -967,11 +971,12 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
967971
encoding of dictionary items or strings.
968972
"""
969973
def __init__(self, n_values="auto", categorical_features="all",
970-
dtype=np.float, sparse=True):
974+
dtype=np.float, sparse=True, handle_unknown='error'):
971975
self.n_values = n_values
972976
self.categorical_features = categorical_features
973977
self.dtype = dtype
974978
self.sparse = sparse
979+
self.handle_unknown = handle_unknown
975980

976981
def fit(self, X, y=None):
977982
"""Fit OneHotEncoder to X.
@@ -1056,13 +1061,23 @@ def _transform(self, X):
10561061
" Expected %d, got %d."
10571062
% (indices.shape[0] - 1, n_features))
10581063

1059-
if (np.max(X, axis=0) >= self.n_values_).any():
1060-
raise ValueError("Feature out of bounds. Try setting n_values.")
1061-
1062-
column_indices = (X + indices[:-1]).ravel()
1064+
# We use only those categorical features of X that are known using fit.
1065+
# i.e. less than n_values_ using mask.
1066+
# This means, if self.handle_unknown is "ignore", the row_indices and
1067+
# col_indices corresponding to the unknown categorical feature are ignored.
1068+
mask = (X < self.n_values_).ravel()
1069+
if np.any(~mask):
1070+
if self.handle_unknown not in ['error', 'ignore']:
1071+
raise ValueError("handle_unknown should be either error or "
1072+
"unknown got %s" % self.handle_unknown)
1073+
if self.handle_unknown == 'error':
1074+
raise ValueError("unknown categorical feature present %s "
1075+
"during transform." % X[~mask])
1076+
1077+
column_indices = (X + indices[:-1]).ravel()[mask]
10631078
row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
1064-
n_features)
1065-
data = np.ones(n_samples * n_features)
1079+
n_features)[mask]
1080+
data = np.ones(np.sum(mask))
10661081
out = sparse.coo_matrix((data, (row_indices, column_indices)),
10671082
shape=(n_samples, indices[-1]),
10681083
dtype=self.dtype).tocsr()

sklearn/preprocessing/tests/test_data.py

+24
Original file line numberDiff line numberDiff line change
@@ -802,3 +802,27 @@ def test_one_hot_encoder_categorical_features():
802802
# Edge case: all categorical
803803
cat = [True, True, True]
804804
_check_one_hot(X, X2, cat, 5)
805+
806+
807+
def test_one_hot_encoder_unknown_transform():
808+
X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
809+
y = np.array([[4, 1, 1]])
810+
811+
# Test that one hot encoder raises error for unknown features
812+
# present during transform.
813+
oh = OneHotEncoder(handle_unknown='error')
814+
oh.fit(X)
815+
assert_raises(ValueError, oh.transform, y)
816+
817+
# Test the ignore option, ignores unknown features.
818+
oh = OneHotEncoder(handle_unknown='ignore')
819+
oh.fit(X)
820+
assert_array_equal(
821+
oh.transform(y).toarray(),
822+
np.array([[ 0., 0., 0., 0., 1., 0., 0.]])
823+
)
824+
825+
# Raise error if handle_unknown is neither ignore nor error.
826+
oh = OneHotEncoder(handle_unknown='42')
827+
oh.fit(X)
828+
assert_raises(ValueError, oh.transform, y)

0 commit comments

Comments
 (0)
0