[MRG] Use hashtable (python sets/dicts) for object dtype data in Encoders by jorisvandenbossche · Pull Request #10209 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG] Use hashtable (python sets/dicts) for object dtype data in Encoders #10209

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jul 3, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 22 additions & 22 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from ..utils import deprecated
from ..utils.fixes import _argmax
from ..utils.validation import check_is_fitted, FLOAT_DTYPES
from .label import LabelEncoder
from .label import _encode, _encode_check_unknown


range = six.moves.range
Expand Down Expand Up @@ -104,32 +104,30 @@ def _fit(self, X, handle_unknown='error'):
n_samples, n_features = X.shape

if self._categories != 'auto':
for cats in self._categories:
if not np.all(np.sort(cats) == np.array(cats)):
raise ValueError("Unsorted categories are not yet "
"supported")
if X.dtype != object:
for cats in self._categories:
if not np.all(np.sort(cats) == np.array(cats)):
raise ValueError("Unsorted categories are not "
"supported for numerical categories")
if len(self._categories) != n_features:
raise ValueError("Shape mismatch: if n_values is an array,"
" it has to be of shape (n_features,).")

self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]
self.categories_ = []

for i in range(n_features):
le = self._label_encoders_[i]
Xi = X[:, i]
if self._categories == 'auto':
le.fit(Xi)
cats = _encode(Xi)
else:
if handle_unknown == 'error':
valid_mask = np.in1d(Xi, self._categories[i])
if not np.all(valid_mask):
diff = np.unique(Xi[~valid_mask])
cats = np.array(self._categories[i], dtype=X.dtype)
if self.handle_unknown == 'error':
diff = _encode_check_unknown(Xi, cats)
if diff:
msg = ("Found unknown categories {0} in column {1}"
" during fit".format(diff, i))
raise ValueError(msg)
le.classes_ = np.array(self._categories[i], dtype=X.dtype)

self.categories_ = [le.classes_ for le in self._label_encoders_]
self.categories_.append(cats)

def _transform(self, X, handle_unknown='error'):

Expand All @@ -145,11 +143,11 @@ def _transform(self, X, handle_unknown='error'):

for i in range(n_features):
Xi = X[:, i]
valid_mask = np.in1d(Xi, self.categories_[i])
diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
return_mask=True)

if not np.all(valid_mask):
if handle_unknown == 'error':
diff = np.unique(X[~valid_mask, i])
msg = ("Found unknown categories {0} in column {1}"
" during transform".format(diff, i))
raise ValueError(msg)
Expand All @@ -160,7 +158,8 @@ def _transform(self, X, handle_unknown='error'):
X_mask[:, i] = valid_mask
Xi = Xi.copy()
Xi[~valid_mask] = self.categories_[i][0]
X_int[:, i] = self._label_encoders_[i].transform(Xi)
_, encoded = _encode(Xi, self.categories_[i], encode=True)
X_int[:, i] = encoded

return X_int, X_mask

Expand Down Expand Up @@ -195,8 +194,9 @@ class OneHotEncoder(_BaseEncoder):

- 'auto' : Determine categories automatically from the training data.
- list : ``categories[i]`` holds the categories expected in the ith
column. The passed categories must be sorted and should not mix
strings and numeric values.
column. The passed categories should not mix strings and numeric
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

clarify that this is "within a single feature".

values within a single feature, and should be sorted in case of
numeric values.

The used categories can be found in the ``categories_`` attribute.

Expand Down Expand Up @@ -713,8 +713,8 @@ class OrdinalEncoder(_BaseEncoder):

- 'auto' : Determine categories automatically from the training data.
- list : ``categories[i]`` holds the categories expected in the ith
column. The passed categories must be sorted and should not mix
strings and numeric values.
column. The passed categories should not mix strings and numeric
values, and should be sorted in case of numeric values.

The used categories can be found in the ``categories_`` attribute.

Expand Down
135 changes: 127 additions & 8 deletions sklearn/preprocessing/label.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,129 @@
]


def _encode_numpy(values, uniques=None, encode=False):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a note that this should be accessed through _encode, where parameters are described.

# only used in _encode below, see docstring there for details
if uniques is None:
if encode:
uniques, encoded = np.unique(values, return_inverse=True)
return uniques, encoded
else:
# unique sorts
return np.unique(values)
if encode:
diff = _encode_check_unknown(values, uniques)
if diff:
raise ValueError("y contains previously unseen labels: %s"
% str(diff))
encoded = np.searchsorted(uniques, values)
return uniques, encoded
else:
return uniques


def _encode_python(values, uniques=None, encode=False):
# only used in _encode below, see docstring there for details
if uniques is None:
uniques = sorted(set(values))
uniques = np.array(uniques, dtype=values.dtype)
if encode:
table = {val: i for i, val in enumerate(uniques)}
try:
encoded = np.array([table[v] for v in values])
except KeyError as e:
raise ValueError("y contains previously unseen labels: %s"
% str(e))
return uniques, encoded
else:
return uniques


def _encode(values, uniques=None, encode=False):
    """Factorize ``values`` (find its uniques) and optionally encode them.

    Object dtype input is handled by a pure python implementation; every
    other dtype goes through the numpy implementation.
    The numpy implementation requires `uniques` to be sorted. This is
    assumed, not verified: callers are responsible for guaranteeing it
    for all non-object values.

    Parameters
    ----------
    values : array
        Values to factorize or encode.
    uniques : array, optional
        When given, the uniques are not inferred from `values` (for
        instance because the user specified the categories, or because
        they were already determined during fit).
    encode : bool, default False
        When True, additionally encode the values as integer codes based
        on `uniques`.

    Returns
    -------
    uniques
        If ``encode=False``. Sorted when the `uniques` parameter was None
        (i.e. when the uniques were inferred from the data).
    (uniques, encoded)
        If ``encode=True``.

    """
    is_object = values.dtype == object
    backend = _encode_python if is_object else _encode_numpy
    return backend(values, uniques, encode)


def _encode_check_unknown(values, uniques, return_mask=False):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure that the name should start with _, since this is used in a different file. Checking the code base, we are not really consistent about this. @jnothman what do you recommend?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No problem having private internal utils

"""
Helper function to check for unknowns in values to be encoded.

Uses pure python method for object dtype, and numpy method for
all other dtypes.

Parameters
----------
values : array
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can they be array-like or only a numpy array? (The same question applies to the other occurrences.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

only numpy array (first thing I do is checking .dtype), but this is only used internally and thus we control it is always passed a numpy array

Values to check for unknowns.
uniques : array
Allowed uniques values.
return_mask : bool, default False
If True, return a mask of the same shape as `values` indicating
the valid values.

Returns
-------
diff : list
The unique values present in `values` and not in `uniques` (the
unknown values).
valid_mask : boolean array
Additionally returned if ``return_mask=True``.

"""
if values.dtype == object:
uniques_set = set(uniques)
diff = list(set(values) - uniques_set)
if return_mask:
if diff:
valid_mask = np.array([val in uniques_set for val in values])
else:
valid_mask = np.ones(len(values), dtype=bool)
return diff, valid_mask
else:
return diff
else:
unique_values = np.unique(values)
diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True))
if return_mask:
if diff:
valid_mask = np.in1d(values, uniques)
else:
valid_mask = np.ones(len(values), dtype=bool)
return diff, valid_mask
else:
return diff


class LabelEncoder(BaseEstimator, TransformerMixin):
"""Encode labels with value between 0 and n_classes-1.

Expand Down Expand Up @@ -94,7 +217,7 @@ def fit(self, y):
self : returns an instance of self.
"""
y = column_or_1d(y, warn=True)
self.classes_ = np.unique(y)
self.classes_ = _encode(y)
return self

def fit_transform(self, y):
Expand All @@ -110,7 +233,7 @@ def fit_transform(self, y):
y : array-like of shape [n_samples]
"""
y = column_or_1d(y, warn=True)
self.classes_, y = np.unique(y, return_inverse=True)
self.classes_, y = _encode(y, encode=True)
return y

def transform(self, y):
Expand All @@ -131,12 +254,8 @@ def transform(self, y):
if _num_samples(y) == 0:
return np.array([])

classes = np.unique(y)
if len(np.intersect1d(classes, self.classes_)) < len(classes):
diff = np.setdiff1d(classes, self.classes_)
raise ValueError(
"y contains previously unseen labels: %s" % str(diff))
return np.searchsorted(self.classes_, y)
_, y = _encode(y, uniques=self.classes_, encode=True)
return y

def inverse_transform(self, y):
"""Transform labels back to original encoding.
Expand Down
51 changes: 34 additions & 17 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,10 +339,10 @@ def test_one_hot_encoder_set_params():


def check_categorical_onehot(X):
enc = OneHotEncoder()
enc = OneHotEncoder(categories='auto')
Xtr1 = enc.fit_transform(X)

enc = OneHotEncoder(sparse=False)
enc = OneHotEncoder(categories='auto', sparse=False)
Xtr2 = enc.fit_transform(X)

assert_allclose(Xtr1.toarray(), Xtr2)
Expand All @@ -351,17 +351,20 @@ def check_categorical_onehot(X):
return Xtr1.toarray()


def test_one_hot_encoder():
X = [['abc', 1, 55], ['def', 2, 55]]

@pytest.mark.parametrize("X", [
[['def', 1, 55], ['abc', 2, 55]],
np.array([[10, 1, 55], [5, 2, 55]]),
np.array([['b', 'A', 'cat'], ['a', 'B', 'cat']], dtype=object)
], ids=['mixed', 'numeric', 'object'])
def test_one_hot_encoder(X):
Xtr = check_categorical_onehot(np.array(X)[:, [0]])
assert_allclose(Xtr, [[1, 0], [0, 1]])
assert_allclose(Xtr, [[0, 1], [1, 0]])

Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])

Xtr = OneHotEncoder().fit_transform(X)
assert_allclose(Xtr.toarray(), [[1, 0, 1, 0, 1], [0, 1, 0, 1, 1]])
Xtr = OneHotEncoder(categories='auto').fit_transform(X)
assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])


def test_one_hot_encoder_inverse():
Expand Down Expand Up @@ -449,7 +452,8 @@ def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
# when specifying categories manually, unknown categories should already
# raise when fitting
enc = OneHotEncoder(categories=cats)
assert_raises(ValueError, enc.fit, X2)
with pytest.raises(ValueError, match="Found unknown categories"):
enc.fit(X2)
enc = OneHotEncoder(categories=cats, handle_unknown='ignore')
exp = np.array([[1., 0., 0.], [0., 0., 0.]])
assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp)
Expand All @@ -458,10 +462,20 @@ def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
def test_one_hot_encoder_unsorted_categories():
X = np.array([['a', 'b']], dtype=object).T

# unsorted passed categories raises for now
enc = OneHotEncoder(categories=[['c', 'b', 'a']])
msg = re.escape('Unsorted categories are not yet supported')
assert_raises_regex(ValueError, msg, enc.fit_transform, X)
enc = OneHotEncoder(categories=[['b', 'a', 'c']])
exp = np.array([[0., 1., 0.],
[1., 0., 0.]])
assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
assert_array_equal(enc.fit_transform(X).toarray(), exp)
assert enc.categories_[0].tolist() == ['b', 'a', 'c']
assert np.issubdtype(enc.categories_[0].dtype, np.object_)

# unsorted passed categories still raise for numerical values
X = np.array([[1, 2]]).T
enc = OneHotEncoder(categories=[[2, 1, 3]])
msg = 'Unsorted categories are not supported'
with pytest.raises(ValueError, match=msg):
enc.fit_transform(X)


def test_one_hot_encoder_specified_categories_mixed_columns():
Expand All @@ -487,9 +501,12 @@ def test_one_hot_encoder_pandas():
assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])


def test_ordinal_encoder():
X = [['abc', 2, 55], ['def', 1, 55]]

@pytest.mark.parametrize("X", [
[['abc', 2, 55], ['def', 1, 55]],
np.array([[10, 2, 55], [20, 1, 55]]),
np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object)
], ids=['mixed', 'numeric', 'object'])
def test_ordinal_encoder(X):
enc = OrdinalEncoder()
exp = np.array([[0, 1, 0],
[1, 0, 0]], dtype='int64')
Expand Down
Loading
0