-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
[MRG] Use hashtable (python sets/dicts) for object dtype data in Encoders #10209
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
081e784
391f007
c2efafe
9131c46
ed64b91
7268e6b
787d617
7502e5e
f7e195a
bb15f13
3d1281c
626d217
662da67
ea189ce
0bbbe82
4365536
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,129 @@ | |
] | ||
|
||
|
||
def _encode_numpy(values, uniques=None, encode=False): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a note that this should be accessed through |
||
# only used in _encode below, see docstring there for details | ||
if uniques is None: | ||
if encode: | ||
uniques, encoded = np.unique(values, return_inverse=True) | ||
return uniques, encoded | ||
else: | ||
# unique sorts | ||
return np.unique(values) | ||
if encode: | ||
diff = _encode_check_unknown(values, uniques) | ||
if diff: | ||
raise ValueError("y contains previously unseen labels: %s" | ||
% str(diff)) | ||
encoded = np.searchsorted(uniques, values) | ||
|
||
else: | ||
return uniques | ||
|
||
|
||
def _encode_python(values, uniques=None, encode=False): | ||
# only used in _encode below, see docstring there for details | ||
if uniques is None: | ||
uniques = sorted(set(values)) | ||
uniques = np.array(uniques, dtype=values.dtype) | ||
if encode: | ||
table = {val: i for i, val in enumerate(uniques)} | ||
try: | ||
encoded = np.array([table[v] for v in values]) | ||
except KeyError as e: | ||
raise ValueError("y contains previously unseen labels: %s" | ||
% str(e)) | ||
return uniques, encoded | ||
else: | ||
return uniques | ||
|
||
|
||
def _encode(values, uniques=None, encode=False):
    """Find the unique values of an array and optionally encode it.

    Object dtype arrays are handled by the pure python implementation,
    every other dtype by the numpy implementation.
    The numpy implementation requires `uniques` to be sorted. This is
    assumed, not verified: the caller is responsible for passing sorted
    `uniques` for all non-object values.

    Parameters
    ----------
    values : array
        Values to factorize or encode.
    uniques : array, optional
        When given, the uniques are not derived from `values` (e.g. the
        user specified the categories, or they were already determined
        in fit).
    encode : bool, default False
        If True, additionally encode the values into integer codes based
        on `uniques`.

    Returns
    -------
    uniques
        If ``encode=False``. Sorted when the `uniques` parameter was None
        (and the uniques were therefore inferred from the data).
    (uniques, encoded)
        If ``encode=True``.

    """
    # pick the implementation once, then delegate with the same arguments
    encoder = _encode_python if values.dtype == object else _encode_numpy
    return encoder(values, uniques, encode)
|
||
|
||
def _encode_check_unknown(values, uniques, return_mask=False): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure that we should start with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No problem having private internal utils |
||
""" | ||
Helper function to check for unknowns in values to be encoded. | ||
|
||
Uses pure python method for object dtype, and numpy method for | ||
all other dtypes. | ||
|
||
Parameters | ||
---------- | ||
values A3E2 : array | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can they be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. only numpy array (first thing I do is checking |
||
Values to check for unknowns. | ||
uniques : array | ||
Allowed uniques values. | ||
return_mask : bool, default False | ||
If True, return a mask of the same shape as `values` indicating | ||
the valid values. | ||
|
||
Returns | ||
------- | ||
diff : list | ||
The unique values present in `values` and not in `uniques` (the | ||
unknown values). | ||
valid_mask : boolean array | ||
Additionally returned if ``return_mask=True``. | ||
|
||
""" | ||
if values.dtype == object: | ||
uniques_set = set(uniques) | ||
diff = list(set(values) - uniques_set) | ||
if return_mask: | ||
if diff: | ||
valid_mask = np.array([val in uniques_set for val in values]) | ||
else: | ||
valid_mask = np.ones(len(values), dtype=bool) | ||
return diff, valid_mask | ||
else: | ||
return diff | ||
else: | ||
unique_values = np.unique(values) | ||
diff = list(np.setdiff1d(unique_values, uniques, assume_unique=True)) | ||
if return_mask: | ||
if diff: | ||
valid_mask = np.in1d(values, uniques) | ||
else: | ||
valid_mask = np.ones(len(values), dtype=bool) | ||
return diff, valid_mask | ||
else: | ||
return diff | ||
|
||
|
||
class LabelEncoder(BaseEstimator, TransformerMixin): | ||
"""Encode labels with value between 0 and n_classes-1. | ||
|
||
|
@@ -94,7 +217,7 @@ def fit(self, y): | |
self : returns an instance of self. | ||
""" | ||
y = column_or_1d(y, warn=True) | ||
self.classes_ = np.unique(y) | ||
self.classes_ = _encode(y) | ||
return self | ||
|
||
def fit_transform(self, y): | ||
|
@@ -110,7 +233,7 @@ def fit_transform(self, y): | |
y : array-like of shape [n_samples] | ||
""" | ||
y = column_or_1d(y, warn=True) | ||
self.classes_, y = np.unique(y, return_inverse=True) | ||
self.classes_, y = _encode(y, encode=True) | ||
return y | ||
|
||
def transform(self, y): | ||
|
@@ -131,12 +254,8 @@ def transform(self, y): | |
if _num_samples(y) == 0: | ||
return np.array([]) | ||
|
||
classes = np.unique(y) | ||
if len(np.intersect1d(classes, self.classes_)) < len(classes): | ||
diff = np.setdiff1d(classes, self.classes_) | ||
raise ValueError( | ||
"y contains previously unseen labels: %s" % str(diff)) | ||
return np.searchsorted(self.classes_, y) | ||
_, y = _encode(y, uniques=self.classes_, encode=True) | ||
return y | ||
|
||
def inverse_transform(self, y): | ||
"""Transform labels back to original encoding. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
clarify that this is "within a single feature".