8000 [WIP] 'most_frequent' drop method for OneHotEncoder by trewaite · Pull Request #18678 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[WIP] 'most_frequent' drop method for OneHotEncoder #18678

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 25 additions & 3 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,19 @@ def _fit(self, X, handle_unknown='error'):
" it has to be of shape (n_features,).")

self.categories_ = []
self.categories_count_ = []

for i in range(n_features):
Xi = X_list[i]
if self.categories == 'auto':
cats = _unique(Xi)
cats, cats_count = _unique(Xi, return_counts=True)
else:
cats = np.array(self.categories[i], dtype=Xi.dtype)
cats_, cats_count = _unique(Xi, return_counts=True)
missing = np.setdiff1d(cats, cats_, assume_unique=True)
cats_count = np.append(cats_count, np.zeros(missing.shape[0]))
idx = np.argsort(cats)
cats_count = cats_count[idx]
if Xi.dtype != object:
if not np.all(np.sort(cats) == cats):
raise ValueError("Unsorted categories are not "
Expand All @@ -97,6 +103,7 @@ def _fit(self, X, handle_unknown='error'):
msg = ("Found unknown categories {0} in column {1}"
" during fit".format(diff, i))
raise ValueError(msg)
self.categories_count_.append(cats_count)
self.categories_.append(cats)

def _transform(self, X, handle_unknown='error'):
Expand Down Expand Up @@ -205,6 +212,8 @@ class OneHotEncoder(_BaseEncoder):
- 'if_binary' : drop the first category in each feature with two
categories. Features with 1 or more than 2 categories are
left intact.
- 'most_frequent' : drop the most frequent category in each feature.
If several categories tie for the highest frequency (including when all
frequencies are equal), the first of them is dropped.
- array : ``drop[i]`` is the category in feature ``X[:, i]`` that
should be dropped.

Expand All @@ -230,6 +239,13 @@ class OneHotEncoder(_BaseEncoder):
of ``transform``). This includes the category specified in ``drop``
(if any).

categories_count_ : list of arrays
The number of occurrences of each category within each feature,
determined during fitting (in order of the features in X and
corresponding with the output of ``transform``). This includes the
category specified in ``drop`` (if any).

drop_idx_ : array of shape (n_features,)
- ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
to be dropped for each feature.
Expand Down Expand Up @@ -327,10 +343,15 @@ def _compute_drop_idx(self):
elif self.drop == 'if_binary':
return np.array([0 if len(cats) == 2 else None
for cats in self.categories_], dtype=object)
elif self.drop == 'most_frequent':
return np.array([np.argmax(cats_count)
for cats_count in self.categories_count_],
dtype=object)
else:
msg = (
"Wrong input for parameter `drop`. Expected "
"'first', 'if_binary', None or array of objects, got {}"
"'first', 'if_binary', 'most_frequent', None "
"or array of objects, got {}"
)
raise ValueError(msg.format(type(self.drop)))

Expand All @@ -341,7 +362,8 @@ def _compute_drop_idx(self):
except (ValueError, TypeError):
msg = (
"Wrong input for parameter `drop`. Expected "
"'first', 'if_binary', None or array of objects, got {}"
"'first', 'if_binary', 'most_frequent', None "
"or array of objects, got {}"
)
raise ValueError(msg.format(type(self.drop)))
if droplen != len(self.categories_):
Expand Down
13 changes: 13 additions & 0 deletions sklearn/preprocessing/tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,3 +757,16 @@ def test_encoders_does_not_support_none_values(Encoder):
with pytest.raises(TypeError, match="Encoders require their input to be "
"uniformly strings or numbers."):
Encoder().fit(values)


def test_drop_most_frequent():
    """Check that ``drop='most_frequent'`` removes the modal category.

    The encoder is fitted on a small mixed string/integer table, the dense
    one-hot output is compared with a hand-computed matrix, and the encoding
    is then inverted to confirm that the original rows are recovered.
    """
    data = [['abc', 12, 2, 55],
            ['def', 12, 1, 55],
            ['def', 12, 3, 56]]
    # Dropped per feature: 'def' (2 of 3 rows), 12 (the only category),
    # 1 (all counts tie, so the first category goes) and 55 (2 of 3 rows).
    # The remaining output columns are therefore 'abc', 2, 3 and 56.
    expected = [[1, 1, 0, 0],
                [0, 0, 0, 0],
                [0, 0, 1, 1]]

    encoder = OneHotEncoder(drop='most_frequent')
    encoded = encoder.fit_transform(data).toarray()
    assert_array_equal(encoded, expected)

    # An all-zero group of columns must decode back to the dropped category.
    recovered = encoder.inverse_transform(encoded)
    assert_array_equal(recovered, np.array(data, dtype=object))
23 changes: 17 additions & 6 deletions sklearn/utils/_encode.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import numpy as np
from collections import Counter


def _unique(values, *, return_inverse=False):
def _unique(values, *, return_inverse=False, return_counts=False):
"""Helper function to find unique values with support for python objects.

Uses pure python method for object dtype, and numpy method for
Expand All @@ -25,22 +26,32 @@ def _unique(values, *, return_inverse=False):
Only provided if `return_inverse` is True.
"""
if values.dtype == object:
return _unique_python(values, return_inverse=return_inverse)
return _unique_python(values,
return_inverse=return_inverse,
return_counts=return_counts)
# numerical
return np.unique(values, return_inverse=return_inverse)
return np.unique(values,
return_inverse=return_inverse,
return_counts=return_counts)


def _unique_python(values, *, return_inverse):
def _unique_python(values, *, return_inverse, return_counts):
# Only used in `_unique`, see docstring there for details
try:
uniques = sorted(set(values))
uniques = np.array(uniques, dtype=values.dtype)
if return_counts:
ctr = Counter(values)
ctr = dict(sorted(ctr.items()))
uniques = (np.array(list(ctr.keys()), dtype=values.dtype),
np.array(list(ctr.values()), dtype=values.dtype))
else:
uniques = np.array(uniques, dtype=values.dtype)
except TypeError:
types = sorted(t.__qualname__
for t in set(type(v) for v in values))
raise TypeError("Encoders require their input to be uniformly "
f"strings or numbers. Got {types}")

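# TODO: support return_inverse together with return_counts. When
# return_counts is True, `uniques` is a (values, counts) tuple at this
# point, so the lookup table below must be built from the values only.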
if return_inverse:
table = {val: i for i, val in enumerate(uniques)}
inverse = np.array([table[v] for v in values])
Expand Down
0