[MRG] ENH: process DataFrames in OneHot/OrdinalEncoder without converting to array #12147 by maikia · Pull Request #13253 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG] ENH: process DataFrames in OneHot/OrdinalEncoder without converting to array #12147 #13253

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Mar 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
55b1374
test for checking the type of the features from OneHotEncoders
maikia Feb 25, 2019
a7b21db
keeps the dtype of every column (encoders)
maikia Feb 25, 2019
638e2c3
some PEP8 violations corrected
maikia Feb 25, 2019
0cbbba7
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
maikia Feb 25, 2019
622bc3a
keeping the same column dtype in _transform as well
maikia Feb 25, 2019
21cba20
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
maikia Feb 25, 2019
68025e9
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
maikia Feb 26, 2019
12f8837
corrected PEP8
maikia Feb 26, 2019
6828fa3
comment added on why we dont check the full pandas dataframe
maikia Feb 26, 2019
fd79f0b
pass only pandas dataframe, make sure it is not a sequence by checkin…
maikia Feb 26, 2019
482f132
tests to make sure X is 2D
maikia Feb 26, 2019
a65ab6d
PEP8
maikia Feb 26, 2019
e278e24
PEP8
maikia Feb 26, 2019
421f8da
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
maikia Feb 26, 2019
85039f6
removed checking for nans multiple times, corrected comment, added te…
maikia Feb 26, 2019
d41ec92
removed _check_X_feature(), now _check_X also does the validation fea…
maikia Feb 27, 2019
959ff84
pep8
maikia Feb 27, 2019
233374c
better heading for _check_X()
maikia Feb 27, 2019
f82efea
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
maikia Feb 27, 2019
f39eee9
few comments added
maikia Feb 28, 2019
9e2a48e
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
maikia Feb 28, 2019
6c0380e
Update sklearn/preprocessing/_encoders.py
jorisvandenbossche Feb 28, 2019
28f6dca
Apply suggestions from code review
jorisvandenbossche Feb 28, 2019
418b2bd
Update sklearn/preprocessing/_encoders.py
GaelVaroquaux Feb 28, 2019
b86edcb
Update sklearn/preprocessing/_encoders.py
GaelVaroquaux Feb 28, 2019
3aba39f
added some comments and cleaned up the code
maikia Feb 28, 2019
cedc230
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
maikia Feb 28, 2019
451d27e
final edits
jorisvandenbossche Feb 28, 2019
041cb29
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
maikia Mar 1, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8000
64 changes: 44 additions & 20 deletions sklearn/preprocessing/_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,39 +38,64 @@ def _check_X(self, X):
- convert list of strings to object dtype
- check for missing values for object dtype data (check_array does
not do that)
- return list of features (arrays): this list of features is
constructed feature by feature to preserve the data types
of pandas DataFrame columns, as otherwise information is lost
and cannot be used, eg for the `categories_` attribute.

"""
X_temp = check_array(X, dtype=None)
if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
X = check_array(X, dtype=np.object)
if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
# if not a dataframe, do normal check_array validation
X_temp = check_array(X, dtype=None)
if (not hasattr(X, 'dtype')
and np.issubdtype(X_temp.dtype, np.str_)):
X = check_array(X, dtype=np.object)
else:
X = X_temp
needs_validation = False
else:
X = X_temp
# pandas dataframe, do validation later column by column, in order
# to keep the dtype information to be used in the encoder.
needs_validation = True

return X
n_samples, n_features = X.shape
X_columns = []

def _fit(self, X, handle_unknown='error'):
X = self._check_X(X)
for i in range(n_features):
Xi = self._get_feature(X, feature_idx=i)
Xi = check_array(Xi, ensure_2d=False, dtype=None,
force_all_finite=needs_validation)
X_columns.append(Xi)

n_samples, n_features = X.shape
return X_columns, n_samples, n_features

def _get_feature(self, X, feature_idx):
if hasattr(X, 'iloc'):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return X.iloc[:, key] if hasattr(X, 'iloc') else X[:, key]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's certainly fewer lines, but I'm not sure it improves readability :-)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Skip it if you think that is best. I don't have a strong opinion here.

8000
# pandas dataframes
return X.iloc[:, feature_idx]
# numpy arrays, sparse arrays
return X[:, feature_idx]

def _fit(self, X, handle_unknown='error'):
X_list, n_samples, n_features = self._check_X(X)

if self._categories != 'auto':
if X.dtype != object:
for cats in self._categories:
if not np.all(np.sort(cats) == np.array(cats)):
raise ValueError("Unsorted categories are not "
"supported for numerical categories")
if len(self._categories) != n_features:
raise ValueError("Shape mismatch: if n_values is an array,"
" it has to be of shape (n_features,).")

self.categories_ = []

for i in range(n_features):
Xi = X[:, i]
Xi = X_list[i]
if self._categories == 'auto':
cats = _encode(Xi)
else:
cats = np.array(self._categories[i], dtype=X.dtype)
cats = np.array(self._categories[i], dtype=Xi.dtype)
if Xi.dtype != object:
if not np.all(np.sort(cats) == cats):
raise ValueError("Unsorted categories are not "
"supported for numerical categories")
if handle_unknown == 'error':
diff = _encode_check_unknown(Xi, cats)
if diff:
Expand All @@ -80,14 +105,13 @@ def _fit(self, X, handle_unknown='error'):
self.categories_.append(cats)

def _transform(self, X, handle_unknown='error'):
X = self._check_X(X)
X_list, n_samples, n_features = self._check_X(X)

_, n_features = X.shape
X_int = np.zeros_like(X, dtype=np.int)
X_mask = np.ones_like(X, dtype=np.bool)
X_int = np.zeros((n_samples, n_features), dtype=np.int)
X_mask = np.ones((n_samples, n_features), dtype=np.bool)

for i in range(n_features):
Xi = X[:, i]
Xi = X_list[i]
diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
return_mask=True)

Expand Down
49 changes: 43 additions & 6 deletions sklearn/preprocessing/tests/test_encoders.py
4730
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,30 @@ def test_one_hot_encoder_inverse(sparse_, drop):
assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)


@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
@pytest.mark.parametrize("X", [
[1, 2],
np.array([3., 4.])
])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pytest.mark.parametrize("method", ['fit', 'fit_transform'])

def test_X_is_not_1D(X, method):
oh = OneHotEncoder()

msg = ("Expected 2D array, got 1D array instead")
with pytest.raises(ValueError, match=msg):
getattr(oh, method)(X)


@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
def test_X_is_not_1D_pandas(method):
pd = pytest.importorskip('pandas')
X = pd.Series([6, 3, 4, 6])
oh = OneHotEncoder()

msg = ("Expected 2D array, got 1D array instead")
with pytest.raises(ValueError, match=msg):
getattr(oh, method)(X)


@pytest.mark.parametrize("X, cat_exp, cat_dtype", [
([['abc', 55], ['def', 55]], [['abc', 'def'], [55]], np.object_),
(np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer),
Expand Down Expand Up @@ -569,8 +593,14 @@ def test_one_hot_encoder_feature_names_unicode():
@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
np.array([['a', np.nan]], dtype=object).T],
ids=['numeric', 'object'])
@pytest.mark.parametrize("as_data_frame", [False, True],
ids=['array', 'dataframe'])
@pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
def test_one_hot_encoder_raise_missing(X, handle_unknown):
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
if as_data_frame:
pd = pytest.importorskip('pandas')
X = pd.DataFrame(X)

ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)

with pytest.raises(ValueError, match="Input contains NaN"):
Expand All @@ -579,7 +609,12 @@ def test_one_hot_encoder_raise_missing(X, handle_unknown):
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit_transform(X)

ohe.fit(X[:1, :])
if as_data_frame:
X_partial = X.iloc[:1, :]
else:
X_partial = X[:1, :]

ohe.fit(X_partial)

with pytest.raises(ValueError, match="Input contains NaN"):
ohe.transform(X)
Expand Down Expand Up @@ -688,16 +723,18 @@ def test_encoder_dtypes_pandas():
pd = pytest.importorskip('pandas')

enc = OneHotEncoder(categories='auto')
exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64')
exp = np.array([[1., 0., 1., 0., 1., 0.],
[0., 1., 0., 1., 0., 1.]], dtype='float64')

X = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, dtype='int64')
X = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}, dtype='int64')
enc.fit(X)
assert all([enc.categories_[i].dtype == 'int64' for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)

X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b']})
X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
X_type = [int, object, float]
enc.fit(X)
assert all([enc.categories_[i].dtype == 'object' for i in range(2)])
assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
assert_array_equal(enc.transform(X).toarray(), exp)


Expand Down
0