FEA Add make_column_selector for ColumnTransformer by thomasjpfan · Pull Request #12371 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

FEA Add make_column_selector for ColumnTransformer #12371

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Nov 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
3c2f8f4
ENH: Adds make_select_dtypes
thomasjpfan Oct 12, 2018
ff2c7e7
STY: Flake8
thomasjpfan Oct 12, 2018
cbd1169
DOC: Remove example
thomasjpfan Oct 13, 2018
dca0726
Merge remote-tracking branch 'upstream/master' into select_dtypes_fac…
thomasjpfan Oct 14, 2018
fac1f55
ENH: Adds string regex matching
thomasjpfan Oct 14, 2018
ae8ed2f
Merge remote-tracking branch 'upstream/master' into select_dtypes_fac…
thomasjpfan Oct 17, 2019
90e3bde
CLN Cleans up tests and docstrings
thomasjpfan Oct 17, 2019
ff93079
CLN Makes callable picklable
thomasjpfan Oct 17, 2019
71b5bf5
CLN Only uses callable class
thomasjpfan Oct 17, 2019
56c68e9
DOC Address joels comments
thomasjpfan Oct 18, 2019
4ea1f79
Merge remote-tracking branch 'upstream/master' into select_dtypes_fac…
thomasjpfan Oct 22, 2019
81a6ccb
DOC Adds examples
thomasjpfan Oct 22, 2019
149b96a
DOC Less imports
thomasjpfan Oct 22, 2019
4b69b61
BUG Fix in CI
thomasjpfan Oct 22, 2019
2863fab
Reference make_column_transformer when describing columns
jnothman Oct 23, 2019
59f281e
CLN Address comments
thomasjpfan Oct 23, 2019
d535b66
CLN Address comments
thomasjpfan Oct 28, 2019
7b56e46
CLN Convert to list
thomasjpfan Oct 28, 2019
5893a78
DOC Move make_column_selector up in user guide
thomasjpfan Oct 30, 2019
3f9ce99
DOC Skip doctest
thomasjpfan Oct 30, 2019
3fa1f6d
Merge remote-tracking branch 'upstream/master' into select_dtypes_fac…
thomasjpfan Oct 30, 2019
f4feaea
TST Fix tests
thomasjpfan Oct 31, 2019
d970fd2
DOC Fix
thomasjpfan Nov 5, 2019
4d651c1
Merge remote-tracking branch 'upstream/master' into select_dtypes_fac…
thomasjpfan Nov 5, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/modules/classes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ details.
:template: function.rst

compose.make_column_transformer
compose.make_column_selector

.. _covariance_ref:

Expand Down
20 changes: 19 additions & 1 deletion doc/modules/compose.rst
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,25 @@ as most of other transformers expects 2D data, therefore in that case you need
to specify the column as a list of strings (``['city']``).

Apart from a scalar or a single item list, the column selection can be specified
as a list of multiple items, an integer array, a slice, or a boolean mask.
as a list of multiple items, an integer array, a slice, a boolean mask, or
with a :func:`~sklearn.compose.make_column_selector`. The
:func:`~sklearn.compose.make_column_selector` is used to select columns based
on data type or column name::

>>> from sklearn.preprocessing import StandardScaler
>>> from sklearn.compose import make_column_selector
>>> ct = ColumnTransformer([
... ('scale', StandardScaler(),
... make_column_selector(dtype_include=np.number)),
... ('onehot',
... OneHotEncoder(),
... make_column_selector(pattern='city', dtype_include=object))])
>>> ct.fit_transform(X)
array([[ 0.904..., 0. , 1. , 0. , 0. ],
[-1.507..., 1.414..., 1. , 0. , 0. ],
[-0.301..., 0. , 0. , 1. , 0. ],
[ 0.904..., -1.414..., 0. , 0. , 1. ]])

Strings can reference columns if the input is a DataFrame, integers are always
interpreted as the positional columns.

Expand Down
4 changes: 4 additions & 0 deletions doc/whats_new/v0.22.rst
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,10 @@ Changelog
:mod:`sklearn.compose`
......................

- |Feature| Adds :func:`compose.make_column_selector` which is used with
:class:`compose.ColumnTransformer` to select DataFrame columns on the basis
of name and dtype. :pr:`12303` by `Thomas Fan`_.

- |Fix| Fixed a bug in :class:`compose.ColumnTransformer` which failed to
select the proper columns when using a boolean list, with NumPy older than
1.12.
Expand Down
4 changes: 3 additions & 1 deletion sklearn/compose/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@

"""

from ._column_transformer import ColumnTransformer, make_column_transformer
from ._column_transformer import (ColumnTransformer, make_column_transformer,
make_column_selector)
from ._target import TransformedTargetRegressor


__all__ = [
'ColumnTransformer',
'make_column_transformer',
'TransformedTargetRegressor',
'make_column_selector',
]
82 changes: 80 additions & 2 deletions sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
from ..utils.validation import check_array, check_is_fitted


__all__ = ['ColumnTransformer', 'make_column_transformer']
__all__ = [
'ColumnTransformer', 'make_column_transformer', 'make_column_selector'
]


_ERR_MSG_1DCOLUMN = ("1D data passed to a transformer that expects 2D data. "
Expand Down Expand Up @@ -69,7 +71,8 @@ class ColumnTransformer(TransformerMixin, _BaseComposition):
``transformer`` expects X to be a 1d array-like (vector),
otherwise a 2d array will be passed to the transformer.
A callable is passed the input data `X` and can return any of the
above.
above. To select multiple columns by name or dtype, you can use
:obj:`make_column_selector`.

remainder : {'drop', 'passthrough'} or estimator, default 'drop'
By default, only the specified columns in `transformers` are
Expand Down Expand Up @@ -145,6 +148,8 @@ class ColumnTransformer(TransformerMixin, _BaseComposition):
sklearn.compose.make_column_transformer : convenience function for
combining the outputs of multiple transformer objects applied to
column subsets of the original feature space.
sklearn.compose.make_column_selector : convenience function for selecting
columns based on datatype or on column names matching a regex pattern.

Examples
--------
Expand Down Expand Up @@ -759,3 +764,76 @@ def is_neg(x): return isinstance(x, numbers.Integral) and x < 0
elif _determine_key_type(key) == 'int':
return np.any(np.asarray(key) < 0)
return False


class make_column_selector:
    """Create a callable to select columns to be used with
    :class:`ColumnTransformer`.

    :func:`make_column_selector` can select columns based on datatype or the
    columns name with a regex. When using multiple selection criteria, **all**
    criteria must match for a column to be selected.

    Parameters
    ----------
    pattern : str, default=None
        Name of columns containing this regex pattern will be included. If
        None, columns will not be filtered by name.

    dtype_include : column dtype or list of column dtypes, default=None
        A selection of dtypes to include. For more details, see
        :meth:`pandas.DataFrame.select_dtypes`.

    dtype_exclude : column dtype or list of column dtypes, default=None
        A selection of dtypes to exclude. For more details, see
        :meth:`pandas.DataFrame.select_dtypes`.

    Returns
    -------
    selector : callable
        Callable for column selection to be used by a
        :class:`ColumnTransformer`.

    See also
    --------
    sklearn.compose.ColumnTransformer : Class that allows combining the
        outputs of multiple transformer objects used on column subsets
        of the data into a single feature space.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder
    >>> from sklearn.compose import make_column_transformer
    >>> from sklearn.compose import make_column_selector
    >>> import pandas as pd  # doctest: +SKIP
    >>> X = pd.DataFrame({'city': ['London', 'London', 'Paris', 'Sallisaw'],
    ...                   'rating': [5, 3, 4, 5]})  # doctest: +SKIP
    >>> ct = make_column_transformer(
    ...       (StandardScaler(),
    ...        make_column_selector(dtype_include=np.number)),  # rating
    ...       (OneHotEncoder(),
    ...        make_column_selector(dtype_include=object)))  # city
    >>> ct.fit_transform(X)  # doctest: +SKIP
    array([[ 0.90453403,  1.        ,  0.        ,  0.        ],
           [-1.50755672,  1.        ,  0.        ,  0.        ],
           [-0.30151134,  0.        ,  1.        ,  0.        ],
           [ 0.90453403,  0.        ,  0.        ,  1.        ]])
    """

    def __init__(self, pattern=None, dtype_include=None, dtype_exclude=None):
        # Plain attribute storage only: the instance stays picklable and the
        # criteria are evaluated lazily, each time the selector is called.
        self.pattern = pattern
        self.dtype_include = dtype_include
        self.dtype_exclude = dtype_exclude

    def __call__(self, df):
        """Return the names of the columns of ``df`` matching all criteria.

        Parameters
        ----------
        df : pandas DataFrame
            Data whose columns are filtered by dtype and/or name.

        Returns
        -------
        columns : list
            Labels of the selected columns, in their original order. Empty
            when no column satisfies every criterion.

        Raises
        ------
        ValueError
            If ``df`` does not look like a pandas DataFrame.
        """
        # Duck-typed DataFrame check. Requiring ``columns`` in addition to
        # ``iloc`` rejects a pandas Series (which has ``iloc`` but neither
        # ``columns`` nor ``select_dtypes``) with the documented ValueError
        # instead of an AttributeError further down.
        if not (hasattr(df, 'iloc') and hasattr(df, 'columns')):
            raise ValueError("make_column_selector can only be applied to "
                             "pandas dataframes")
        # Only dtypes and labels matter, so work on a one-row slice rather
        # than on the full frame.
        df_row = df.iloc[:1]
        if self.dtype_include is not None or self.dtype_exclude is not None:
            df_row = df_row.select_dtypes(include=self.dtype_include,
                                          exclude=self.dtype_exclude)
        cols = df_row.columns
        if self.pattern is not None:
            # ``contains`` is a regex search anywhere in the name, not a
            # full match.
            cols = cols[cols.str.contains(self.pattern, regex=True)]
        return cols.tolist()
88 changes: 87 additions & 1 deletion sklearn/compose/tests/test_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,23 @@
Test the ColumnTransformer.
"""
import re
import pickle

import warnings
import numpy as np
from scipy import sparse
import pytest

from numpy.testing import assert_allclose
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_almost_equal

from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.compose import (
ColumnTransformer, make_column_transformer, make_column_selector
)
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder
Expand Down Expand Up @@ -1180,3 +1184,85 @@ def test_column_transformer_mask_indexing(array_type):
)
X_trans = column_transformer.fit_transform(X)
assert X_trans.shape == (3, 2)


# The ``np.int`` / ``np.float`` / ``np.object`` aliases were plain aliases of
# the Python builtins and were removed in NumPy 1.24, so the builtins are used
# directly; the selected dtypes are unchanged.
@pytest.mark.parametrize('cols, pattern, include, exclude', [
    (['col_int', 'col_float'], None, np.number, None),
    (['col_int', 'col_float'], None, None, object),
    (['col_int', 'col_float'], None, [int, float], None),
    (['col_str'], None, [object], None),
    (['col_str'], None, object, None),
    (['col_float'], None, float, None),
    (['col_float'], 'at$', [np.number], None),
    (['col_int'], None, [int], None),
    (['col_int'], '^col_int', [np.number], None),
    (['col_float', 'col_str'], 'float|str', None, None),
    (['col_str'], '^col_s', None, [int]),
    # dtype and pattern criteria are both valid but match no common column
    ([], 'str$', float, None),
    (['col_int', 'col_float', 'col_str'], None, [np.number, object], None),
])
def test_make_column_selector_with_select_dtypes(cols, pattern, include,
                                                 exclude):
    """Check the columns selected for each dtype / pattern combination."""
    pd = pytest.importorskip('pandas')

    X_df = pd.DataFrame({
        'col_int': np.array([0, 1, 2], dtype=int),
        'col_float': np.array([0.0, 1.0, 2.0], dtype=float),
        'col_str': ["one", "two", "three"],
    }, columns=['col_int', 'col_float', 'col_str'])

    selector = make_column_selector(
        dtype_include=include, dtype_exclude=exclude, pattern=pattern)

    assert_array_equal(selector(X_df), cols)


def test_column_transformer_with_make_column_selector():
    """Selector-based and explicit column lists must transform identically."""
    # Functional test for column transformer + column selector
    pd = pytest.importorskip('pandas')
    # Builtin ``int`` / ``float`` replace the ``np.int`` / ``np.float``
    # aliases, which were removed in NumPy 1.24.
    X_df = pd.DataFrame({
        'col_int': np.array([0, 1, 2], dtype=int),
        'col_float': np.array([0.0, 1.0, 2.0], dtype=float),
        'col_cat': ["one", "two", "one"],
        'col_str': ["low", "middle", "high"]
    }, columns=['col_int', 'col_float', 'col_cat', 'col_str'])
    X_df['col_str'] = X_df['col_str'].astype('category')

    cat_selector = make_column_selector(dtype_include=['category', object])
    num_selector = make_column_selector(dtype_include=np.number)

    ohe = OneHotEncoder()
    scaler = StandardScaler()

    ct_selector = make_column_transformer((ohe, cat_selector),
                                          (scaler, num_selector))
    ct_direct = make_column_transformer((ohe, ['col_cat', 'col_str']),
                                        (scaler, ['col_float', 'col_int']))

    X_selector = ct_selector.fit_transform(X_df)
    X_direct = ct_direct.fit_transform(X_df)

    assert_allclose(X_selector, X_direct)


def test_make_column_selector_error():
    """Calling the selector on a plain ndarray must raise a ValueError."""
    not_a_frame = np.array([[0.1, 0.2]])
    number_selector = make_column_selector(dtype_include=np.number)
    expected = "make_column_selector can only be applied to pandas dataframes"
    with pytest.raises(ValueError, match=expected):
        number_selector(not_a_frame)


def test_make_column_selector_pickle():
    """A selector must select the same columns after a pickle round-trip."""
    pd = pytest.importorskip('pandas')

    # Builtin ``int`` / ``float`` replace the ``np.int`` / ``np.float``
    # aliases, which were removed in NumPy 1.24.
    X_df = pd.DataFrame({
        'col_int': np.array([0, 1, 2], dtype=int),
        'col_float': np.array([0.0, 1.0, 2.0], dtype=float),
        'col_str': ["one", "two", "three"],
    }, columns=['col_int', 'col_float', 'col_str'])

    selector = make_column_selector(dtype_include=[object])
    selector_pickled = pickle.loads(pickle.dumps(selector))

    assert_array_equal(selector(X_df), selector_pickled(X_df))
0