[WIP] Column selector functions for ColumnTransformer by partmor · Pull Request #11301 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[WIP] Column selector functions for ColumnTransformer #11301

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions examples/compose/column_selector_callables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
=======================
Select Column Functions
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the existing examples would benefit from this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will drop this new example then, and enhance the already existing one for ColumnTransformer.

=======================

TODO
"""

# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause

from __future__ import print_function

import numpy as np
import pandas as pd

from sklearn.compose import make_column_transformer, select_types
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, CategoricalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


# Load the Titanic passenger table from a pinned revision of the
# scipy-2017-sklearn tutorial repository.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)

# Provisional workaround: missing values in the categorical columns are
# replaced with the literal string 'missing' through DataFrame.fillna,
# until SimpleImputer supports strategy="constant".
categorical_columns = ['embarked', 'sex', 'pclass']
data[categorical_columns] = data[categorical_columns].fillna(value='missing')


# One preprocessing pipeline per kind of feature: numeric columns are
# imputed and then standardised, categorical columns are one-hot encoded
# (categories unseen during fit are ignored at transform time).
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
categorical_transformer = CategoricalEncoder(
    'onehot-dense', handle_unknown='ignore')

# Columns are routed to each pipeline by dtype rather than by name:
# float columns go to the numeric pipeline, int and object columns go to
# the categorical one. Anything not selected is dropped.
numeric_selector = select_types([float])
categorical_selector = select_types([int, 'object'])
preprocessing_pl = make_column_transformer(
    (numeric_selector, numeric_transformer),
    (categorical_selector, categorical_transformer),
    remainder='drop'
)

# Full model: preprocessing followed by a logistic regression.
clf = make_pipeline(preprocessing_pl, LogisticRegression())

# Seed NumPy's global random state before the shuffled split below.
np.random.seed(0)

# Features used for training:
#   numeric
#     - age: float.
#     - fare: float.
#   categorical
#     - embarked: categories encoded as strings {'C', 'S', 'Q'}.
#     - sex: categories encoded as strings {'female', 'male'}.
#     - pclass: ordinal integers {1, 2, 3}.
feature_columns = ['age', 'fare', 'embarked', 'sex', 'pclass']
X = data[feature_columns]
y = data.survived.values

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True)

clf.fit(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print("model score: %f" % test_accuracy)
9 changes: 7 additions & 2 deletions sklearn/compose/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,17 @@

"""

from ._column_transformer import ColumnTransformer, make_column_transformer
from ._column_transformer import (
ColumnTransformer,
make_column_transformer,
select_types
)
from ._target import TransformedTargetRegressor


__all__ = [
'ColumnTransformer',
'make_column_transformer',
'TransformedTargetRegressor',
'select_types',
'TransformedTargetRegressor'
]
59 changes: 57 additions & 2 deletions sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from ..utils.validation import check_is_fitted


__all__ = ['ColumnTransformer', 'make_column_transformer']
__all__ = ['ColumnTransformer', 'make_column_transformer', 'select_types']


_ERR_MSG_1DCOLUMN = ("1D data passed to a transformer that expects 2D data. "
Expand Down Expand Up @@ -514,6 +514,9 @@ def _get_column(X, key):
if hasattr(X, 'loc'):
# pandas boolean masks don't work with iloc, so take loc path
column_names = True
elif callable(key) and hasattr(X, 'loc'):
# column selector callable
column_names = True
else:
raise ValueError("No valid specification of the columns. Only a "
"scalar, list or slice of all integers or all "
Expand All @@ -522,7 +525,10 @@ def _get_column(X, key):
if column_names:
if hasattr(X, 'loc'):
# pandas dataframes
return X.loc[:, key]
if not callable(key):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we do this before the ifs, setting key = key(X)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, will do

return X.loc[:, key]
else:
return X.loc[:, key(X)]
else:
raise ValueError("Specifying the columns using strings is only "
"supported for pandas DataFrames")
Expand Down Expand Up @@ -578,6 +584,11 @@ def _get_column_indices(X, key):
elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_):
# boolean mask
return list(np.arange(n_columns)[key])
elif callable(key):
if not hasattr(X, 'columns'):
raise ValueError("Using column selector callables is only "
"supported for pandas DataFrames")
return list(np.arange(n_columns)[key(X)])
else:
raise ValueError("No valid specification of the columns. Only a "
"scalar, list or slice of all integers or all "
Expand All @@ -597,6 +608,50 @@ def _get_transformer_list(estimators):
return transformer_list


def select_types(dtypes):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add to doc/modules/classes.rst

"""Generate a column selector callable (type-based) to be passed to
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pep257: a short summary should be alone on the first line

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it.

`make_column_transformer` or `ColumnTransformer` in place of the column
selections. The returned callable will return a boolean mask for column
selection when applied to a dataframe.

Parameters
----------
dtypes : list of column dtypes to be selected from the dataset.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Numpydoc: type on this line, semantics on the next

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should make the same factory also able to select columns by name. Please add a param to do so and rename the function.


Returns
-------
callable

Examples
--------
>>> import pandas as pd
>>> df = pd.DataFrame({
... 'a': [1.5, 2.5],
... 'b': ['dog', 'cat'],
... 'c': ['big', 'small'],
... 'd': [1, 2]
... })
>>> df.dtypes
a float64
b object
c object
d int64
dtype: object
>>> type_selector = select_types([int, float])
>>> type_selector(df)
array([ True, False, False, True])
"""
def apply_dtype_mask(X, dtype):
if hasattr(X, 'dtypes'):
return X.dtypes == dtype
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we want to use issubdtype

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I am returning a boolean mask, issubdtype can't help me here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well you can use np.asarray([issubdtype(xtype, dtypes) for xtype in X.dtypes], dtype=bool)

Copy link
Contributor Author
@partmor partmor Jun 18, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. Not to be insistent, but do we prefer np.issubdtype over using == directly, for consistency within the module?


def get_dtype_masks(X):
masks = [apply_dtype_mask(X, t) for t in dtypes]
return masks

return lambda X: np.any(get_dtype_masks(X), axis=0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot pickle closures. Please implement this as a class with __call__ instead

Copy link
Contributor Author
@partmor partmor Jun 17, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep; using a class with __call__ will also make the code a lot cleaner.



def make_column_transformer(*transformers, **kwargs):
"""Construct a ColumnTransformer from the given transformers.

Expand Down