-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
[WIP] Column selector functions for ColumnTransformer #11301
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0a7749b
75daae1
b4081ee
c8e6fd7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
""" | ||
======================= | ||
Select Column Functions | ||
======================= | ||
|
||
TODO | ||
""" | ||
|
||
# Author: Pedro Morales <part.morales@gmail.com> | ||
# | ||
# License: BSD 3 clause | ||
|
||
from __future__ import print_function | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
from sklearn.compose import make_column_transformer, select_types | ||
from sklearn.pipeline import make_pipeline | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.preprocessing import StandardScaler, CategoricalEncoder | ||
from sklearn.linear_model import LogisticRegression | ||
from sklearn.model_selection import train_test_split | ||
|
||
|
||
# Load the Titanic passenger table straight from its pinned GitHub revision.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)

# Missing categorical entries are replaced with a placeholder label by pandas
# for now; SimpleImputer is expected to gain strategy="constant" later on.
categorical_columns = ['embarked', 'sex', 'pclass']
data[categorical_columns] = data[categorical_columns].fillna(value='missing')

# One preprocessing recipe per kind of column:
#  - numeric columns are imputed and then standardized,
#  - categorical columns are one-hot encoded (unknown categories ignored).
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
categorical_transformer = CategoricalEncoder('onehot-dense',
                                             handle_unknown='ignore')

# Columns are routed to each recipe by dtype rather than by name; columns
# whose dtype matches neither selector are dropped.
preprocessing_pl = make_column_transformer(
    (select_types([float]), numeric_transformer),
    (select_types([int, 'object']), categorical_transformer),
    remainder='drop'
)

# Full model: dtype-driven preprocessing followed by a linear classifier.
clf = make_pipeline(preprocessing_pl, LogisticRegression())

# Seed the global NumPy generator so the shuffled split below is repeatable.
np.random.seed(0)

# Features used for training:
#   Numeric:
#     - age: float.
#     - fare: float.
#   Categorical:
#     - embarked: categories encoded as strings {'C', 'S', 'Q'}.
#     - sex: categories encoded as strings {'female', 'male'}.
#     - pclass: ordinal integers {1, 2, 3}.
numeric_columns = ['age', 'fare']
X = data[numeric_columns + categorical_columns]
y = data['survived'].values

# Hold out 20% of the passengers for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True)

clf.fit(X_train, y_train)
print("model score: %f" % clf.score(X_test, y_test))
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,7 +22,7 @@ | |
from ..utils.validation import check_is_fitted | ||
|
||
|
||
__all__ = ['ColumnTransformer', 'make_column_transformer'] | ||
__all__ = ['ColumnTransformer', 'make_column_transformer', 'select_types'] | ||
|
||
|
||
_ERR_MSG_1DCOLUMN = ("1D data passed to a transformer that expects 2D data. " | ||
|
@@ -514,6 +514,9 @@ def _get_column(X, key): | |
if hasattr(X, 'loc'): | ||
# pandas boolean masks don't work with iloc, so take loc path | ||
column_names = True | ||
elif callable(key) and hasattr(X, 'loc'): | ||
# column selector callable | ||
column_names = True | ||
else: | ||
raise ValueError("No valid specification of the columns. Only a " | ||
"scalar, list or slice of all integers or all " | ||
|
@@ -522,7 +525,10 @@ def _get_column(X, key): | |
if column_names: | ||
if hasattr(X, 'loc'): | ||
# pandas dataframes | ||
return X.loc[:, key] | ||
if not callable(key): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can't we do this before the ifs, setting key = key(X)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, will do |
||
return X.loc[:, key] | ||
else: | ||
return X.loc[:, key(X)] | ||
else: | ||
raise ValueError("Specifying the columns using strings is only " | ||
"supported for pandas DataFrames") | ||
|
@@ -578,6 +584,11 @@ def _get_column_indices(X, key): | |
elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_): | ||
# boolean mask | ||
return list(np.arange(n_columns)[key]) | ||
elif callable(key): | ||
if not hasattr(X, 'columns'): | ||
raise ValueError("Using column selector callables is only " | ||
"supported for pandas DataFrames") | ||
return list(np.arange(n_columns)[key(X)]) | ||
else: | ||
raise ValueError("No valid specification of the columns. Only a " | ||
"scalar, list or slice of all integers or all " | ||
|
@@ -597,6 +608,50 @@ def _get_transformer_list(estimators): | |
return transformer_list | ||
|
||
|
||
class _DtypeSelector(object):
    """Picklable callable returning a boolean column mask based on dtypes.

    Implemented as a module-level class rather than a closure or lambda so
    that estimators holding a selector can be pickled and cloned.

    Parameters
    ----------
    dtypes : dtype specifier or iterable of dtype specifiers
        Anything that can be compared against the entries of
        ``DataFrame.dtypes`` with ``==`` (e.g. ``float``, ``int``,
        ``'object'``, ``np.dtype('int64')``).
    """

    def __init__(self, dtypes):
        # A bare specifier such as 'object' or ``float`` is wrapped in a
        # list; otherwise a string would be iterated character by character.
        if isinstance(dtypes, (str, type, np.dtype)):
            dtypes = [dtypes]
        # Materialize once so one-shot iterables can be reused on every call.
        self.dtypes = list(dtypes)

    def __call__(self, X):
        """Return a boolean mask flagging the columns of X to select.

        Parameters
        ----------
        X : pandas DataFrame
            Data whose column dtypes are matched against ``self.dtypes``.

        Returns
        -------
        mask : ndarray of bool
            One entry per column; True where the column dtype equals at
            least one of the requested dtypes.

        Raises
        ------
        ValueError
            If X does not expose a ``dtypes`` attribute.
        """
        if not hasattr(X, 'dtypes'):
            raise ValueError("Using column selector callables is only "
                             "supported for pandas DataFrames")

        column_dtypes = X.dtypes
        masks = [np.asarray(column_dtypes == dtype, dtype=bool)
                 for dtype in self.dtypes]
        if not masks:
            # Nothing requested: select no column, but keep the mask shape.
            return np.zeros(len(column_dtypes), dtype=bool)
        # A column is kept as soon as one requested dtype matches it.
        return np.any(masks, axis=0)

    def __repr__(self):
        return 'select_types(%r)' % (self.dtypes,)


def select_types(dtypes):
    """Create a callable that selects DataFrame columns by dtype.

    The returned callable can be passed to `make_column_transformer` or
    `ColumnTransformer` in place of an explicit column selection. When
    applied to a dataframe it returns a boolean mask with one entry per
    column. The callable can be pickled.

    Parameters
    ----------
    dtypes : list of dtype specifiers, or a single dtype specifier
        Column dtypes to be selected from the dataset. Each entry is
        compared for equality against the dataframe's column dtypes.

    Returns
    -------
    selector : callable
        Callable taking a pandas DataFrame and returning a boolean ndarray
        that is True for every column whose dtype equals one of `dtypes`.
        It raises ValueError when called on an object without a ``dtypes``
        attribute.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'a': [1.5, 2.5],
    ...     'b': ['dog', 'cat'],
    ...     'c': ['big', 'small'],
    ...     'd': [1, 2]
    ... })
    >>> df.dtypes
    a    float64
    b     object
    c     object
    d      int64
    dtype: object
    >>> type_selector = select_types([int, float])
    >>> type_selector(df)
    array([ True, False, False,  True])
    """
    return _DtypeSelector(dtypes)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We cannot pickle closures. Please implement this as a class with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep; also using a class with |
||
|
||
|
||
def make_column_transformer(*transformers, **kwargs): | ||
"""Construct a ColumnTransformer from the given transformers. | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the existing examples would benefit from this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will drop this new example then, and enhance the already existing one for
ColumnTransformer
.