scikit-learn · partmor · Jun 16, 2018 · Jun 16, 2018 · Jun 16, 2018 · Jun 16, 2018
diff --git a/examples/compose/column_selector_callables.py b/examples/compose/column_selector_callables.py
@@ -0,0 +1,80 @@
+"""
+=======================
+Select Column Functions
+=======================
+
+This example shows how to pass a column selector callable to
+:func:`sklearn.compose.make_column_transformer` instead of listing the
+columns explicitly.
+
+:func:`sklearn.compose.select_types` builds a callable that, given a pandas
+DataFrame, returns a boolean mask of the columns whose dtype is one of the
+requested dtypes. Here it routes the float columns of the Titanic dataset
+to an imputing and scaling pipeline, and the integer and string columns to
+a one-hot encoder, before fitting a logistic regression on the result.
+"""
+
+# Author: Pedro Morales <part.morales@gmail.com>
+#
+# License: BSD 3 clause
+
+from __future__ import print_function
+
+import numpy as np
+import pandas as pd
+
+from sklearn.compose import make_column_transformer, select_types
+from sklearn.pipeline import make_pipeline
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler, CategoricalEncoder
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+
+
+# Read data from Titanic dataset.
+titanic_url = ('https://raw.githubusercontent.com/amueller/'
+               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
+
+data = pd.read_csv(titanic_url)
+
+# Provisionally, use pd.fillna() to impute missing values for categorical
+# features; SimpleImputer will eventually support strategy="constant".
+data[['embarked', 'sex', 'pclass']] = data[
+    ['embarked', 'sex', 'pclass']].fillna(value='missing')
+
+
+# We create the preprocessing pipelines for both numeric and categorical data.
+numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
+categorical_transformer = CategoricalEncoder('onehot-dense',
+                                             handle_unknown='ignore')
+
+# We construct a column transformer, using dtype selectors. The integer
+# selector is np.int64 rather than the builtin int: NumPy may map the
+# builtin to a different integer width depending on the platform, while
+# pandas reads integer CSV columns as int64.
+preprocessing_pl = make_column_transformer(
+    (select_types([float]), numeric_transformer),
+    (select_types([np.int64, 'object']), categorical_transformer),
+    remainder='drop'
+)
+
+clf = make_pipeline(preprocessing_pl, LogisticRegression())
+
+np.random.seed(0)
+
+# We will train our classifier with the following features:
+# Numeric Features:
+# - age: float.
+# - fare: float.
+# Categorical Features:
+# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
+# - sex: categories encoded as strings {'female', 'male'}.
+# - pclass: ordinal integers {1, 2, 3}.
+X = data[['age', 'fare', 'embarked', 'sex', 'pclass']]
+y = data.survived.values
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
+                                                    shuffle=True)
+
+clf.fit(X_train, y_train)
+print("model score: %f" % clf.score(X_test, y_test))
diff --git a/sklearn/compose/__init__.py b/sklearn/compose/__init__.py
@@ -5,12 +5,17 @@
 
 """
 
-from ._column_transformer import ColumnTransformer, make_column_transformer
+from ._column_transformer import (
+    ColumnTransformer,
+    make_column_transformer,
+    select_types
+)
 from ._target import TransformedTargetRegressor
 
 
 __all__ = [
     'ColumnTransformer',
     'make_column_transformer',
+    'select_types',
     'TransformedTargetRegressor',
 ]
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -22,7 +22,7 @@
 from ..utils.validation import check_is_fitted
 
 
-__all__ = ['ColumnTransformer', 'make_column_transformer']
+__all__ = ['ColumnTransformer', 'make_column_transformer', 'select_types']
 
 
 _ERR_MSG_1DCOLUMN = ("1D data passed to a transformer that expects 2D data. "
@@ -514,6 +514,9 @@ def _get_column(X, key):
         if hasattr(X, 'loc'):
             # pandas boolean masks don't work with iloc, so take loc path
             column_names = True
+    elif callable(key) and hasattr(X, 'loc'):
+        # column selector callable
+        column_names = True
     else:
         raise ValueError("No valid specification of the columns. Only a "
                          "scalar, list or slice of all integers or all "
@@ -522,7 +525,10 @@ def _get_column(X, key):
     if column_names:
         if hasattr(X, 'loc'):
             # pandas dataframes
-            return X.loc[:, key]
+            if not callable(key):
+                return X.loc[:, key]
+            else:
+                return X.loc[:, key(X)]
         else:
             raise ValueError("Specifying the columns using strings is only "
                              "supported for pandas DataFrames")
@@ -578,6 +584,11 @@ def _get_column_indices(X, key):
     elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_):
         # boolean mask
         return list(np.arange(n_columns)[key])
+    elif callable(key):
+        if not hasattr(X, 'columns'):
+            raise ValueError("Using column selector callables is only "
+                             "supported for pandas DataFrames")
+        return list(np.arange(n_columns)[key(X)])
     else:
         raise ValueError("No valid specification of the columns. Only a "
                          "scalar, list or slice of all integers or all "
@@ -597,6 +608,81 @@ def _get_transformer_list(estimators):
     return transformer_list
 
 
+def select_types(dtypes):
+    """Generate a dtype-based column selector callable.
+
+    The returned callable can be passed to `make_column_transformer` or
+    `ColumnTransformer` in place of the column selections. When applied to
+    a dataframe, it returns a boolean mask flagging the columns whose dtype
+    equals one of ``dtypes``.
+
+    Parameters
+    ----------
+    dtypes : dtype specifier or list of dtype specifiers
+        Column dtypes to be selected from the dataset. Each specifier is
+        compared for equality against the ``dtypes`` of the dataframe, e.g.
+        ``float`` or ``'object'``. A single specifier is treated as a
+        one-element list.
+
+    Returns
+    -------
+    selector : callable
+        Function mapping a dataframe to a boolean ndarray of shape
+        (n_columns,). It raises a ValueError when applied to an object
+        without a ``dtypes`` attribute.
+
+    Notes
+    -----
+    Columns are matched by dtype equality, not by dtype family. The builtin
+    ``int`` stands for numpy's default integer type, whose width may differ
+    between platforms; pass an explicit dtype such as ``'int64'`` to select
+    a specific width.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({
+    ...     'a': [1.5, 2.5],
+    ...     'b': ['dog', 'cat'],
+    ...     'c': ['big', 'small'],
+    ...     'd': [1, 2]
+    ... })
+    >>> df.dtypes
+    a    float64
+    b     object
+    c     object
+    d      int64
+    dtype: object
+    >>> type_selector = select_types([float, 'object'])
+    >>> type_selector(df)
+    array([ True,  True,  True, False])
+    >>> select_types([])(df)
+    array([False, False, False, False])
+    """
+    if (isinstance(dtypes, (str, type, np.dtype))
+            or not hasattr(dtypes, '__iter__')):
+        # a lone specifier such as ``float`` or ``'object'``
+        dtypes = [dtypes]
+    else:
+        # snapshot, so the selector is reusable even if a one-shot iterable
+        # was passed or the caller mutates the list afterwards
+        dtypes = list(dtypes)
+
+    def type_selector(X):
+        if not hasattr(X, 'dtypes'):
+            raise ValueError("Column selectors created by select_types are "
+                             "only supported for pandas DataFrames")
+        column_dtypes = X.dtypes
+        # Start from an all-False mask so that an empty `dtypes` selects no
+        # column instead of collapsing to a scalar.
+        mask = np.zeros(len(column_dtypes), dtype=bool)
+        for dtype in dtypes:
+            mask |= np.asarray(column_dtypes == dtype, dtype=bool)
+        return mask
+
+    return type_selector
+
+
 def make_column_transformer(*transformers, **kwargs):
     """Construct a ColumnTransformer from the given transformers.