Open
Description
make_column_selector
is a great addition in 0.22, but one feature is missing: selection by cardinality. For example, we may want to one-hot encode only the categorical columns that have a low number of unique values and target-encode the rest.
This feature seems to require only a small addition to the code base:
class make_column_selector():
    """Callable that selects dataframe columns by name, dtype and cardinality.

    Parameters
    ----------
    pattern : str, default=None
        Regular expression; only columns whose name contains a match are
        kept. ``None`` disables name filtering.
    dtype_include : dtype or list of dtypes, default=None
        Forwarded to ``DataFrame.select_dtypes`` as ``include``.
    dtype_exclude : dtype or list of dtypes, default=None
        Forwarded to ``DataFrame.select_dtypes`` as ``exclude``.
    cardinality : {'high', 'low'}, default=None
        ``'high'`` keeps columns with more than ``cardinality_treshold``
        unique values, ``'low'`` keeps columns with at most that many.
        ``None`` disables cardinality filtering.
    cardinality_treshold : int, default=7
        Boundary on the number of unique values used by ``cardinality``.
        (The spelling is kept as-is for interface compatibility.)
    """

    def __init__(self, pattern=None, dtype_include=None, dtype_exclude=None,
                 cardinality=None, cardinality_treshold=7):
        self.pattern = pattern
        self.dtype_include = dtype_include
        self.dtype_exclude = dtype_exclude
        # These two were accepted but never stored, so __call__ failed with
        # AttributeError as soon as it read self.cardinality.
        self.cardinality = cardinality
        self.cardinality_treshold = cardinality_treshold

    def __call__(self, df):
        """Return the list of column names of ``df`` matching the criteria.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataframe to select columns from.

        Returns
        -------
        cols : list
            Selected column names, in dataframe column order.

        Raises
        ------
        ValueError
            If ``df`` has no ``iloc`` attribute, or if ``cardinality`` is
            neither ``None``, ``'high'`` nor ``'low'``.
        """
        if not hasattr(df, 'iloc'):
            raise ValueError("make_column_selector can only be applied to "
                             "pandas dataframes")
        if self.cardinality is not None and \
                self.cardinality not in ('high', 'low'):
            raise ValueError("cardinality must be None, 'high' or 'low'; "
                             "got %r" % (self.cardinality,))

        # dtype and name filtering only need the schema, so work on one row.
        df_row = df.iloc[:1]
        if self.dtype_include is not None or self.dtype_exclude is not None:
            df_row = df_row.select_dtypes(include=self.dtype_include,
                                          exclude=self.dtype_exclude)
        cols = df_row.columns
        if self.pattern is not None:
            cols = cols[cols.str.contains(self.pattern, regex=True)]
        cols = list(cols)

        if self.cardinality is not None:
            # Cardinality needs the full data, not just the first row.
            cols_cardinality = dict(df[cols].nunique())
            threshold = self.cardinality_treshold
            if self.cardinality == 'high':
                cols = [c for c in cols_cardinality
                        if cols_cardinality[c] > threshold]
            else:
                cols = [c for c in cols_cardinality
                        if cols_cardinality[c] <= threshold]
        return cols