ColumnTransformer input feature name and count validation by adrinjalali · Pull Request #14544 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ColumnTransformer input feature name and count validation #14544

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Aug 7, 2019
53 changes: 52 additions & 1 deletion sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
# Author: Andreas Mueller
# Joris Van den Bossche
# License: BSD

import warnings
from itertools import chain

import numbers
import numpy as np
from scipy import sparse
from joblib import Parallel, delayed
Expand Down Expand Up @@ -394,6 +395,34 @@ def _validate_output(self, result):
"The output of the '{0}' transformer should be 2D (scipy "
"matrix, array, or pandas DataFrame).".format(name))

def _validate_features(self, n_features, feature_names):
    """Ensure feature counts and names are the same during fit and transform.

    The incoming column count is compared with ``self._n_features`` and,
    when both sides carry names, the incoming names are compared
    element-wise with ``self._feature_names_in``.

    Parameters
    ----------
    n_features : int
        Number of columns of the data given to transform.
    feature_names : array-like or None
        Column names of the data given to transform, or None when the
        input has no column names.

    Raises
    ------
    RuntimeError
        If the column count differs from ``self._n_features`` and at least
        one entry of ``self._columns`` uses negative indexing.

    Warns
    -----
    DeprecationWarning
        If the column count differs, or the counts agree but the names
        do not.

    TODO: It should raise an error from v0.24
    """
    if self._n_features != n_features:
        # Only scan the column specifications when the count differs; this
        # is the sole case in which negative indexing is rejected.
        # NOTE(review): presumably because a negative index is resolved
        # relative to the number of columns -- confirm.
        if any(_is_negative_indexing(col) for col in self._columns):
            raise RuntimeError("At least one negative column was used to "
                               "indicate columns, and the new data's number "
                               "of columns does not match the data given "
                               "during fit. "
                               "Please make sure the data during fit and "
                               "transform have the same number of columns.")
        mismatch = True
    elif self._feature_names_in is None or feature_names is None:
        # Names are unavailable on at least one side, so a matching count
        # is all that can be verified.
        mismatch = False
    else:
        # Counts agree here, so both name arrays have the same length and
        # the element-wise comparison is well defined.
        mismatch = bool(
            np.any(self._feature_names_in != np.asarray(feature_names)))

    if mismatch:
        warnings.warn("Given feature/column names or counts do not match "
                      "the ones for the data given during fit. This will "
                      "fail from v0.24.",
                      DeprecationWarning)

def _log_message(self, name, idx, total):
if not self.verbose:
return None
Expand Down Expand Up @@ -470,6 +499,11 @@ def fit_transform(self, X, y=None):
sparse matrices.

"""
# TODO: this should be `feature_names_in_` when we start having it
if hasattr(X, "columns"):
self._feature_names_in = np.asarray(X.columns)
else:
self._feature_names_in = None
X = _check_X(X)
self._validate_transformers()
self._validate_column_callables(X)
Expand Down Expand Up @@ -518,6 +552,10 @@ def transform(self, X):
"""
check_is_fitted(self, 'transformers_')
X = _check_X(X)
if hasattr(X, "columns"):
X_feature_names = np.asarray(X.columns)
else:
X_feature_names = None

if self._n_features > X.shape[1]:
raise ValueError('Number of features of the input must be equal '
Expand All @@ -527,6 +565,8 @@ def transform(self, X):
.format(self._n_features, X.shape[1]))

# No column reordering allowed for named cols combined with remainder
# TODO: remove this mechanism in 0.24, once we enforce strict column
# name order and count. See #14237 for details.
if (self._remainder[2] is not None and
hasattr(self, '_df_columns') and
hasattr(X, 'columns')):
Expand All @@ -538,6 +578,7 @@ def transform(self, X):
'and for transform when using the '
'remainder keyword')

self._validate_features(X.shape[1], X_feature_names)
Xs = self._fit_transform(X, None, _transform_one, fitted=True)
self._validate_output(Xs)

Expand Down Expand Up @@ -707,3 +748,13 @@ def make_column_transformer(*transformers, **kwargs):
remainder=remainder,
10000 sparse_threshold=sparse_threshold,
verbose=verbose)


def _is_negative_indexing(key):
    """Return True if ``key`` selects columns through a negative integer.

    For a slice, only ``start`` and ``stop`` are inspected (``step`` is
    ignored). For keys accepted by ``_check_key_type(key, int)`` the key is
    converted to an array and any negative element counts. Every other key
    yields False.

    Parameters
    ----------
    key : object
        A column specification.
        NOTE(review): presumably a scalar, list, array or slice as accepted
        by ColumnTransformer -- confirm against the caller.

    Returns
    -------
    bool
    """
    # TODO: remove in v0.24
    def is_neg(x):
        return isinstance(x, numbers.Integral) and x < 0

    if isinstance(key, slice):
        return is_neg(key.start) or is_neg(key.stop)
    if _check_key_type(key, int):
        # Cast so that every branch returns a plain bool instead of
        # numpy.bool_ on this path only.
        return bool(np.any(np.asarray(key) < 0))
    return False
61 changes: 59 additions & 2 deletions sklearn/compose/tests/test_column_transformer.py
8000
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import re

import warnings
import numpy as np
from scipy import sparse
import pytest
Expand Down Expand Up @@ -498,7 +499,10 @@ def test_column_transformer_invalid_columns(remainder):
ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder)
ct.fit(X_array)
X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
ct.transform(X_array_more) # Should accept added columns
msg = ("Given feature/column names or counts do not match the ones for "
"the data given during fit.")
with pytest.warns(DeprecationWarning, match=msg):
ct.transform(X_array_more) # Should accept added columns, for now
X_array_fewer = np.array([[0, 1, 2], ]).T
err_msg = 'Number of features'
with pytest.raises(ValueError, match=err_msg):
Expand Down Expand Up @@ -1096,13 +1100,16 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname):

tf.fit(X_fit_df)
err_msg = 'Column ordering must be equal'
warn_msg = ("Given feature/column names or counts do not match the ones "
"for the data given during fit.")
with pytest.raises(ValueError, match=err_msg):
tf.transform(X_trans_df)

# No error for added columns if ordering is identical
X_extended_df = X_fit_df.copy()
X_extended_df['third'] = [3, 6, 9]
tf.transform(X_extended_df) # No error should be raised
with pytest.warns(DeprecationWarning, match=warn_msg):
tf.transform(X_extended_df) # No error should be raised, for now

# No 'columns' AttributeError when transform input is a numpy array
X_array = X_fit_array.copy()
Expand All @@ -1111,6 +1118,56 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname):
tf.transform(X_array)


def test_feature_name_validation():
    """Tests if the proper warning/error is raised if the columns do not match
    during fit and transform."""
    pd = pytest.importorskip("pandas")

    X = np.ones(shape=(3, 2))
    X_extra = np.ones(shape=(3, 3))
    df = pd.DataFrame(X, columns=['a', 'b'])
    df_extra = pd.DataFrame(X_extra, columns=['a', 'b', 'c'])

    # Columns selected by name: an extra column only warns.
    tf = ColumnTransformer([('bycol', Trans(), ['a', 'b'])])
    tf.fit(df)

    msg = ("Given feature/column names or counts do not match the ones for "
           "the data given during fit.")
    with pytest.warns(DeprecationWarning, match=msg):
        tf.transform(df_extra)

    # Columns selected by non-negative position: an extra column only warns.
    tf = ColumnTransformer([('bycol', Trans(), [0])])
    tf.fit(df)

    with pytest.warns(DeprecationWarning, match=msg):
        tf.transform(X_extra)

    with warnings.catch_warnings(record=True) as warns:
        # Force DeprecationWarning to be recorded so the emptiness check
        # below cannot pass merely because an outer filter hides it.
        warnings.simplefilter("always", DeprecationWarning)
        tf.transform(X)
    assert not warns

    # Same check with a remainder transformer configured.
    tf = ColumnTransformer([('bycol', Trans(), ['a'])],
                           remainder=Trans())
    tf.fit(df)
    with pytest.warns(DeprecationWarning, match=msg):
        tf.transform(df_extra)

    # Negative positions combined with a changed column count must raise.
    tf = ColumnTransformer([('bycol', Trans(), [0, -1])])
    tf.fit(df)
    msg = "At least one negative column was used to"
    with pytest.raises(RuntimeError, match=msg):
        tf.transform(df_extra)

    tf = ColumnTransformer([('bycol', Trans(), slice(-1, -3, -1))])
    tf.fit(df)
    with pytest.raises(RuntimeError, match=msg):
        tf.transform(df_extra)

    # Requested in review of this change: also check that no warning is
    # raised when n_features matches with negative indexing.
    with warnings.catch_warnings(record=True) as warns:
        warnings.simplefilter("always", DeprecationWarning)
        tf.transform(df)
    assert not warns


@pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix])
def test_column_transformer_mask_indexing(array_type):
# Regression test for #14510
Expand Down
0