ColumnTransformer input feature name and count validation by adrinjalali · Pull Request #14544 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content
53 changes: 52 additions & 1 deletion sklearn/compose/_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
# Author: Andreas Mueller
# Joris Van den Bossche
# License: BSD

import warnings
from itertools import chain

import numbers
import numpy as np
from scipy import sparse
from joblib import Parallel, delayed
Expand Down Expand Up @@ -394,6 +395,34 @@ def _validate_output(self, result):
"The output of the '{0}' transformer should be 2D (scipy "
"matrix, array, or pandas DataFrame).".format(name))

def _validate_features(self, n_features, feature_names):
    """Check that transform-time columns agree with those seen during fit.

    Parameters
    ----------
    n_features : int
        Number of columns in the data passed to transform.
    feature_names : array-like or None
        Column names of the data passed to transform, or None when the
        input carries no column names.

    Raises
    ------
    RuntimeError
        If any entry of ``self._columns`` uses negative indexing and
        ``n_features`` differs from ``self._n_features``.

    Warns
    -----
    DeprecationWarning
        If the column count or the column names differ from the stored
        ``self._n_features`` / ``self._feature_names_in``.

    TODO: It should raise an error from v0.24
    """
    count_matches = self._n_features == n_features
    names_missing = self._feature_names_in is None or feature_names is None
    if names_missing and count_matches:
        # No names to compare on at least one side, and the widths agree.
        return

    # Negative positions resolve to different columns once the width
    # changes, so that combination is rejected outright.
    negative_flags = [_is_negative_indexing(col) for col in self._columns]
    if np.any(negative_flags) and not count_matches:
        raise RuntimeError("At least one negative column was used to "
                           "indicate columns, and the new data's number "
                           "of columns does not match the data given "
                           "during fit. "
                           "Please make sure the data during fit and "
                           "transform have the same number of columns.")

    # The name comparison only runs when the counts already agree.
    if not count_matches or np.any(
            self._feature_names_in != np.asarray(feature_names)):
        warnings.warn("Given feature/column names or counts do not match "
                      "the ones for the data given during fit. This will "
                      "fail from v0.24.",
                      DeprecationWarning)

def _log_message(self, name, idx, total):
if not self.verbose:
return None
Expand Down Expand Up @@ -470,6 +499,11 @@ def fit_transform(self, X, y=None):
sparse matrices.

"""
# TODO: this should be `feature_names_in_` when we start having it
if hasattr(X, "columns"):
self._feature_names_in = np.asarray(X.columns)
else:
self._feature_names_in = None
X = _check_X(X)
self._validate_transformers()
self._validate_column_callables(X)
Expand Down Expand Up @@ -518,6 +552,10 @@ def transform(self, X):
"""
check_is_fitted(self, 'transformers_')
X = _check_X(X)
if hasattr(X, "columns"):
X_feature_names = np.asarray(X.columns)
else:
X_feature_names = None

if self._n_features > X.shape[1]:
raise ValueError('Number of features of the input must be equal '
Expand All @@ -527,6 +565,8 @@ def transform(self, X):
.format(self._n_features, X.shape[1]))

# No column reordering allowed for named cols combined with remainder
# TODO: remove this mechanism in 0.24, once we enforce strict column
# name order and count. See #14237 for details.
if (self._remainder[2] is not None and
hasattr(self, '_df_columns') and
hasattr(X, 'columns')):
Expand All @@ -538,6 +578,7 @@ def transform(self, X):
'and for transform when using the '
'remainder keyword')

self._validate_features(X.shape[1], X_feature_names)
Xs = self._fit_transform(X, None, _transform_one, fitted=True)
self._validate_output(Xs)

Expand Down Expand Up @@ -707,3 +748,13 @@ def make_column_transformer(*transformers, **kwargs):
remainder=remainder,
sparse_threshold=sparse_threshold,
verbose=verbose)


def _is_negative_indexing(key):
    """Tell whether a column specifier uses a negative integer position.

    For a slice, only ``start`` and ``stop`` are inspected (``step`` is
    ignored). For keys that ``_check_key_type`` accepts as integer keys,
    any element below zero counts. Every other key yields False.
    """
    # TODO: remove in v0.24
    if isinstance(key, slice):
        start_negative = (isinstance(key.start, numbers.Integral)
                          and key.start < 0)
        return start_negative or (isinstance(key.stop, numbers.Integral)
                                  and key.stop < 0)
    if _check_key_type(key, int):
        return np.any(np.asarray(key) < 0)
    return False
61 changes: 59 additions & 2 deletions sklearn/compose/tests/test_column_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import re

import warnings
import numpy as np
from scipy import sparse
import pytest
Expand Down Expand Up @@ -498,7 +499,10 @@ def test_column_transformer_invalid_columns(remainder):
ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder)
ct.fit(X_array)
X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
ct.transform(X_array_more) # Should accept added columns
msg = ("Given feature/column names or counts do not match the ones for "
"the data given during fit.")
with pytest.warns(DeprecationWarning, match=msg):
ct.transform(X_array_more) # Should accept added columns, for now
X_array_fewer = np.array([[0, 1, 2], ]).T
err_msg = 'Number of features'
with pytest.raises(ValueError, match=err_msg):
Expand Down Expand Up @@ -1096,13 +1100,16 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname):

tf.fit(X_fit_df)
err_msg = 'Column ordering must be equal'
warn_msg = ("Given feature/column names or counts do not match the ones "
"for the data given during fit.")
with pytest.raises(ValueError, match=err_msg):
tf.transform(X_trans_df)

# No error for added columns if ordering is identical
X_extended_df = X_fit_df.copy()
X_extended_df['third'] = [3, 6, 9]
tf.transform(X_extended_df) # No error should be raised
with pytest.warns(DeprecationWarning, match=warn_msg):
tf.transform(X_extended_df) # No error should be raised, for now

# No 'columns' AttributeError when transform input is a numpy array
X_array = X_fit_array.copy()
Expand All @@ -1111,6 +1118,56 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname):
tf.transform(X_array)


def test_feature_name_validation():
"""Tests if the proper warning/error is raised if the columns do not match
during fit and transform."""
pd = pytest.importorskip("pandas")

X = np.ones(shape=(3, 2))
X_extra = np.ones(shape=(3, 3))
df = pd.DataFrame(X, columns=['a', 'b'])
df_extra = pd.DataFrame(X_extra, columns=['a', 'b', 'c'])

tf = ColumnTransformer([('bycol', Trans(), ['a', 'b'])])
tf.fit(df)

msg = ("Given feature/column names or counts do not match the ones for "
"the data given during fit.")
with pytest.warns(DeprecationWarning, match=msg):
tf.transform(df_extra)

tf = ColumnTransformer([('bycol', Trans(), [0])])
tf.fit(df)

with pytest.warns(DeprecationWarning, match=msg):
tf.transform(X_extra)

with warnings.catch_warnings(record=True) as warns:
tf.transform(X)
assert not warns

tf = ColumnTransformer([('bycol', Trans(), ['a'])],
remainder=Trans())
tf.fit(df)
with pytest.warns(DeprecationWarning, match=msg):
tf.transform(df_extra)

tf = ColumnTransformer([('bycol', Trans(), [0, -1])])
tf.fit(df)
msg = "At least one negative column was used to"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you also check that no warning is raised when n_features matches with negative indexing? Just in case.

with pytest.raises(RuntimeError, match=msg):
tf.transform(df_extra)

tf = ColumnTransformer([('bycol', Trans(), slice(-1, -3, -1))])
tf.fit(df)
with pytest.raises(RuntimeError, match=msg):
tf.transform(df_extra)

with warnings.catch_warnings(record=True) as warns:
tf.transform(df)
assert not warns


@pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix])
def test_column_transformer_mask_indexing(array_type):
# Regression test for #14510
Expand Down
0