FIX ColumnTransformer: raise error on reordered columns with remainde… · scikit-learn/scikit-learn@9115ab0 · GitHub
[go: up one dir, main page]

Skip to content

Commit 9115ab0

Browse files
schuderer authored and
adrinjalali committed
FIX ColumnTransformer: raise error on reordered columns with remainder (#14237)
* FIX Raise error on reordered columns in ColumnTransformer with remainder * FIX Check for different length of X.columns to avoid exception * FIX linter, line too long * FIX import _check_key_type from its new location utils * ENH Adjust doc, allow added columns * Fix comment typo as suggested, remove non-essential exposition in doc * Add PR 14237 to what's new * Avoid AttributeError in favor of ValueError "column names only for DF" * ENH Add check for n_features_ for array-likes and DataFrames * Rename self.n_features to self._n_features * Replaced backslash line continuation with parenthesis * Style changes
1 parent 9dfa49a commit 9115ab0

File tree

3 files changed

+88
-3
lines changed

3 files changed

+88
-3
lines changed

doc/whats_new/v0.21.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,15 @@ Changelog
7474
``tol`` required too strict types. :pr:`14092` by
7575
:user:`Jérémie du Boisberranger <jeremiedbb>`.
7676

77+
:mod:`sklearn.compose`
78+
......................
79+
80+
- |Fix| Fixed an issue in :class:`compose.ColumnTransformer` where using
81+
DataFrames whose column order differs between :func:`fit` and
82+
:func:`transform` could lead to silently passing incorrect columns to the
83+
``remainder`` transformer.
84+
:pr:`14237` by :user:`Andreas Schuderer <schuderer>`.
85+
7786
.. _changes_0_21_2:
7887

7988
Version 0.21.2

sklearn/compose/_column_transformer.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from ..utils import Bunch
2020
from ..utils import safe_indexing
2121
from ..utils import _get_column_indices
22+
from ..utils import _check_key_type
2223
from ..utils.metaestimators import _BaseComposition
2324
from ..utils.validation import check_array, check_is_fitted
2425

@@ -80,6 +81,8 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
8081
By setting ``remainder`` to be an estimator, the remaining
8182
non-specified columns will use the ``remainder`` estimator. The
8283
estimator must support :term:`fit` and :term:`transform`.
84+
Note that using this feature requires that the DataFrame columns
85+
input at :term:`fit` and :term:`transform` have identical order.
8386
8487
sparse_threshold : float, default = 0.3
8588
If the output of the different transformers contains sparse matrices,
@@ -303,11 +306,17 @@ def _validate_remainder(self, X):
303306
"'passthrough', or estimator. '%s' was passed instead" %
304307
self.remainder)
305308

306-
n_columns = X.shape[1]
309+
# Make it possible to check for reordered named columns on transform
310+
if (hasattr(X, 'columns') and
311+
any(_check_key_type(cols, str) for cols in self._columns)):
312+
self._df_columns = X.columns
313+
314+
self._n_features = X.shape[1]
307315
cols = []
308316
for columns in self._columns:
309317
cols.extend(_get_column_indices(X, columns))
310-
remaining_idx = sorted(list(set(range(n_columns)) - set(cols))) or None
318+
remaining_idx = list(set(range(self._n_features)) - set(cols))
319+
remaining_idx = sorted(remaining_idx) or None
311320

312321
self._remainder = ('remainder', self.remainder, remaining_idx)
313322

@@ -508,8 +517,27 @@ def transform(self, X):
508517
509518
"""
510519
check_is_fitted(self, 'transformers_')
511-
512520
X = _check_X(X)
521+
522+
if self._n_features > X.shape[1]:
523+
raise ValueError('Number of features of the input must be equal '
524+
'to or greater than that of the fitted '
525+
'transformer. Transformer n_features is {0} '
526+
'and input n_features is {1}.'
527+
.format(self._n_features, X.shape[1]))
528+
529+
# No column reordering allowed for named cols combined with remainder
530+
if (self._remainder[2] is not None and
531+
hasattr(self, '_df_columns') and
532+
hasattr(X, 'columns')):
533+
n_cols_fit = len(self._df_columns)
534+
n_cols_transform = len(X.columns)
535+
if (n_cols_transform >= n_cols_fit and
536+
any(X.columns[:n_cols_fit] != self._df_columns)):
537+
raise ValueError('Column ordering must be equal for fit '
538+
'and for transform when using the '
539+
'remainder keyword')
540+
513541
Xs = self._fit_transform(X, None, _transform_one, fitted=True)
514542
self._validate_output(Xs)
515543

sklearn/compose/tests/test_column_transformer.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,17 @@ def test_column_transformer_invalid_columns(remainder):
492492
assert_raise_message(ValueError, "Specifying the columns",
493493
ct.fit, X_array)
494494

495+
# transformed n_features does not match fitted n_features
496+
col = [0, 1]
497+
ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder)
498+
ct.fit(X_array)
499+
X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
500+
ct.transform(X_array_more) # Should accept added columns
501+
X_array_fewer = np.array([[0, 1, 2], ]).T
502+
err_msg = 'Number of features'
503+
with pytest.raises(ValueError, match=err_msg):
504+
ct.transform(X_array_fewer)
505+
495506

496507
def test_column_transformer_invalid_transformer():
497508

@@ -1060,3 +1071,40 @@ def test_column_transformer_negative_column_indexes():
10601071
tf_1 = ColumnTransformer([('ohe', ohe, [-1])], remainder='passthrough')
10611072
tf_2 = ColumnTransformer([('ohe', ohe, [2])], remainder='passthrough')
10621073
assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
1074+
1075+
1076+
@pytest.mark.parametrize("explicit_colname", ['first', 'second'])
1077+
def test_column_transformer_reordered_column_names_remainder(explicit_colname):
1078+
"""Regression test for issue #14223: 'Named col indexing fails with
1079+
ColumnTransformer remainder on changing DataFrame column ordering'
1080+
1081+
Should raise error on changed order combined with remainder.
1082+
Should allow for added columns in `transform` input DataFrame
1083+
as long as all preceding columns match.
1084+
"""
1085+
pd = pytest.importorskip('pandas')
1086+
1087+
X_fit_array = np.array([[0, 1, 2], [2, 4, 6]]).T
1088+
X_fit_df = pd.DataFrame(X_fit_array, columns=['first', 'second'])
1089+
1090+
X_trans_array = np.array([[2, 4, 6], [0, 1, 2]]).T
1091+
X_trans_df = pd.DataFrame(X_trans_array, columns=['second', 'first'])
1092+
1093+
tf = ColumnTransformer([('bycol', Trans(), explicit_colname)],
1094+
remainder=Trans())
1095+
1096+
tf.fit(X_fit_df)
1097+
err_msg = 'Column ordering must be equal'
1098+
with pytest.raises(ValueError, match=err_msg):
1099+
tf.transform(X_trans_df)
1100+
1101+
# No error for added columns if ordering is identical
1102+
X_extended_df = X_fit_df.copy()
1103+
X_extended_df['third'] = [3, 6, 9]
1104+
tf.transform(X_extended_df) # No error should be raised
1105+
1106+
# No 'columns' AttributeError when transform input is a numpy array
1107+
X_array = X_fit_array.copy()
1108+
err_msg = 'Specifying the columns'
1109+
with pytest.raises(ValueError, match=err_msg):
1110+
tf.transform(X_array)

0 commit comments

Comments
 (0)
0