FIX Enforce strict column name order/count in ColumnTransformer (#18256) · franslarsson/scikit-learn@313568b · GitHub
[go: up one dir, main page]

Skip to content

Commit 313568b

Browse files
madhuracj and glemaitre authored
FIX Enforce strict column name order/count in ColumnTransformer (scikit-learn#18256)
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent 1e53ae2 commit 313568b

File tree

3 files changed

+32
-96
lines changed

3 files changed

+32
-96
lines changed

doc/whats_new/v0.24.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@ Changelog
111111
- |Fix| :class:`compose.ColumnTransformer` now displays the remainder in the
112112
diagram display. :pr:`18167` by `Thomas Fan`_.
113113

114+
- |Fix| :class:`compose.ColumnTransformer` enforces strict count and order
115+
of column names between `fit` and `transform` by raising an error instead
116+
of a warning, following the deprecation cycle.
117+
:pr:`18256` by :user:`Madhura Jayratne <madhuracj>`.
118+
114119
:mod:`sklearn.covariance`
115120
.........................
116121

sklearn/compose/_column_transformer.py

Lines changed: 8 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,8 @@
66
# Author: Andreas Mueller
77
# Joris Van den Bossche
88
# License: BSD
9-
import warnings
109
from itertools import chain
1110

12-
import numbers
1311
import numpy as np
1412
from scipy import sparse
1513
from joblib import Parallel
@@ -418,34 +416,6 @@ def _validate_output(self, result):
418416
"The output of the '{0}' transformer should be 2D (scipy "
419417
"matrix, array, or pandas DataFrame).".format(name))
420418

421-
def _validate_features(self, n_features, feature_names):
422-
"""Ensures feature counts and names are the same during fit and
423-
transform.
424-
425-
TODO: It should raise an error from v0.24
426-
"""
427-
428-
if ((self._feature_names_in is None or feature_names is None)
429-
and self._n_features == n_features):
430-
return
431-
432-
neg_col_present = np.any([_is_negative_indexing(col)
433-
for col in self._columns])
434-
if neg_col_present and self._n_features != n_features:
435-
raise RuntimeError("At least one negative column was used to "
436-
"indicate columns, and the new data's number "
437-
"of columns does not match the data given "
438-
"during fit. "
439-
"Please make sure the data during fit and "
440-
"transform have the same number of columns.")
441-
442-
if (self._n_features != n_features or
443-
np.any(self._feature_names_in != np.asarray(feature_names))):
444-
warnings.warn("Given feature/column names or counts do not match "
445-
"the ones for the data given during fit. This will "
446-
"fail from v0.24.",
447-
FutureWarning)
448-
449419
def _log_message(self, name, idx, total):
450420
if not self.verbose:
451421
return None
@@ -584,30 +554,14 @@ def transform(self, X):
584554
else:
585555
X_feature_names = None
586556

587-
if self._n_features > X.shape[1]:
588-
raise ValueError('Number of features of the input must be equal '
589-
'to or greater than that of the fitted '
590-
'transformer. Transformer n_features is {0} '
591-
'and input n_features is {1}.'
592-
.format(self._n_features, X.shape[1]))
593-
594-
# No column reordering allowed for named cols combined with remainder
595-
# TODO: remove this mechanism in 0.24, once we enforce strict column
596-
# name order and count. See #14237 for details.
597-
if (self._remainder[2] is not None and
598-
hasattr(self, '_df_columns') and
599-
self._has_str_cols and
600-
hasattr(X, 'columns')):
601-
n_cols_fit = len(self._df_columns)
602-
n_cols_transform = len(X.columns)
603-
if (n_cols_transform >= n_cols_fit and
604-
any(X.columns[:n_cols_fit] != self._df_columns)):
605-
raise ValueError('Column ordering must be equal for fit '
606-
'and for transform when using the '
607-
'remainder keyword')
608-
609-
# TODO: also call _check_n_features(reset=False) in 0.24
610-
self._validate_features(X.shape[1], X_feature_names)
557+
self._check_n_features(X, reset=False)
558+
if (self._feature_names_in is not None and
559+
X_feature_names is not None and
560+
np.any(self._feature_names_in != X_feature_names)):
561+
raise RuntimeError(
562+
"Given feature/column names do not match the ones for the "
563+
"data given during fit."
564+
)
611565
Xs = self._fit_transform(X, None, _transform_one, fitted=True)
612566
self._validate_output(Xs)
613567

@@ -801,16 +755,6 @@ def make_column_transformer(*transformers,
801755
verbose=verbose)
802756

803757

804-
def _is_negative_indexing(key):
805-
# TODO: remove in v0.24
806-
def is_neg(x): return isinstance(x, numbers.Integral) and x < 0
807-
if isinstance(key, slice):
808-
return is_neg(key.start) or is_neg(key.stop)
809-
elif _determine_key_type(key) == 'int':
810-
return np.any(np.asarray(key) < 0)
811-
return False
812-
813-
814758
class make_column_selector:
815759
"""Create a callable to select columns to be used with
816760
:class:`ColumnTransformer`.

sklearn/compose/tests/test_column_transformer.py

Lines changed: 19 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -514,12 +514,13 @@ def test_column_transformer_invalid_columns(remainder):
514514
ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder)
515515
ct.fit(X_array)
516516
X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
517-
msg = ("Given feature/column names or counts do not match the ones for "
518-
"the data given during fit.")
519-
with pytest.warns(FutureWarning, match=msg):
520-
ct.transform(X_array_more) # Should accept added columns, for now
517+
msg = ("X has 3 features, but ColumnTransformer is expecting 2 features "
518+
"as input.")
519+
with pytest.raises(ValueError, match=msg):
520+
ct.transform(X_array_more)
521521
X_array_fewer = np.array([[0, 1, 2], ]).T
522-
err_msg = 'Number of features'
522+
err_msg = ("X has 1 features, but ColumnTransformer is expecting 2 "
523+
"features as input.")
523524
with pytest.raises(ValueError, match=err_msg):
524525
ct.transform(X_array_fewer)
525526

@@ -1186,17 +1187,18 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname):
11861187
remainder=Trans())
11871188

11881189
tf.fit(X_fit_df)
1189-
err_msg = 'Column ordering must be equal'
1190-
warn_msg = ("Given feature/column names or counts do not match the ones "
1191-
"for the data given during fit.")
1192-
with pytest.raises(ValueError, match=err_msg):
1190+
err_msg = ("Given feature/column names do not match the ones for the "
1191+
"data given during fit.")
1192+
with pytest.raises(RuntimeError, match=err_msg):
11931193
tf.transform(X_trans_df)
11941194

1195-
# No error for added columns if ordering is identical
1195+
# ValueError for added columns
11961196
X_extended_df = X_fit_df.copy()
11971197
X_extended_df['third'] = [3, 6, 9]
1198-
with pytest.warns(FutureWarning, match=warn_msg):
1199-
tf.transform(X_extended_df) # No error should be raised, for now
1198+
err_msg = ("X has 3 features, but ColumnTransformer is expecting 2 "
1199+
"features as input.")
1200+
with pytest.raises(ValueError, match=err_msg):
1201+
tf.transform(X_extended_df)
12001202

12011203
# No 'columns' AttributeError when transform input is a numpy array
12021204
X_array = X_fit_array.copy()
@@ -1218,15 +1220,15 @@ def test_feature_name_validation():
12181220
tf = ColumnTransformer([('bycol', Trans(), ['a', 'b'])])
12191221
tf.fit(df)
12201222

1221-
msg = ("Given feature/column names or counts do not match the ones for "
1222-
"the data given during fit.")
1223-
with pytest.warns(FutureWarning, match=msg):
1223+
msg = ("X has 3 features, but ColumnTransformer is expecting 2 features "
1224+
"as input.")
1225+
with pytest.raises(ValueError, match=msg):
12241226
tf.transform(df_extra)
12251227

12261228
tf = ColumnTransformer([('bycol', Trans(), [0])])
12271229
tf.fit(df)
12281230

1229-
with pytest.warns(FutureWarning, match=msg):
1231+
with pytest.raises(ValueError, match=msg):
12301232
tf.transform(X_extra)
12311233

12321234
with warnings.catch_warnings(record=True) as warns:
@@ -1236,24 +1238,9 @@ def test_feature_name_validation():
12361238
tf = ColumnTransformer([('bycol', Trans(), ['a'])],
12371239
remainder=Trans())
12381240
tf.fit(df)
1239-
with pytest.warns(FutureWarning, match=msg):
1240-
tf.transform(df_extra)
1241-
1242-
tf = ColumnTransformer([('bycol', Trans(), [0, -1])])
1243-
tf.fit(df)
1244-
msg = "At least one negative column was used to"
1245-
with pytest.raises(RuntimeError, match=msg):
1246-
tf.transform(df_extra)
1247-
1248-
tf = ColumnTransformer([('bycol', Trans(), slice(-1, -3, -1))])
1249-
tf.fit(df)
1250-
with pytest.raises(RuntimeError, match=msg):
1241+
with pytest.raises(ValueError, match=msg):
12511242
tf.transform(df_extra)
12521243

1253-
with warnings.catch_warnings(record=True) as warns:
1254-
tf.transform(df)
1255-
assert not warns
1256-
12571244

12581245
@pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix])
12591246
def test_column_transformer_mask_indexing(array_type):

0 commit comments

Comments
 (0)
0