scikit-learn · amueller · Aug 7, 2019 · Aug 1, 2019 · Aug 2, 2019 · Aug 2, 2019
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -6,9 +6,10 @@
 # Author: Andreas Mueller
 #         Joris Van den Bossche
 # License: BSD
-
+import warnings
 from itertools import chain
 
+import numbers
 import numpy as np
 from scipy import sparse
 from joblib import Parallel, delayed
@@ -394,6 +395,34 @@ def _validate_output(self, result):
                     "The output of the '{0}' transformer should be 2D (scipy "
                     "matrix, array, or pandas DataFrame).".format(name))
 
+    def _validate_features(self, n_features, feature_names):
+        """Check transform-time feature count and names against fit.
+
+        Raises RuntimeError if a negative column index was used and the
+        count changed; otherwise only warns. TODO: raise from v0.24.
+        """
+        # Without names on both sides only the count can be compared.
+        if ((self._feature_names_in is None or feature_names is None)
+                and self._n_features == n_features):
+            return
+
+        neg_col_present = np.any([_is_negative_indexing(col)
+                                  for col in self._columns])
+        if neg_col_present and self._n_features != n_features:
+            raise RuntimeError("At least one negative column was used to "
+                               "indicate columns, and the new data's number "
+                               "of columns does not match the data given "
+                               "during fit. "
+                               "Please make sure the data during fit and "
+                               "transform have the same number of columns.")
+
+        if (self._n_features != n_features or
+                np.any(self._feature_names_in != np.asarray(feature_names))):
+            warnings.warn("Given feature/column names or counts do not match "
+                          "the ones for the data given during fit. This will "
+                          "fail from v0.24.",
+                          DeprecationWarning)
+
     def _log_message(self, name, idx, total):
         if not self.verbose:
             return None
@@ -470,6 +499,11 @@ def fit_transform(self, X, y=None):
             sparse matrices.
 
         """
+        # TODO: this should be `feature_names_in_` when we start having it
+        if hasattr(X, "columns"):
+            self._feature_names_in = np.asarray(X.columns)
+        else:
+            self._feature_names_in = None
         X = _check_X(X)
         self._validate_transformers()
         self._validate_column_callables(X)
@@ -518,6 +552,10 @@ def transform(self, X):
         """
         check_is_fitted(self, 'transformers_')
         X = _check_X(X)
+        if hasattr(X, "columns"):
+            X_feature_names = np.asarray(X.columns)
+        else:
+            X_feature_names = None
 
         if self._n_features > X.shape[1]:
             raise ValueError('Number of features of the input must be equal '
@@ -527,6 +565,8 @@ def transform(self, X):
                              .format(self._n_features, X.shape[1]))
 
         # No column reordering allowed for named cols combined with remainder
+        # TODO: remove this mechanism in 0.24, once we enforce strict column
+        # name order and count. See #14237 for details.
         if (self._remainder[2] is not None and
                 hasattr(self, '_df_columns') and
                 hasattr(X, 'columns')):
@@ -538,6 +578,7 @@ def transform(self, X):
                                  'and for transform when using the '
                                  'remainder keyword')
 
+        self._validate_features(X.shape[1], X_feature_names)
         Xs = self._fit_transform(X, None, _transform_one, fitted=True)
         self._validate_output(Xs)
 
@@ -707,3 +748,13 @@ def make_column_transformer(*transformers, **kwargs):
                              remainder=remainder,
                              sparse_threshold=sparse_threshold,
                              verbose=verbose)
+
+
+def _is_negative_indexing(key):
+    # Detect negative integer indices or slice bounds. TODO: remove in v0.24
+    def is_neg(x): return isinstance(x, numbers.Integral) and x < 0
+    if isinstance(key, slice):
+        return is_neg(key.start) or is_neg(key.stop)  # step is not checked
+    elif _check_key_type(key, int):
+        return np.any(np.asarray(key) < 0)
+    return False
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
@@ -3,6 +3,7 @@
 """
 import re
 
+import warnings
 import numpy as np
 from scipy import sparse
 import pytest
@@ -498,7 +499,10 @@ def test_column_transformer_invalid_columns(remainder):
     ct = ColumnTransformer([('trans', Trans(), col)], remainder=remainder)
     ct.fit(X_array)
     X_array_more = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
-    ct.transform(X_array_more)  # Should accept added columns
+    msg = ("Given feature/column names or counts do not match the ones for "
+           "the data given during fit.")
+    with pytest.warns(DeprecationWarning, match=msg):
+        ct.transform(X_array_more)  # Should accept added columns, for now
     X_array_fewer = np.array([[0, 1, 2], ]).T
     err_msg = 'Number of features'
     with pytest.raises(ValueError, match=err_msg):
@@ -1096,13 +1100,16 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname):
 
     tf.fit(X_fit_df)
     err_msg = 'Column ordering must be equal'
+    warn_msg = ("Given feature/column names or counts do not match the ones "
+                "for the data given during fit.")
     with pytest.raises(ValueError, match=err_msg):
         tf.transform(X_trans_df)
 
     # No error for added columns if ordering is identical
     X_extended_df = X_fit_df.copy()
     X_extended_df['third'] = [3, 6, 9]
-    tf.transform(X_extended_df)  # No error should be raised
+    with pytest.warns(DeprecationWarning, match=warn_msg):
+        tf.transform(X_extended_df)  # No error should be raised, for now
 
     # No 'columns' AttributeError when transform input is a numpy array
     X_array = X_fit_array.copy()
@@ -1111,6 +1118,56 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname):
         tf.transform(X_array)
 
 
+def test_feature_name_validation():
+    """Tests if the proper warning/error is raised if the columns do not match
+    during fit and transform."""
+    pd = pytest.importorskip("pandas")
+
+    X = np.ones(shape=(3, 2))
+    X_extra = np.ones(shape=(3, 3))
+    df = pd.DataFrame(X, columns=['a', 'b'])
+    df_extra = pd.DataFrame(X_extra, columns=['a', 'b', 'c'])
+
+    tf = ColumnTransformer([('bycol', Trans(), ['a', 'b'])])
+    tf.fit(df)
+
+    msg = ("Given feature/column names or counts do not match the ones for "
+           "the data given during fit.")
+    with pytest.warns(DeprecationWarning, match=msg):
+        tf.transform(df_extra)  # extra column 'c' not seen during fit
+
+    tf = ColumnTransformer([('bycol', Trans(), [0])])
+    tf.fit(df)
+
+    with pytest.warns(DeprecationWarning, match=msg):
+        tf.transform(X_extra)  # array with one more column than in fit
+
+    with warnings.catch_warnings(record=True) as warns:
+        tf.transform(X)  # same column count as fit: no warning expected
+    assert not warns
+
+    tf = ColumnTransformer([('bycol', Trans(), ['a'])],
+                           remainder=Trans())
+    tf.fit(df)
+    with pytest.warns(DeprecationWarning, match=msg):
+        tf.transform(df_extra)  # mismatch also warns with a remainder
+
+    tf = ColumnTransformer([('bycol', Trans(), [0, -1])])  # negative index
+    tf.fit(df)
+    msg = "At least one negative column was used to"
+    with pytest.raises(RuntimeError, match=msg):
+        tf.transform(df_extra)
+
+    tf = ColumnTransformer([('bycol', Trans(), slice(-1, -3, -1))])
+    tf.fit(df)
+    with pytest.raises(RuntimeError, match=msg):
+        tf.transform(df_extra)
+
+    with warnings.catch_warnings(record=True) as warns:
+        tf.transform(df)  # columns identical to fit: no warning expected
+    assert not warns
+
+
 @pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix])
 def test_column_transformer_mask_indexing(array_type):
     # Regression test for #14510