Closed
Description
Description
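When a pandas DataFrame is passed to the split method of model_selection.KFold or GroupKFold, the call fails if the DataFrame contains a column labelled 'fit'. Renaming the column to anything other than 'fit' fixes the issue. The traceback below suggests the cause: pandas exposes column labels as attributes, so a DataFrame with a 'fit' column has a `fit` attribute, and `_num_samples` then rejects it as if it were an estimator. This is especially problematic for users of CountVectorizer (or similar) who use the feature names as column names in a DataFrame that is later used for machine learning.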
Steps/Code to Reproduce
BUG:
from sklearn import datasets
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
import pandas as pd
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
Y = iris.target
X['fit'] = 0
gkf = GroupKFold(n_splits=10)
for train, test in gkf.split(X, Y, groups=X['petal width (cm)']):
    pass
WORKS:
from sklearn import datasets
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
import pandas as pd
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
Y = iris.target
X['fit_'] = 0
gkf = GroupKFold(n_splits=10)
for train, test in gkf.split(X, Y, groups=X['petal width (cm)']):
    pass
The only difference between the two code samples is the name of the added column: 'fit' vs 'fit_'.
Expected Results
No output; the loop should complete silently.
Actual Results
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-20-4e085b2f516b> in <module>()
13 gkf = GroupKFold(n_splits=10)
14
---> 15 for train, test in gkf.split(X, Y, groups=X['petal width (cm)']):
16 pass
AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\model_selection\_split.pyc in split(self, X, y, groups)
312 The testing set indices for that split.
313 """
--> 314 X, y, groups = indexable(X, y, groups)
315 n_samples = _num_samples(X)
316 if self.n_splits > n_samples:
AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in indexable(*iterables)
204 else:
205 result.append(np.array(X))
--> 206 check_consistent_length(*result)
207 return result
208
AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in check_consistent_length(*arrays)
175 """
176
--> 177 lengths = [_num_samples(X) for X in arrays if X is not None]
178 uniques = np.unique(lengths)
179 if len(uniques) > 1:
AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in _num_samples(x)
114 # Don't get num_samples from an ensembles length!
115 raise TypeError('Expected sequence or array-like, got '
--> 116 'estimator %s' % x)
117 if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
118 if hasattr(x, '__array__'):
TypeError: Expected sequence or array-like, got estimator sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \
Versions
Windows-10-10.0.14393
('Python', '2.7.12 |Anaconda custom (64-bit)| (default, Jun 29 2016, 11:07:13) [MSC v.1500 64 bit (AMD64)]')
('NumPy', '1.11.1')
('SciPy', '0.18.1')
('Scikit-Learn', '0.18.1')