From 839b0059479b17e9910ababe9fe26aedd224e69d Mon Sep 17 00:00:00 2001 From: Kat Hempstalk Date: Tue, 21 Feb 2017 15:46:52 +1300 Subject: [PATCH 1/6] Fixed assumption fit attribute means object is estimator. --- sklearn/utils/validation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 58ea733c3a118..b9babfc2f2597 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -93,8 +93,9 @@ def _is_arraylike(x): def _num_samples(x): """Return number of samples in array-like x.""" if hasattr(x, 'fit'): - # Don't get num_samples from an ensembles length! - raise TypeError('Expected sequence or array-like, got ' + if hasattr(x.fit, '__call__'): + # Don't get num_samples from an ensembles length! + raise TypeError('Expected sequence or array-like, got ' 'estimator %s' % x) if not hasattr(x, '__len__') and not hasattr(x, 'shape'): if hasattr(x, '__array__'): From 63486425d7e6a29b108703f18c3f20e0a462be6b Mon Sep 17 00:00:00 2001 From: Kat Hempstalk Date: Tue, 21 Feb 2017 16:08:22 +1300 Subject: [PATCH 2/6] Fixed 'and' in if statement --- sklearn/utils/validation.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index b9babfc2f2597..e87b6c25df5ea 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -92,10 +92,9 @@ def _is_arraylike(x): def _num_samples(x): """Return number of samples in array-like x.""" - if hasattr(x, 'fit'): - if hasattr(x.fit, '__call__'): - # Don't get num_samples from an ensembles length! - raise TypeError('Expected sequence or array-like, got ' + if hasattr(x, 'fit') and hasattr(x.fit, '__call__'): + # Don't get num_samples from an ensembles length! + raise TypeError('Expected sequence or array-like, got ' 'estimator %s' % x) if not hasattr(x, '__len__') and not hasattr(x, 'shape'): if hasattr(x, '__array__'): From d37c2480a7241f8b290cb7868527ab70ba187318 Mon Sep 17 00:00:00 2001 From: Kathryn Hempstalk Date: Tue, 21 Feb 2017 16:50:10 +1300 Subject: [PATCH 3/6] Added non-regression test for #8418 --- sklearn/utils/tests/test_validation.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 37469f165da15..9a899add7734a 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -15,6 +15,7 @@ from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import assert_warns from sklearn.utils.testing import ignore_warnings +from sklearn.utils.testing import SkipTest from sklearn.utils import as_float_array, check_array, check_symmetric from sklearn.utils import check_X_y from sklearn.utils.mocking import MockDataFrame @@ -500,4 +501,14 @@ def test_check_consistent_length(): # Despite ensembles having __len__ they must raise TypeError assert_raises_regexp(TypeError, 'estimator', check_consistent_length, [1, 2], RandomForestRegressor()) + + #check pandas dataframe with 'fit' column does not raise error + try: + import pandas as pd + X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + X_df = pd.DataFrame(X,columns=['a','b','fit']) + check_consistent_length(X_df) + except ImportError: + raise SkipTest("Pandas not found") + # XXX: We should have a test with a string, but what is correct behaviour? From b417ce6d471e134e0fe5fbb17cfea41ba4b5699d Mon Sep 17 00:00:00 2001 From: Kathryn Hempstalk Date: Tue, 21 Feb 2017 16:57:14 +1300 Subject: [PATCH 4/6] Added PEP8 compliance for new regression test --- sklearn/utils/tests/test_validation.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 9a899add7734a..e15ffeced28b6 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -501,14 +501,13 @@ def test_check_consistent_length(): # Despite ensembles having __len__ they must raise TypeError assert_raises_regexp(TypeError, 'estimator', check_consistent_length, [1, 2], RandomForestRegressor()) - - #check pandas dataframe with 'fit' column does not raise error + + # check pandas dataframe with 'fit' column does not raise error try: import pandas as pd X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - X_df = pd.DataFrame(X,columns=['a','b','fit']) + X_df = pd.DataFrame(X, columns=['a', 'b', 'fit']) check_consistent_length(X_df) except ImportError: raise SkipTest("Pandas not found") - # XXX: We should have a test with a string, but what is correct behaviour? From acad16c7c3b5257edf161fe7a04e213d182cbc6a Mon Sep 17 00:00:00 2001 From: Kathryn Hempstalk Date: Tue, 21 Feb 2017 17:02:53 +1300 Subject: [PATCH 5/6] Replaced hasattr function with callable in-built --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index e87b6c25df5ea..aa7b3cc78f808 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -92,7 +92,7 @@ def _is_arraylike(x): def _num_samples(x): """Return number of samples in array-like x.""" - if hasattr(x, 'fit') and hasattr(x.fit, '__call__'): + if hasattr(x, 'fit') and callable(x.fit): # Don't get num_samples from an ensembles length! raise TypeError('Expected sequence or array-like, got ' 'estimator %s' % x) From 6041763c192c4df9d1d88e4624389e87d7710776 Mon Sep 17 00:00:00 2001 From: drkatnz Date: Tue, 21 Feb 2017 22:27:01 +1300 Subject: [PATCH 6/6] Moved pandas test to separate function --- sklearn/utils/tests/test_validation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index e15ffeced28b6..49d867a1b0bee 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -501,8 +501,12 @@ def test_check_consistent_length(): # Despite ensembles having __len__ they must raise TypeError assert_raises_regexp(TypeError, 'estimator', check_consistent_length, [1, 2], RandomForestRegressor()) + # XXX: We should have a test with a string, but what is correct behaviour? + +def test_check_dataframe_fit_attribute(): # check pandas dataframe with 'fit' column does not raise error + # https://github.com/scikit-learn/scikit-learn/issues/8415 try: import pandas as pd X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) @@ -510,4 +514,3 @@ def test_check_consistent_length(): check_consistent_length(X_df) except ImportError: raise SkipTest("Pandas not found") - # XXX: We should have a test with a string, but what is correct behaviour?