From fb33cb379e516e30d65182c34ca1a9074248fb78 Mon Sep 17 00:00:00 2001 From: sergul Date: Tue, 17 Jul 2018 12:10:31 -0400 Subject: [PATCH 1/7] check sample_weight shape added --- sklearn/ensemble/weight_boosting.py | 7 +++++++ sklearn/linear_model/ransac.py | 9 +++++++++ sklearn/utils/estimator_checks.py | 14 ++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index d01f8516d01be..759d5825205e5 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -124,6 +124,13 @@ def fit(self, X, y, sample_weight=None): "Attempting to fit with a non-positive " "weighted number of samples.") + if sample_weight.shape[0] > 0 and sample_weight.shape[0] != X.shape[0]: + raise ValueError("sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." + % (sample_weight.shape, X.shape)) + # Check parameters self._validate_estimator() diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py index f9c372d8ed469..3a4299386f396 100644 --- a/sklearn/linear_model/ransac.py +++ b/sklearn/linear_model/ransac.py @@ -311,6 +311,15 @@ def fit(self, X, y, sample_weight=None): if sample_weight is not None: sample_weight = np.asarray(sample_weight) + if sample_weight is not None and estimator_fit_has_sample_weight: + if (sample_weight.shape[0] > 0 and + sample_weight.shape[0] != X.shape[0]): + raise ValueError("sample_weight and X have incompatible " + " shapes: %r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." + % (sample_weight.shape, X.shape)) + n_inliers_best = 1 score_best = -np.inf inlier_mask_best = None diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3633479672cde..b64a8b1af9eff 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -88,6 +88,7 @@ def _yield_non_meta_checks(name, estimator): yield check_dtype_object yield check_sample_weights_pandas_series yield check_sample_weights_list + yield check_sample_weights_shape yield check_estimators_fit_returns_self yield partial(check_estimators_fit_returns_self, readonly_memmap=True) yield check_complex_data @@ -554,6 +555,19 @@ def check_sample_weights_list(name, estimator_orig): estimator.fit(X, y, sample_weight=sample_weight) +@ignore_warnings(category=(DeprecationWarning, FutureWarning)) +def check_sample_weights_shape(name, estimator_orig): + # check if estimators raise an error if sample_weight + # shape mismatches the input + if has_fit_parameter(estimator_orig, "sample_weight"): + estimator = clone(estimator_orig) + X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], + [2, 1], [2, 2], [2, 3], [2, 4]]) + y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + assert_raises(ValueError, estimator.fit, X, y, + sample_weight=np.ones(shape=2*len(y))) + + @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) def check_dtype_object(name, estimator_orig): # check that estimators treat dtype object as numeric if possible From 6bf4c6078ce11d923f64696d828cf6703cd62786 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 21 Feb 2020 14:04:27 +0100 Subject: [PATCH 2/7] Remove redundant checks --- sklearn/ensemble/_weight_boosting.py | 7 ------- sklearn/linear_model/_ransac.py | 9 --------- sklearn/utils/estimator_checks.py | 2 +- 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index a7fd422c6b534..2908b888e7c91 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -123,13 +123,6 @@ def fit(self, X, y, sample_weight=None): if np.any(sample_weight < 0): raise ValueError("sample_weight cannot contain negative weights") - if sample_weight.shape[0] > 0 and sample_weight.shape[0] != X.shape[0]: - raise ValueError("sample_weight and X have incompatible shapes: " - "%r vs %r\n" - "Note: Sparse matrices cannot be indexed w/" - "boolean masks (use `indices=True` in CV)." - % (sample_weight.shape, X.shape)) - # Check parameters self._validate_estimator() diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index fab8851950a14..0363032359524 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -322,15 +322,6 @@ def fit(self, X, y, sample_weight=None): if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) - if sample_weight is not None and estimator_fit_has_sample_weight: - if (sample_weight.shape[0] > 0 and - sample_weight.shape[0] != X.shape[0]): - raise ValueError("sample_weight and X have incompatible " - " shapes: %r vs %r\n" - "Note: Sparse matrices cannot be indexed w/" - "boolean masks (use `indices=True` in CV)." - % (sample_weight.shape, X.shape)) - n_inliers_best = 1 score_best = -np.inf inlier_mask_best = None diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 38eb5f39c1ba9..694210fd96dc0 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -781,7 +781,7 @@ def check_sample_weights_shape(name, estimator_orig): assert_raises(ValueError, estimator.fit, X, y, sample_weight=np.ones(2*len(y))) - + @ignore_warnings(category=(FutureWarning, UserWarning)) def check_sample_weights_invariance(name, estimator_orig): # check that the estimators yield same results for From 6d922a35040d0d878ed2e80d7c23b4e3511e872e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 21 Feb 2020 14:09:27 +0100 Subject: [PATCH 3/7] Minor fix --- sklearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 694210fd96dc0..53d9bb59e13d5 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -765,7 +765,7 @@ def check_sample_weights_list(name, estimator_orig): estimator.fit(X, y, sample_weight=sample_weight) -@ignore_warnings(category=(FutureWarning, UserWarning)) +@ignore_warnings(category=FutureWarning) def check_sample_weights_shape(name, estimator_orig): # check that estimators raise an error if sample_weight # shape mismatches the input @@ -782,7 +782,7 @@ def check_sample_weights_shape(name, estimator_orig): sample_weight=np.ones(2*len(y))) -@ignore_warnings(category=(FutureWarning, UserWarning)) +@ignore_warnings(category=FutureWarning) def check_sample_weights_invariance(name, estimator_orig): # check that the estimators yield same results for # unit weights and no weights From 4b3c3b0580250da397d5e98c962f71a9873dff1b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 21 Feb 2020 14:18:02 +0100 Subject: [PATCH 4/7] Also check for incorrect ndim --- sklearn/utils/estimator_checks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 53d9bb59e13d5..e843fe741716a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -781,6 +781,8 @@ def check_sample_weights_shape(name, estimator_orig): assert_raises(ValueError, estimator.fit, X, y, sample_weight=np.ones(2*len(y))) + assert_raises(ValueError, estimator.fit, X, y, + sample_weight=np.ones((len(y), 2))) @ignore_warnings(category=FutureWarning) def check_sample_weights_invariance(name, estimator_orig): From f5661aaff702ec7856b3b2a7f6733ea41eb606c9 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 21 Feb 2020 14:32:23 +0100 Subject: [PATCH 5/7] Remove redundant tests --- sklearn/ensemble/tests/test_gradient_boosting.py | 4 ---- sklearn/neighbors/tests/test_kde.py | 4 ---- sklearn/svm/tests/test_svm.py | 5 ----- sklearn/tree/tests/test_tree.py | 8 -------- 4 files changed, 21 deletions(-) diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index a28c69d0f7cc5..0c7f07929e370 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -345,10 +345,6 @@ def test_check_inputs(): clf = GradientBoostingClassifier(n_estimators=100, random_state=1) assert_raises(ValueError, clf.fit, X, y + [0, 1]) - clf = GradientBoostingClassifier(n_estimators=100, random_state=1) - assert_raises(ValueError, clf.fit, X, y, - sample_weight=([1] * len(y)) + [0, 1]) - weight = [0, 0, 0, 1, 1, 1] clf = GradientBoostingClassifier(n_estimators=100, random_state=1) msg = ("y contains 1 class after sample_weight trimmed classes with " diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 69aca8e8f75b8..6687cfa475ce8 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -210,10 +210,6 @@ def test_sample_weight_invalid(): kde = KernelDensity() data = np.reshape([1., 2., 3.], (-1, 1)) - sample_weight = [0.1, 0.2] - with pytest.raises(ValueError): - kde.fit(data, sample_weight=sample_weight) - sample_weight = [0.1, -0.2, 0.3] expected_err = "sample_weight must have positive values" with pytest.raises(ValueError, match=expected_err): diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index 191420d1d7147..fb811940c2971 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -639,11 +639,6 @@ def test_bad_input(): with pytest.raises(ValueError): clf.fit(X, Y) - # sample_weight bad dimensions - clf = svm.SVC() - with pytest.raises(ValueError): - clf.fit(X, Y, sample_weight=range(len(X) - 1)) - # predict with sparse input when trained with dense clf = svm.SVC().fit(X, Y) with pytest.raises(ValueError): diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 572bf8d01d57c..1149ceb8678d9 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1125,14 +1125,6 @@ def test_sample_weight_invalid(): with pytest.raises(TypeError, match=expected_err): clf.fit(X, y, sample_weight=sample_weight) - sample_weight = np.ones(101) - with pytest.raises(ValueError): - clf.fit(X, y, sample_weight=sample_weight) - - sample_weight = np.ones(99) - with pytest.raises(ValueError): - clf.fit(X, y, sample_weight=sample_weight) - def check_class_weights(name): """Check class_weights resemble sample_weights behavior.""" From dc706dfde18963de170999dd7d1aa0a842fea7c6 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 21 Feb 2020 14:46:23 +0100 Subject: [PATCH 6/7] Fix tests --- sklearn/utils/estimator_checks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e843fe741716a..9eeadf836ac67 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -769,7 +769,9 @@ def check_sample_weights_list(name, estimator_orig): def check_sample_weights_shape(name, estimator_orig): # check that estimators raise an error if sample_weight # shape mismatches the input - if has_fit_parameter(estimator_orig, "sample_weight"): + if (has_fit_parameter(estimator_orig, "sample_weight") and + not (hasattr(estimator_orig, "_pairwise") + and estimator_orig._pairwise)): estimator = clone(estimator_orig) X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], [2, 1], [2, 2], [2, 3], [2, 4]]) @@ -784,6 +786,7 @@ def check_sample_weights_shape(name, estimator_orig): assert_raises(ValueError, estimator.fit, X, y, sample_weight=np.ones((len(y), 2))) + @ignore_warnings(category=FutureWarning) def check_sample_weights_invariance(name, estimator_orig): # check that the estimators yield same results for From 9b97866506afc950216aa2f9df0161f03965a4da Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 21 Feb 2020 15:30:01 +0100 Subject: [PATCH 7/7] Fix failing test --- sklearn/utils/estimator_checks.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 9eeadf836ac67..1e86f68d4ca3c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -773,9 +773,12 @@ def check_sample_weights_shape(name, estimator_orig): not (hasattr(estimator_orig, "_pairwise") and estimator_orig._pairwise)): estimator = clone(estimator_orig) - X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], - [2, 1], [2, 2], [2, 3], [2, 4]]) - y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + X = np.array([[1, 3], [1, 3], [1, 3], [1, 3], + [2, 1], [2, 1], [2, 1], [2, 1], + [3, 3], [3, 3], [3, 3], [3, 3], + [4, 1], [4, 1], [4, 1], [4, 1]]) + y = np.array([1, 1, 1, 1, 2, 2, 2, 2, + 1, 1, 1, 1, 2, 2, 2, 2]) y = _enforce_estimator_tags_y(estimator, y) estimator.fit(X, y, sample_weight=np.ones(len(y)))