From ac19cd65239f7da3fbbc90b8800d9511f8ffa6aa Mon Sep 17 00:00:00 2001 From: Maxwell Aladago Date: Thu, 15 Aug 2019 17:03:43 -0400 Subject: [PATCH 1/5] tests for stronger validation for linear regression weights --- sklearn/linear_model/base.py | 6 ++--- sklearn/linear_model/tests/test_base.py | 34 +++++++++++++++++++++++ sklearn/utils/validation.py | 36 +++++++++++++------------ 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index d2af98d07ac09..95554fa9aa37a 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -27,7 +27,7 @@ from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin) from ..utils import check_array, check_X_y -from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import FLOAT_DTYPES, _check_sample_weight from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale @@ -467,8 +467,8 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], y_numeric=True, multi_output=True) - if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1: - raise ValueError("Sample weights must be 1D array or scalar") + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, X.dtype) X, y, X_offset, y_offset, X_scale = self._preprocess_data( X, y, fit_intercept=self.fit_intercept, normalize=self.normalize, diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 2836a63859a10..95bccffa5c3f7 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -93,6 +93,40 @@ def test_linear_regression_sample_weights(): assert_almost_equal(inter1, coefs2[0]) +def test_sample_weights(): + X = [[1], [2]] + Y = [1, 2] + + reg = LinearRegression() + reg.fit(X, Y) + # test should pass if sample weight is None + assert_array_almost_equal(reg.coef_, [0]) + assert_array_almost_equal(reg.intercept_, [0]) + assert_array_almost_equal(reg.predict(X), [0]) + + # improperly defined sample weight + sample_weight = ["1", "pi"] + np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight) + + # sample weights which can be converted to numeric types passes + # if this behavior is not desirable, we can raise an exception + sample_weight = ["1", "1"] + reg.fit(X, Y, sample_weight) + assert_array_almost_equal(reg.coef_, [0]) + assert_array_almost_equal(reg.intercept_, [0]) + assert_array_almost_equal(reg.predict(X), [0]) + + # sample wight with few samples than X should raise an ValueError + sample_weight = [2] + reg.fit(X, Y, sample_weight) + np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight) + + # sample weights greater than 1D raises ValueError + sample_weight = [[1, 1]] + reg.fit(X, Y, sample_weight) + np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight) + + def test_raises_value_error_if_sample_weights_greater_than_1d(): # Sample weights must be either scalar or 1D diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5de790701aaaa..d615c7137455e 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1025,27 +1025,29 @@ def _check_sample_weight(sample_weight, X, dtype=None): """ n_samples = _num_samples(X) - if dtype is not None and dtype not in [np.float32, np.float64]: + if dtype is None: + dtype = [np.float64, np.float32] + elif dtype not in [np.float32, np.float64]: dtype = np.float64 - if sample_weight is None or isinstance(sample_weight, numbers.Number): - if sample_weight is None: - sample_weight = np.ones(n_samples, dtype=dtype) - else: - sample_weight = np.full(n_samples, sample_weight, + if sample_weight is None: + sample_weight = np.ones(n_samples, dtype=dtype) + elif isinstance(sample_weight, numbers.Number): + sample_weight = np.full(n_samples, sample_weight, dtype=dtype) else: - if dtype is None: - dtype = [np.float64, np.float32] - sample_weight = check_array( - sample_weight, accept_sparse=False, - ensure_2d=False, dtype=dtype, order="C" - ) - if sample_weight.ndim != 1: - raise ValueError("Sample weights must be 1D array or scalar") - - if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" + try: + sample_weight = np.array(sample_weight, dtype=dtype) + except ValueError as e: + e.args = e.args[0] + ". sample weights must a scalar or" \ + " a 1D array numeric types" + raise + + if sample_weight.ndim != 1: + raise ValueError("Sample weights must be 1D array or scalar") + + if sample_weight.shape != (n_samples,): + raise ValueError("sample_weight.shape == {}, expected {}!" .format(sample_weight.shape, (n_samples,))) return sample_weight From 4c25185cc43c12a900026d34972ebc9561e2042b Mon Sep 17 00:00:00 2001 From: Maxwell Aladago Date: Wed, 21 Aug 2019 03:38:56 -0400 Subject: [PATCH 2/5] refactored validation and tested against all estimators --- sklearn/ensemble/forest.py | 4 +-- sklearn/linear_model/tests/test_base.py | 34 ------------------------- sklearn/utils/estimator_checks.py | 11 ++++++++ sklearn/utils/validation.py | 23 +++++++---------- 4 files changed, 22 insertions(+), 50 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index 911b16b67df5f..15fef6214cbab 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -60,7 +60,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from .base import BaseEnsemble, _partition_estimators from ..utils.fixes import parallel_helper, _joblib_parallel_args from ..utils.multiclass import check_classification_targets -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _check_sample_weight __all__ = ["RandomForestClassifier", @@ -243,7 +243,7 @@ def fit(self, X, y, sample_weight=None): X = check_array(X, accept_sparse="csc", dtype=DTYPE) y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) if sample_weight is not None: - sample_weight = check_array(sample_weight, ensure_2d=False) + sample_weight = _check_sample_weight(sample_weight, X) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index 95bccffa5c3f7..2836a63859a10 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -93,40 +93,6 @@ def test_linear_regression_sample_weights(): assert_almost_equal(inter1, coefs2[0]) -def test_sample_weights(): - X = [[1], [2]] - Y = [1, 2] - - reg = LinearRegression() - reg.fit(X, Y) - # test should pass if sample weight is None - assert_array_almost_equal(reg.coef_, [0]) - assert_array_almost_equal(reg.intercept_, [0]) - assert_array_almost_equal(reg.predict(X), [0]) - - # improperly defined sample weight - sample_weight = ["1", "pi"] - np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight) - - # sample weights which can be converted to numeric types passes - # if this behavior is not desirable, we can raise an exception - sample_weight = ["1", "1"] - reg.fit(X, Y, sample_weight) - assert_array_almost_equal(reg.coef_, [0]) - assert_array_almost_equal(reg.intercept_, [0]) - assert_array_almost_equal(reg.predict(X), [0]) - - # sample wight with few samples than X should raise an ValueError - sample_weight = [2] - reg.fit(X, Y, sample_weight) - np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight) - - # sample weights greater than 1D raises ValueError - sample_weight = [[1, 1]] - reg.fit(X, Y, sample_weight) - np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight) - - def test_raises_value_error_if_sample_weights_greater_than_1d(): # Sample weights must be either scalar or 1D diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 23109e32a4549..acd76199887f7 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -11,6 +11,7 @@ from scipy import sparse from scipy.stats import rankdata import joblib +import pytest from . import IS_PYPY from .testing import assert_raises, _get_args @@ -636,6 +637,16 @@ def check_sample_weights_invariance(name, estimator_orig): 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int')) y = enforce_estimator_tags_y(estimator1, y) + # sample weights greater than 1D raises ValueError + sample_weight = [[1, 2]] + with pytest.raises(ValueError): + estimator2.fit(X, y, sample_weight=sample_weight) + + # sample wight with few samples than X raises ValueError + sample_weight = [[1, 1]] + with pytest.raises(ValueError): + estimator2.fit(X, y, sample_weight=sample_weight) + estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) estimator2.fit(X, y=y, sample_weight=None) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index d615c7137455e..13e60aaffc371 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1025,23 +1025,18 @@ def _check_sample_weight(sample_weight, X, dtype=None): """ n_samples = _num_samples(X) - if dtype is None: - dtype = [np.float64, np.float32] - elif dtype not in [np.float32, np.float64]: + if dtype not in [np.float32, np.float64]: dtype = np.float64 - if sample_weight is None: - sample_weight = np.ones(n_samples, dtype=dtype) - elif isinstance(sample_weight, numbers.Number): - sample_weight = np.full(n_samples, sample_weight, + if sample_weight is None or isinstance(sample_weight, numbers.Number): + if sample_weight is None: + sample_weight = np.ones(n_samples, dtype=dtype) + elif isinstance(sample_weight, numbers.Number): + sample_weight = np.full(n_samples, sample_weight, dtype=dtype) - else: - try: - sample_weight = np.array(sample_weight, dtype=dtype) - except ValueError as e: - e.args = e.args[0] + ". sample weights must a scalar or" \ - " a 1D array numeric types" - raise + return sample_weight + + sample_weight = np.array(sample_weight, dtype=dtype) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") From 7b807785b7f863cb4171cbbb070b493d91296937 Mon Sep 17 00:00:00 2001 From: Maxwell Aladago Date: Wed, 21 Aug 2019 08:46:12 -0400 Subject: [PATCH 3/5] added checks to calibration.py --- sklearn/calibration.py | 8 ++++---- sklearn/linear_model/base.py | 6 +++--- sklearn/utils/validation.py | 3 +++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index b88a8b8eb37ef..6568c83a53526 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -23,6 +23,7 @@ from .preprocessing import label_binarize, LabelBinarizer from .utils import check_X_y, check_array, indexable, column_or_1d from .utils.validation import check_is_fitted, check_consistent_length +from .utils.validation import _check_sample_weight from .isotonic import IsotonicRegression from .svm import LinearSVC from .model_selection import check_cv @@ -155,6 +156,9 @@ def fit(self, X, y, sample_weight=None): else: base_estimator = self.base_estimator + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + if self.cv == "prefit": calibrated_classifier = _CalibratedClassifier( base_estimator, method=self.method) @@ -172,12 +176,8 @@ def fit(self, X, y, sample_weight=None): warnings.warn("%s does not support sample_weight. Samples" " weights are only used for the calibration" " itself." % estimator_name) - sample_weight = check_array(sample_weight, ensure_2d=False) base_estimator_sample_weight = None else: - if sample_weight is not None: - sample_weight = check_array(sample_weight, ensure_2d=False) - check_consistent_length(y, sample_weight) base_estimator_sample_weight = sample_weight for train, test in cv.split(X, y): this_estimator = clone(base_estimator) diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index 95554fa9aa37a..4cc7af51c243f 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -34,7 +34,7 @@ from ..utils.fixes import sparse_lsqr from ..utils.seq_dataset import ArrayDataset32, CSRDataset32 from ..utils.seq_dataset import ArrayDataset64, CSRDataset64 -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _check_sample_weight from ..preprocessing.data import normalize as f_normalize # TODO: bayesian_ridge_regression and bayesian_regression_ard @@ -118,8 +118,8 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, centered. This function also systematically makes y consistent with X.dtype """ - if isinstance(sample_weight, numbers.Number): - sample_weight = None + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, np.float64) if check_input: X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'], diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 13e60aaffc371..afd9339f4c6c8 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1025,6 +1025,9 @@ def _check_sample_weight(sample_weight, X, dtype=None): """ n_samples = _num_samples(X) + if hasattr(sample_weight, "dtype"): + dtype = sample_weight.dtype + if dtype not in [np.float32, np.float64]: dtype = np.float64 From e679df87e51d8c8b3ce7dccf7ea11912f5bcc2ae Mon Sep 17 00:00:00 2001 From: Maxwell Aladago Date: Wed, 21 Aug 2019 08:50:45 -0400 Subject: [PATCH 4/5] flake8 issue --- sklearn/linear_model/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index 4cc7af51c243f..e77c05e17d8d0 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -14,7 +14,6 @@ # License: BSD 3 clause from abc import ABCMeta, abstractmethod -import numbers import warnings import numpy as np @@ -27,7 +26,7 @@ from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin) from ..utils import check_array, check_X_y -from ..utils.validation import FLOAT_DTYPES, _check_sample_weight +from ..utils.validation import FLOAT_DTYPES from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale From bdeb2c21e91cfdadccbf9f5ac3d9ccc674609c4d Mon Sep 17 00:00:00 2001 From: Maxwell Aladago Date: Fri, 23 Aug 2019 08:50:31 -0400 Subject: [PATCH 5/5] removing tests for all estimators --- sklearn/linear_model/base.py | 2 +- sklearn/utils/estimator_checks.py | 11 ----------- sklearn/utils/tests/test_validation.py | 6 ++++++ sklearn/utils/validation.py | 20 ++++++++++++++++++-- 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index e77c05e17d8d0..8b651be1e709c 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -118,7 +118,7 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, """ if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X, np.float64) + sample_weight = _check_sample_weight(sample_weight, X) if check_input: X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'], diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index acd76199887f7..23109e32a4549 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -11,7 +11,6 @@ from scipy import sparse from scipy.stats import rankdata import joblib -import pytest from . import IS_PYPY from .testing import assert_raises, _get_args @@ -637,16 +636,6 @@ def check_sample_weights_invariance(name, estimator_orig): 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int')) y = enforce_estimator_tags_y(estimator1, y) - # sample weights greater than 1D raises ValueError - sample_weight = [[1, 2]] - with pytest.raises(ValueError): - estimator2.fit(X, y, sample_weight=sample_weight) - - # sample wight with few samples than X raises ValueError - sample_weight = [[1, 1]] - with pytest.raises(ValueError): - estimator2.fit(X, y, sample_weight=sample_weight) - estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) estimator2.fit(X, y=y, sample_weight=None) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 0f7ffe9a3e4f0..f2113208ab51e 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -912,6 +912,12 @@ def test_check_sample_weight(): sample_weight = _check_sample_weight(None, X, dtype=X.dtype) assert sample_weight.dtype == np.float64 + # wrongly formated sample_weight + sample_weight = np.array(["1", "pi", "e"]) + err_msg = "could not convert string to float: 'pi'" + with pytest.raises(ValueError, match=err_msg): + _check_sample_weight(sample_weight, X) + @pytest.mark.parametrize("toarray", [ np.array, sp.csr_matrix, sp.csc_matrix]) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index afd9339f4c6c8..bffc3422e7f4e 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1025,6 +1025,13 @@ def _check_sample_weight(sample_weight, X, dtype=None): """ n_samples = _num_samples(X) + # this check is needed to ensure that we don't change the dtype of + # of sample_weight if it's already np.float32. + # since sample_weight can be a list or an array, we first + # need to verify that it has a dtype attribute before the check. + # if dtype is None or any other type besides np.float32, np.float64 + # is given. + if hasattr(sample_weight, "dtype"): dtype = sample_weight.dtype @@ -1039,14 +1046,23 @@ def _check_sample_weight(sample_weight, X, dtype=None): dtype=dtype) return sample_weight + # at this point, sample_weight is either a list or + # an array. These checks will validate that the dtype + # of the returned sample_weight is either np.float32 or + # np.float64. If sample weight contained elements which + # cannot be passed safely to the above types, the + # following line will raise a ValueError sample_weight = np.array(sample_weight, dtype=dtype) + # sample_weights must be 1-D arrays if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") - if sample_weight.shape != (n_samples,): + # and must have the same number of elements + # as X + if sample_weight.shape[0] != n_samples: raise ValueError("sample_weight.shape == {}, expected {}!" - .format(sample_weight.shape, (n_samples,))) + .format(sample_weight.shape, (n_samples, ))) return sample_weight