From 9800b4b08c62b7616e3d3de997c82f840aa97c3c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 10:32:56 +0200 Subject: [PATCH 01/34] Common sample_weight validation in huber --- sklearn/linear_model/huber.py | 9 +++------ sklearn/utils/validation.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/huber.py b/sklearn/linear_model/huber.py index 0a4b6e10f6f98..3225487df7dab 100644 --- a/sklearn/linear_model/huber.py +++ b/sklearn/linear_model/huber.py @@ -8,8 +8,8 @@ from ..base import BaseEstimator, RegressorMixin from .base import LinearModel from ..utils import check_X_y -from ..utils import check_consistent_length from ..utils import axis0_safe_slice +from ..utils.validation import _check_sample_weight from ..utils.extmath import safe_sparse_dot @@ -253,11 +253,8 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y( X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=[np.float64, np.float32]) - if sample_weight is not None: - sample_weight = np.array(sample_weight) - check_consistent_length(y, sample_weight) - else: - sample_weight = np.ones_like(y) + + sample_weight = _check_sample_weight(sample_weight, y) if self.epsilon < 1.0: raise ValueError( diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index bb6cf1c8ffe00..0413c48daeec7 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -980,3 +980,32 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): if max_val is not None and x > max_val: raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) + + +def _check_sample_weight(sample_weight, y, **kwargs): + """Validate sample weights + + Parameters + ---------- + sample_weight : {ndarray, Number or None} + input sample weights + y: ndarray or None + target variable + kwargs: + additional parameters to pass to check_array + + Parameters + ---------- + sample_weight : ndarray + 
validated sample weights + """ + if sample_weight is None or isinstance(sample_weight, numbers.Number): + sample_weight = np.ones_like(y) + else: + sample_weight = check_array( + sample_weight, accept_sparse=False, + ensure_2d=False, dtype=[np.float64, np.float32], + **kwargs + ) + check_consistent_length(y, sample_weight) + return sample_weight From 95df18752be8f8a39b94196ef1a0584f42629c1c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 10:52:58 +0200 Subject: [PATCH 02/34] Use _check_sample_weight in linear_model/stochastic_gradient --- sklearn/linear_model/stochastic_gradient.py | 21 ++++----------------- sklearn/utils/validation.py | 2 +- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index 625bdb5bdc3f9..b47adcfac2e87 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -18,7 +18,7 @@ from ..utils import check_array, check_random_state, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import _check_partial_fit_first_call -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _check_sample_weight from ..exceptions import ConvergenceWarning from ..model_selection import StratifiedShuffleSplit, ShuffleSplit @@ -169,19 +169,6 @@ def _get_penalty_type(self, penalty): except KeyError: raise ValueError("Penalty %s is not supported. 
" % penalty) - def _validate_sample_weight(self, sample_weight, n_samples): - """Set the sample weight array.""" - if sample_weight is None: - # uniform sample weights - sample_weight = np.ones(n_samples, dtype=np.float64, order='C') - else: - # user-provided array - sample_weight = np.asarray(sample_weight, dtype=np.float64, - order="C") - if sample_weight.shape[0] != n_samples: - raise ValueError("Shapes of X and sample_weight do not match.") - return sample_weight - def _allocate_parameter_mem(self, n_classes, n_features, coef_init=None, intercept_init=None): """Allocate mem for parameters; initialize if provided.""" @@ -488,7 +475,7 @@ def _partial_fit(self, X, y, alpha, C, # Allocate datastructures from input arguments self._expanded_class_weight = compute_class_weight(self.class_weight, self.classes_, y) - sample_weight = self._validate_sample_weight(sample_weight, n_samples) + sample_weight = _check_sample_weight(sample_weight, y, order="C") if getattr(self, "coef_", None) is None or coef_init is not None: self._allocate_parameter_mem(n_classes, n_features, @@ -1095,9 +1082,9 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate, n_samples, n_features = X.shape - # Allocate datastructures from input arguments - sample_weight = self._validate_sample_weight(sample_weight, n_samples) + sample_weight = _check_sample_weight(sample_weight, y, order="C") + # Allocate datastructures from input arguments if getattr(self, "coef_", None) is None: self._allocate_parameter_mem(1, n_features, coef_init, intercept_init) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 0413c48daeec7..4ae108c0cade4 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1000,7 +1000,7 @@ def _check_sample_weight(sample_weight, y, **kwargs): validated sample weights """ if sample_weight is None or isinstance(sample_weight, numbers.Number): - sample_weight = np.ones_like(y) + sample_weight = np.ones(y.shape) else: sample_weight = 
check_array( sample_weight, accept_sparse=False, From c889db37fc015499a4d8d78e44dc7192800a2b1e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 11:17:09 +0200 Subject: [PATCH 03/34] Use check_sample weight in logistic --- sklearn/linear_model/logistic.py | 13 ++++--------- sklearn/linear_model/ransac.py | 5 ++--- sklearn/linear_model/ridge.py | 9 ++++----- sklearn/linear_model/sag.py | 5 +++-- sklearn/utils/validation.py | 28 ++++++++++++++++++++++------ 5 files changed, 35 insertions(+), 25 deletions(-) diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py index 5b9de7bc6e68c..56697206ace82 100644 --- a/sklearn/linear_model/logistic.py +++ b/sklearn/linear_model/logistic.py @@ -30,7 +30,7 @@ from ..utils.fixes import logsumexp from ..utils.optimize import newton_cg from ..utils.validation import check_X_y -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils import deprecated from ..exceptions import (ConvergenceWarning, ChangedBehaviorWarning) from ..utils.multiclass import check_classification_targets @@ -826,11 +826,8 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, # If sample weights exist, convert them to array (support for lists) # and check length # Otherwise set them to 1 for all examples - if sample_weight is not None: - sample_weight = np.array(sample_weight, dtype=X.dtype, order='C') - check_consistent_length(y, sample_weight) - else: - sample_weight = np.ones(X.shape[0], dtype=X.dtype) + sample_weight = _check_sample_weight(sample_weight, n_samples=X.shape[0], + dtype=X.dtype, order='C') # If class_weights is a dict (provided by the user), the weights # are assigned to the original labels. 
If it is "balanced", then @@ -1135,9 +1132,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, y_test = y[test] if sample_weight is not None: - sample_weight = check_array(sample_weight, ensure_2d=False) - check_consistent_length(y, sample_weight) - + sample_weight = _check_sample_weight(sample_weight, y) sample_weight = sample_weight[train] coefs, Cs, n_iter = _logistic_regression_path( diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py index 7f4fb650b59e8..b4f9cc03d9e4a 100644 --- a/sklearn/linear_model/ransac.py +++ b/sklearn/linear_model/ransac.py @@ -11,7 +11,7 @@ from ..base import MultiOutputMixin from ..utils import check_random_state, check_array, check_consistent_length from ..utils.random import sample_without_replacement -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _check_sample_weight from .base import LinearRegression from ..utils.validation import has_fit_parameter from ..exceptions import ConvergenceWarning @@ -324,8 +324,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("%s does not support sample_weight. Samples" " weights are only used for the calibration" " itself." 
% estimator_name) - if sample_weight is not None: - sample_weight = np.asarray(sample_weight) + sample_weight = _check_sample_weight(sample_weight, y) n_inliers_best = 1 score_best = -np.inf diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index 45862d5f3cffb..9db7bd41731a4 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -27,6 +27,7 @@ from ..utils import check_consistent_length from ..utils import compute_sample_weight from ..utils import column_or_1d +from ..utils.validation import _check_sample_weight from ..preprocessing import LabelBinarizer from ..model_selection import GridSearchCV from ..metrics.scorer import check_scoring @@ -428,8 +429,7 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', " %d != %d" % (n_samples, n_samples_)) if has_sw: - if np.atleast_1d(sample_weight).ndim > 1: - raise ValueError("Sample weights must be 1D array or scalar") + sample_weight = _check_sample_weight(sample_weight, y) if solver not in ['sag', 'saga']: # SAG supports sample_weight directly. For other solvers, @@ -1406,9 +1406,8 @@ def fit(self, X, y, sample_weight=None): "alphas must be positive. 
Got {} containing some " "negative or null value instead.".format(self.alphas)) - if sample_weight is not None and not isinstance(sample_weight, float): - sample_weight = check_array(sample_weight, ensure_2d=False, - dtype=X.dtype) + sample_weight = _check_sample_weight(sample_weight, y, dtype=X.dtype) + n_samples, n_features = X.shape X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data( diff --git a/sklearn/linear_model/sag.py b/sklearn/linear_model/sag.py index 233a6ed1c50af..eccc4ab99462e 100644 --- a/sklearn/linear_model/sag.py +++ b/sklearn/linear_model/sag.py @@ -12,6 +12,7 @@ from .sag_fast import sag32, sag64 from ..exceptions import ConvergenceWarning from ..utils import check_array +from ..utils.validation import _check_sample_weight from ..utils.extmath import row_norms @@ -251,8 +252,8 @@ def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., beta=0., n_classes = int(y.max()) + 1 if loss == 'multinomial' else 1 # initialization - if sample_weight is None: - sample_weight = np.ones(n_samples, dtype=X.dtype, order='C') + sample_weight = _check_sample_weight(sample_weight, n_samples=n_samples, + order='C') if 'coef' in warm_start_mem.keys(): coef_init = warm_start_mem['coef'] diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 4ae108c0cade4..887a9fe956bfa 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -982,7 +982,8 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def _check_sample_weight(sample_weight, y, **kwargs): +def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, + **kwargs): """Validate sample weights Parameters @@ -990,7 +991,11 @@ def _check_sample_weight(sample_weight, y, **kwargs): sample_weight : {ndarray, Number or None} input sample weights y: ndarray or None - target variable + target variable. Either y or n_samples must be provided. 
+ n_samples: int or None + expected number of samples. Either y or n_samples must be provided. + dtype: dtype + dtype of the validated sample_weight kwargs: additional parameters to pass to check_array @@ -999,13 +1004,24 @@ def _check_sample_weight(sample_weight, y, **kwargs): sample_weight : ndarray validated sample weights """ + if n_samples is not None and y is not None: + raise ValueError('Only one of y, n_samples must be provided!') + elif y is not None: + n_samples = y.shape[0] + if sample_weight is None or isinstance(sample_weight, numbers.Number): - sample_weight = np.ones(y.shape) + sample_weight = np.ones(n_samples, dtype=dtype) else: + if dtype is None: + dtype = [np.float64, np.float32] sample_weight = check_array( sample_weight, accept_sparse=False, - ensure_2d=False, dtype=[np.float64, np.float32], - **kwargs + ensure_2d=False, dtype=dtype, **kwargs ) - check_consistent_length(y, sample_weight) + if sample_weight.ndim != 1: + raise ValueError("Sample weights must be 1D array or scalar") + + if sample_weight.shape != (n_samples,): + raise ValueError("samples_weight.shape == {}, expected {}!" 
+ .format(sample_weight.shape, (n_samples,))) return sample_weight From bd52cfc1c0d04c8ef65ace8be0ff8be58b50ad5e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 11:21:10 +0200 Subject: [PATCH 04/34] Better order parameter --- sklearn/utils/validation.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 887a9fe956bfa..49f439c18962d 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -983,7 +983,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, - **kwargs): + order=None, **kwargs): """Validate sample weights Parameters @@ -996,6 +996,12 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, expected number of samples. Either y or n_samples must be provided. dtype: dtype dtype of the validated sample_weight + order : 'F', 'C' or None (default=None) + Whether an array will be forced to be fortran or c-style. + When order is None (default), if ``sample_weights`` is an ndarray, + nothing is ensured about the memory layout of the output array, + otherwise it will be of 'C' order by default. 
+ kwargs: additional parameters to pass to check_array @@ -1010,7 +1016,9 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, n_samples = y.shape[0] if sample_weight is None or isinstance(sample_weight, numbers.Number): - sample_weight = np.ones(n_samples, dtype=dtype) + if order is None: + order = 'C' + sample_weight = np.ones(n_samples, dtype=dtype, order=order) else: if dtype is None: dtype = [np.float64, np.float32] From 9e108a467ff442299ac0ba5ff5283c7fb47e6280 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 11:25:10 +0200 Subject: [PATCH 05/34] Sample weight checks in svm --- sklearn/svm/base.py | 10 ++++------ sklearn/utils/validation.py | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index 4a50ee479f030..2dbfad97eb774 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -8,7 +8,7 @@ from ..base import BaseEstimator, ClassifierMixin from ..preprocessing import LabelEncoder from ..utils.multiclass import _ovr_decision_function -from ..utils import check_array, check_consistent_length, check_random_state +from ..utils import check_array, check_random_state, _check_sample_weight from ..utils import column_or_1d, check_X_y from ..utils import compute_class_weight from ..utils.extmath import safe_sparse_dot @@ -906,11 +906,9 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, # LibLinear wants targets as doubles, even for classification y_ind = np.asarray(y_ind, dtype=np.float64).ravel() y_ind = np.require(y_ind, requirements="W") - if sample_weight is None: - sample_weight = np.ones(X.shape[0]) - else: - sample_weight = np.array(sample_weight, dtype=np.float64, order='C') - check_consistent_length(sample_weight, X) + + sample_weight = _check_sample_weight(sample_weight, n_samples=X.shape[0], + dtype=np.float64, order='C') solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual) raw_coef_, n_iter_ = 
liblinear.train_wrap( diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 49f439c18962d..5d91655ae307b 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1007,7 +1007,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, Parameters ---------- - sample_weight : ndarray + sample_weight : ndarray, shape=(n_samples,) validated sample weights """ if n_samples is not None and y is not None: From 4ff292d0fc9f9481e1e9106672baf44f986d442c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 11:39:42 +0200 Subject: [PATCH 06/34] Tests --- sklearn/svm/base.py | 3 ++- sklearn/utils/tests/test_validation.py | 29 +++++++++++++++++++++++++- sklearn/utils/validation.py | 2 +- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index 2dbfad97eb774..4f3b57a35c144 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -8,11 +8,12 @@ from ..base import BaseEstimator, ClassifierMixin from ..preprocessing import LabelEncoder from ..utils.multiclass import _ovr_decision_function -from ..utils import check_array, check_random_state, _check_sample_weight +from ..utils import check_array, check_random_state from ..utils import column_or_1d, check_X_y from ..utils import compute_class_weight from ..utils.extmath import safe_sparse_dot from ..utils.validation import check_is_fitted, _check_large_sparse +from ..utils.validation import _check_sample_weight from ..utils.multiclass import check_classification_targets from ..exceptions import ConvergenceWarning from ..exceptions import NotFittedError diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 0aa8eae22b1e2..c614019c548f3 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -20,6 +20,7 @@ from sklearn.utils.testing import SkipTest from sklearn.utils.testing import assert_array_equal from 
sklearn.utils.testing import assert_allclose_dense_sparse +from sklearn.utils.testing import assert_allclose from sklearn.utils import as_float_array, check_array, check_symmetric from sklearn.utils import check_X_y from sklearn.utils import deprecated @@ -39,7 +40,8 @@ check_memory, check_non_negative, _num_samples, - check_scalar) + check_scalar, + _check_sample_weight) import sklearn from sklearn.exceptions import NotFittedError @@ -853,3 +855,28 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, min_val=min_val, max_val=max_val) assert str(raised_error.value) == str(err_msg) assert type(raised_error.value) == type(err_msg) + + +def test_check_sample_weight(): + with pytest.raises(ValueError, + match="Only one of y, n_samples must be provided"): + _check_sample_weight(np.ones(3), y=np.ones(3), n_samples=3) + + # check order="C" parameter + sample_weight = np.ones(10)[::2] + assert not sample_weight.flags["C_CONTIGUOUS"] + sample_weight = _check_sample_weight(sample_weight, n_samples=5, order="C") + assert sample_weight.flags["C_CONTIGUOUS"] + + # check None input + sample_weight = _check_sample_weight(None, n_samples=5) + assert_allclose(sample_weight, np.ones(5)) + + # check numbers input + sample_weight = _check_sample_weight(2.0, n_samples=5) + assert_allclose(sample_weight, np.ones(5)) + + # check wrong number of dimensions + with pytest.raises(ValueError, + match="Sample weights must be 1D array or scalar"): + _check_sample_weight(np.ones((2, 4)), n_samples=5) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5d91655ae307b..f70ef5db5b8cb 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1024,7 +1024,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, dtype = [np.float64, np.float32] sample_weight = check_array( sample_weight, accept_sparse=False, - ensure_2d=False, dtype=dtype, **kwargs + ensure_2d=False, dtype=dtype, order=order, **kwargs ) if 
sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") From bb64a9b18e1f4b85b10eb5ee2d661210bdc0065d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 11:51:57 +0200 Subject: [PATCH 07/34] Minor fix --- sklearn/linear_model/sag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/sag.py b/sklearn/linear_model/sag.py index eccc4ab99462e..7bd2cfba24703 100644 --- a/sklearn/linear_model/sag.py +++ b/sklearn/linear_model/sag.py @@ -253,7 +253,7 @@ def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., beta=0., # initialization sample_weight = _check_sample_weight(sample_weight, n_samples=n_samples, - order='C') + dtype=X.dtype, order='C') if 'coef' in warm_start_mem.keys(): coef_init = warm_start_mem['coef'] From 29e4ff620c794f0ff7c253535c903ccc18f95995 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 12:57:33 +0200 Subject: [PATCH 08/34] Better handle numeric sample_weight --- sklearn/utils/tests/test_validation.py | 2 +- sklearn/utils/validation.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index c614019c548f3..1961b995f8f31 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -874,7 +874,7 @@ def test_check_sample_weight(): # check numbers input sample_weight = _check_sample_weight(2.0, n_samples=5) - assert_allclose(sample_weight, np.ones(5)) + assert_allclose(sample_weight, 2*np.ones(5)) # check wrong number of dimensions with pytest.raises(ValueError, diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index f70ef5db5b8cb..308611445805a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1018,7 +1018,11 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, if sample_weight is None or isinstance(sample_weight, numbers.Number): 
if order is None: order = 'C' - sample_weight = np.ones(n_samples, dtype=dtype, order=order) + if sample_weight is None: + sample_weight = np.ones(n_samples, dtype=dtype, order=order) + else: + sample_weight = np.full(n_samples, sample_weight, + dtype=dtype, order=order) else: if dtype is None: dtype = [np.float64, np.float32] From 4d7bb1550bba297c3f4e919d47777622b3368464 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 13:24:13 +0200 Subject: [PATCH 09/34] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 308611445805a..53c80e6d5346c 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -988,7 +988,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, Parameters ---------- - sample_weight : {ndarray, Number or None} + sample_weight : {ndarray, Number or None}, shape (n_samples,) input sample weights y: ndarray or None target variable. Either y or n_samples must be provided. From 1c0f6a764d7c1ca7b441a398b36fd5c9ebf53194 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 13:24:26 +0200 Subject: [PATCH 10/34] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 53c80e6d5346c..755f69186a5a1 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -989,7 +989,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, Parameters ---------- sample_weight : {ndarray, Number or None}, shape (n_samples,) - input sample weights + Input sample weights. y: ndarray or None target variable. Either y or n_samples must be provided. 
n_samples: int or None From 59abc05176b9aaea20f7c493e9362d2841b56462 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 13:24:38 +0200 Subject: [PATCH 11/34] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 755f69186a5a1..bf2772b7054af 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -990,7 +990,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, ---------- sample_weight : {ndarray, Number or None}, shape (n_samples,) Input sample weights. - y: ndarray or None + y : ndarray or None, shape (n_samples,) target variable. Either y or n_samples must be provided. n_samples: int or None expected number of samples. Either y or n_samples must be provided. From 84b0ac0bf8e1433d221ce2a79c572c52ff3d4be3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 13:24:54 +0200 Subject: [PATCH 12/34] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index bf2772b7054af..f690fb9bf3eed 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -991,7 +991,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, sample_weight : {ndarray, Number or None}, shape (n_samples,) Input sample weights. y : ndarray or None, shape (n_samples,) - target variable. Either y or n_samples must be provided. + Target variable. Either `y` or `n_samples` must be provided. n_samples: int or None expected number of samples. Either y or n_samples must be provided. 
dtype: dtype From 4ea0694f852c5e07094b7e760b0d5804bc129e5b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 13:25:05 +0200 Subject: [PATCH 13/34] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index f690fb9bf3eed..2514f7e40cea0 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -995,7 +995,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, n_samples: int or None expected number of samples. Either y or n_samples must be provided. dtype: dtype - dtype of the validated sample_weight + dtype of the validated `sample_weight`. order : 'F', 'C' or None (default=None) Whether an array will be forced to be fortran or c-style. When order is None (default), if ``sample_weights`` is an ndarray, From cfc7a97c836d9d8a166a7210c224dd7deb526303 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 13:25:17 +0200 Subject: [PATCH 14/34] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 2514f7e40cea0..b0885b7cfd0cd 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -992,7 +992,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, Input sample weights. y : ndarray or None, shape (n_samples,) Target variable. Either `y` or `n_samples` must be provided. - n_samples: int or None + n_samples : int or None expected number of samples. Either y or n_samples must be provided. dtype: dtype dtype of the validated `sample_weight`. 
From 908bbfc073bf641f4adb5078fd46a02b9e6c721b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 13:25:26 +0200 Subject: [PATCH 15/34] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index b0885b7cfd0cd..f55e63f5a5ac2 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1003,7 +1003,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, otherwise it will be of 'C' order by default. kwargs: - additional parameters to pass to check_array + Additional parameters to pass to `check_array` Parameters ---------- From c6280b679fc86ea0d1a0512e4117740b4d775dbd Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 13:25:37 +0200 Subject: [PATCH 16/34] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index f55e63f5a5ac2..2ce2e7aeb59d7 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1007,7 +1007,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, Parameters ---------- - sample_weight : ndarray, shape=(n_samples,) + sample_weight : ndarray, shape (n_samples,) validated sample weights """ if n_samples is not None and y is not None: From 2b84f90ef6fe6e55003257fece7384d36506bb54 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 13:26:02 +0200 Subject: [PATCH 17/34] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 2ce2e7aeb59d7..9b53c3b48b86b 100644 --- a/sklearn/utils/validation.py +++ 
b/sklearn/utils/validation.py @@ -1002,7 +1002,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, nothing is ensured about the memory layout of the output array, otherwise it will be of 'C' order by default. - kwargs: + kwargs : Additional parameters to pass to `check_array` Parameters From d81fec19c07daf856c20ec90a9e277a3d18ed638 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 13:26:12 +0200 Subject: [PATCH 18/34] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 9b53c3b48b86b..a70d58be41eaa 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1008,7 +1008,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, Parameters ---------- sample_weight : ndarray, shape (n_samples,) - validated sample weights + Validated sample weights. 
""" if n_samples is not None and y is not None: raise ValueError('Only one of y, n_samples must be provided!') From b2b1773b24df5368e03ba7134ae60dac21987adb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 16:49:46 +0200 Subject: [PATCH 19/34] Remove kwargs --- sklearn/utils/validation.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index a70d58be41eaa..3135f7d2c4efc 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -983,7 +983,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, - order=None, **kwargs): + order=None): """Validate sample weights Parameters @@ -1002,9 +1002,6 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, nothing is ensured about the memory layout of the output array, otherwise it will be of 'C' order by default. - kwargs : - Additional parameters to pass to `check_array` - Parameters ---------- sample_weight : ndarray, shape (n_samples,) @@ -1028,7 +1025,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, dtype = [np.float64, np.float32] sample_weight = check_array( sample_weight, accept_sparse=False, - ensure_2d=False, dtype=dtype, order=order, **kwargs + ensure_2d=False, dtype=dtype, order=order ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") From 22f9275bb6ca21c4829dad9bfee4535b8dd5491d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 17:43:53 +0200 Subject: [PATCH 20/34] Use _check_sample_weight(sample_weight, X) --- sklearn/linear_model/huber.py | 2 +- sklearn/linear_model/logistic.py | 4 ++-- sklearn/linear_model/ransac.py | 2 +- sklearn/linear_model/ridge.py | 2 +- sklearn/linear_model/sag.py | 2 +- sklearn/linear_model/stochastic_gradient.py | 4 ++-- sklearn/svm/base.py | 2 +- 
sklearn/utils/tests/test_validation.py | 13 +++++-------- sklearn/utils/validation.py | 13 ++++--------- 9 files changed, 18 insertions(+), 26 deletions(-) diff --git a/sklearn/linear_model/huber.py b/sklearn/linear_model/huber.py index 3225487df7dab..15d9415de0031 100644 --- a/sklearn/linear_model/huber.py +++ b/sklearn/linear_model/huber.py @@ -254,7 +254,7 @@ def fit(self, X, y, sample_weight=None): X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=[np.float64, np.float32]) - sample_weight = _check_sample_weight(sample_weight, y) + sample_weight = _check_sample_weight(sample_weight, X) if self.epsilon < 1.0: raise ValueError( diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py index 56697206ace82..8dffe84d5bb02 100644 --- a/sklearn/linear_model/logistic.py +++ b/sklearn/linear_model/logistic.py @@ -826,7 +826,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, # If sample weights exist, convert them to array (support for lists) # and check length # Otherwise set them to 1 for all examples - sample_weight = _check_sample_weight(sample_weight, n_samples=X.shape[0], + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, order='C') # If class_weights is a dict (provided by the user), the weights @@ -1132,7 +1132,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, y_test = y[test] if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, y) + sample_weight = _check_sample_weight(sample_weight, X) sample_weight = sample_weight[train] coefs, Cs, n_iter = _logistic_regression_path( diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py index b4f9cc03d9e4a..b901e848f49bf 100644 --- a/sklearn/linear_model/ransac.py +++ b/sklearn/linear_model/ransac.py @@ -324,7 +324,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("%s does not support sample_weight. 
Samples" " weights are only used for the calibration" " itself." % estimator_name) - sample_weight = _check_sample_weight(sample_weight, y) + sample_weight = _check_sample_weight(sample_weight, X) n_inliers_best = 1 score_best = -np.inf diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index 9db7bd41731a4..e111d2061514b 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -1406,7 +1406,7 @@ def fit(self, X, y, sample_weight=None): "alphas must be positive. Got {} containing some " "negative or null value instead.".format(self.alphas)) - sample_weight = _check_sample_weight(sample_weight, y, dtype=X.dtype) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) n_samples, n_features = X.shape diff --git a/sklearn/linear_model/sag.py b/sklearn/linear_model/sag.py index 7bd2cfba24703..0a092447a49b1 100644 --- a/sklearn/linear_model/sag.py +++ b/sklearn/linear_model/sag.py @@ -252,7 +252,7 @@ def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., beta=0., n_classes = int(y.max()) + 1 if loss == 'multinomial' else 1 # initialization - sample_weight = _check_sample_weight(sample_weight, n_samples=n_samples, + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, order='C') if 'coef' in warm_start_mem.keys(): diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index b47adcfac2e87..3b892ecec4e35 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -475,7 +475,7 @@ def _partial_fit(self, X, y, alpha, C, # Allocate datastructures from input arguments self._expanded_class_weight = compute_class_weight(self.class_weight, self.classes_, y) - sample_weight = _check_sample_weight(sample_weight, y, order="C") + sample_weight = _check_sample_weight(sample_weight, X, order="C") if getattr(self, "coef_", None) is None or coef_init is not None: self._allocate_parameter_mem(n_classes, 
n_features, @@ -1082,7 +1082,7 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate, n_samples, n_features = X.shape - sample_weight = _check_sample_weight(sample_weight, y, order="C") + sample_weight = _check_sample_weight(sample_weight, X, order="C") # Allocate datastructures from input arguments if getattr(self, "coef_", None) is None: diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index 4f3b57a35c144..f2ab0baaf3a70 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -908,7 +908,7 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, y_ind = np.asarray(y_ind, dtype=np.float64).ravel() y_ind = np.require(y_ind, requirements="W") - sample_weight = _check_sample_weight(sample_weight, n_samples=X.shape[0], + sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64, order='C') solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 1961b995f8f31..ecd25ffd63386 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -858,25 +858,22 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, def test_check_sample_weight(): - with pytest.raises(ValueError, - match="Only one of y, n_samples must be provided"): - _check_sample_weight(np.ones(3), y=np.ones(3), n_samples=3) - # check order="C" parameter sample_weight = np.ones(10)[::2] assert not sample_weight.flags["C_CONTIGUOUS"] - sample_weight = _check_sample_weight(sample_weight, n_samples=5, order="C") + sample_weight = _check_sample_weight(sample_weight, X=np.ones((5, 1)), + order="C") assert sample_weight.flags["C_CONTIGUOUS"] # check None input - sample_weight = _check_sample_weight(None, n_samples=5) + sample_weight = _check_sample_weight(None, X=np.ones((5, 2))) assert_allclose(sample_weight, np.ones(5)) # check numbers input - sample_weight = _check_sample_weight(2.0, 
n_samples=5) + sample_weight = _check_sample_weight(2.0, X=np.ones((5, 2))) assert_allclose(sample_weight, 2*np.ones(5)) # check wrong number of dimensions with pytest.raises(ValueError, match="Sample weights must be 1D array or scalar"): - _check_sample_weight(np.ones((2, 4)), n_samples=5) + _check_sample_weight(np.ones((2, 4)), X=np.ones((2, 2))) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 3135f7d2c4efc..c3cc1f6cb054f 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -982,7 +982,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, +def _check_sample_weight(sample_weight, X, dtype=None, order=None): """Validate sample weights @@ -990,10 +990,8 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, ---------- sample_weight : {ndarray, Number or None}, shape (n_samples,) Input sample weights. - y : ndarray or None, shape (n_samples,) - Target variable. Either `y` or `n_samples` must be provided. - n_samples : int or None - expected number of samples. Either y or n_samples must be provided. + X : nd-array, list or sparse matrix + Input data. dtype: dtype dtype of the validated `sample_weight`. order : 'F', 'C' or None (default=None) @@ -1007,10 +1005,7 @@ def _check_sample_weight(sample_weight, y=None, n_samples=None, dtype=None, sample_weight : ndarray, shape (n_samples,) Validated sample weights. 
""" - if n_samples is not None and y is not None: - raise ValueError('Only one of y, n_samples must be provided!') - elif y is not None: - n_samples = y.shape[0] + n_samples = _num_samples(X) if sample_weight is None or isinstance(sample_weight, numbers.Number): if order is None: From c28226a97c0d84f6b563a482d3319037f39235d3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 17:56:16 +0200 Subject: [PATCH 21/34] Update kmeans to use _check_sample_weight --- sklearn/cluster/k_means_.py | 34 +++++++++++++-------------- sklearn/cluster/tests/test_k_means.py | 11 +++++---- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 7520b6b6b6bd5..365c5a9a886f0 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -27,7 +27,7 @@ from ..utils import check_array from ..utils import gen_batches from ..utils import check_random_state -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.validation import FLOAT_DTYPES from ..exceptions import ConvergenceWarning from . import _k_means @@ -164,19 +164,19 @@ def _tolerance(X, tol): return np.mean(variances) * tol -def _check_sample_weight(X, sample_weight): +def _check_normalize_sample_weight(sample_weight, X): """Set sample_weight if None, and check for correct dtype""" - n_samples = X.shape[0] - if sample_weight is None: - return np.ones(n_samples, dtype=X.dtype) - else: - sample_weight = np.asarray(sample_weight) - if n_samples != len(sample_weight): - raise ValueError("n_samples=%d should be == len(sample_weight)=%d" - % (n_samples, len(sample_weight))) + + sample_weight_was_none = sample_weight is None + + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + if not sample_weight_was_none: # normalize the weights to sum up to n_samples + # an array of 1 (i.e. 
samples_weight is None) is already normalized + n_samples = len(sample_weight) scale = n_samples / sample_weight.sum() - return (sample_weight * scale).astype(X.dtype, copy=False) + sample_weight *= scale + return sample_weight def k_means(X, n_clusters, sample_weight=None, init='k-means++', @@ -434,7 +434,7 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, if verbose: print('Initialization complete') - checked_sample_weight = _check_sample_weight(X, sample_weight) + checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) centers, labels, n_iter = k_means_elkan(X, checked_sample_weight, n_clusters, centers, tol=tol, max_iter=max_iter, verbose=verbose) @@ -519,7 +519,7 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, """ random_state = check_random_state(random_state) - sample_weight = _check_sample_weight(X, sample_weight) + sample_weight = _check_normalize_sample_weight(sample_weight, X) best_labels, best_inertia, best_centers = None, None, None # init @@ -662,7 +662,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, Sum of squared distances of samples to their closest cluster center. 
""" n_samples = X.shape[0] - sample_weight = _check_sample_weight(X, sample_weight) + sample_weight = _check_normalize_sample_weight(sample_weight, X) # set the default value of centers to -1 to be able to detect any anomaly # easily labels = np.full(n_samples, -1, np.int32) @@ -1492,7 +1492,7 @@ def fit(self, X, y=None, sample_weight=None): raise ValueError("n_samples=%d should be >= n_clusters=%d" % (n_samples, self.n_clusters)) - sample_weight = _check_sample_weight(X, sample_weight) + sample_weight = _check_normalize_sample_weight(sample_weight, X) n_init = self.n_init if hasattr(self.init, '__array__'): @@ -1641,7 +1641,7 @@ def _labels_inertia_minibatch(self, X, sample_weight): """ if self.verbose: print('Computing label assignment and total inertia') - sample_weight = _check_sample_weight(X, sample_weight) + sample_weight = _check_normalize_sample_weight(sample_weight, X) x_squared_norms = row_norms(X, squared=True) slices = gen_batches(X.shape[0], self.batch_size) results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s], @@ -1675,7 +1675,7 @@ def partial_fit(self, X, y=None, sample_weight=None): if n_samples == 0: return self - sample_weight = _check_sample_weight(X, sample_weight) + sample_weight = _check_normalize_sample_weight(sample_weight, X) x_squared_norms = row_norms(X, squared=True) self.random_state_ = getattr(self, "random_state_", diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 4fca8f621e141..5b8f086a89948 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -909,14 +909,15 @@ def test_sample_weight_length(): # check that an error is raised when passing sample weights # with an incompatible shape km = KMeans(n_clusters=n_clusters, random_state=42) - assert_raises_regex(ValueError, r'len\(sample_weight\)', km.fit, X, - sample_weight=np.ones(2)) + msg = r'samples_weight.shape == \(2,\), expected \(100,\)' + with pytest.raises(ValueError, 
match=msg): + km.fit(X, sample_weight=np.ones(2)) -def test_check_sample_weight(): - from sklearn.cluster.k_means_ import _check_sample_weight +def test_check_normalize_sample_weight(): + from sklearn.cluster.k_means_ import _check_normalize_sample_weight sample_weight = None - checked_sample_weight = _check_sample_weight(X, sample_weight) + checked_sample_weight = _check_normalize_sample_weight(sample_weight, X) assert _num_samples(X) == _num_samples(checked_sample_weight) assert_almost_equal(checked_sample_weight.sum(), _num_samples(X)) assert X.dtype == checked_sample_weight.dtype From ed2dc698ba862bfb2420fc055f555155f2a0b4e9 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 12 Jul 2019 18:43:24 +0200 Subject: [PATCH 22/34] Non float dtype should not be supported --- sklearn/utils/tests/test_validation.py | 5 +++++ sklearn/utils/validation.py | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index ecd25ffd63386..fa34b1e00667f 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -877,3 +877,8 @@ def test_check_sample_weight(): with pytest.raises(ValueError, match="Sample weights must be 1D array or scalar"): _check_sample_weight(np.ones((2, 4)), X=np.ones((2, 2))) + + # int dtype will be converted to float64 instead + X = np.ones((5, 2), dtype=np.int) + sample_weight = _check_sample_weight(None, X, dtype=X.dtype) + assert sample_weight.dtype == np.float64 diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index c3cc1f6cb054f..8b3ae72ddd711 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -993,7 +993,8 @@ def _check_sample_weight(sample_weight, X, dtype=None, X : nd-array, list or sparse matrix Input data. dtype: dtype - dtype of the validated `sample_weight`. + dtype of the validated `sample_weight`. 
Note that if `dtype` is not + one of `float32`, `float64`, the output will be of dtype `float64`. order : 'F', 'C' or None (default=None) Whether an array will be forced to be fortran or c-style. When order is None (default), if ``sample_weights`` is an ndarray, @@ -1007,6 +1008,9 @@ def _check_sample_weight(sample_weight, X, dtype=None, """ n_samples = _num_samples(X) + if dtype is not None and dtype not in [np.float32, np.float64]: + dtype = np.float64 + if sample_weight is None or isinstance(sample_weight, numbers.Number): if order is None: order = 'C' From 380d9eb95efadb19af411fa7e59bb6f12bda0a96 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 15 Jul 2019 13:00:42 +0200 Subject: [PATCH 23/34] Drop order parameter + address review comments --- sklearn/linear_model/stochastic_gradient.py | 4 ++-- sklearn/svm/base.py | 2 +- sklearn/utils/tests/test_validation.py | 3 +-- sklearn/utils/validation.py | 24 ++++++++------------- 4 files changed, 13 insertions(+), 20 deletions(-) diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index 3b892ecec4e35..25b63a3a0cdce 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -475,7 +475,7 @@ def _partial_fit(self, X, y, alpha, C, # Allocate datastructures from input arguments self._expanded_class_weight = compute_class_weight(self.class_weight, self.classes_, y) - sample_weight = _check_sample_weight(sample_weight, X, order="C") + sample_weight = _check_sample_weight(sample_weight, X) if getattr(self, "coef_", None) is None or coef_init is not None: self._allocate_parameter_mem(n_classes, n_features, @@ -1082,7 +1082,7 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate, n_samples, n_features = X.shape - sample_weight = _check_sample_weight(sample_weight, X, order="C") + sample_weight = _check_sample_weight(sample_weight, X) # Allocate datastructures from input arguments if getattr(self, "coef_", None) 
is None: diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index f2ab0baaf3a70..e27abeed7ecee 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -909,7 +909,7 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, y_ind = np.require(y_ind, requirements="W") sample_weight = _check_sample_weight(sample_weight, X, - dtype=np.float64, order='C') + dtype=np.float64) solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual) raw_coef_, n_iter_ = liblinear.train_wrap( diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index fa34b1e00667f..f30e1a1b2ada7 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -861,8 +861,7 @@ def test_check_sample_weight(): # check order="C" parameter sample_weight = np.ones(10)[::2] assert not sample_weight.flags["C_CONTIGUOUS"] - sample_weight = _check_sample_weight(sample_weight, X=np.ones((5, 1)), - order="C") + sample_weight = _check_sample_weight(sample_weight, X=np.ones((5, 1))) assert sample_weight.flags["C_CONTIGUOUS"] # check None input diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 8b3ae72ddd711..931be8f4192d6 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -982,29 +982,25 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): raise ValueError('`{}`= {}, must be <= {}.'.format(name, x, max_val)) -def _check_sample_weight(sample_weight, X, dtype=None, - order=None): +def _check_sample_weight(sample_weight, X, dtype=None): """Validate sample weights Parameters ---------- sample_weight : {ndarray, Number or None}, shape (n_samples,) Input sample weights. + X : nd-array, list or sparse matrix Input data. + dtype: dtype dtype of the validated `sample_weight`. Note that if `dtype` is not one of `float32`, `float64`, the output will be of dtype `float64`. 
- order : 'F', 'C' or None (default=None) - Whether an array will be forced to be fortran or c-style. - When order is None (default), if ``sample_weights`` is an ndarray, - nothing is ensured about the memory layout of the output array, - otherwise it will be of 'C' order by default. - Parameters - ---------- + Returns + ------- sample_weight : ndarray, shape (n_samples,) - Validated sample weights. + Validated sample weights. They are guaranteed to be "C" contiguous. """ n_samples = _num_samples(X) @@ -1012,19 +1008,17 @@ def _check_sample_weight(sample_weight, X, dtype=None, dtype = np.float64 if sample_weight is None or isinstance(sample_weight, numbers.Number): - if order is None: - order = 'C' if sample_weight is None: - sample_weight = np.ones(n_samples, dtype=dtype, order=order) + sample_weight = np.ones(n_samples, dtype=dtype) else: sample_weight = np.full(n_samples, sample_weight, - dtype=dtype, order=order) + dtype=dtype) else: if dtype is None: dtype = [np.float64, np.float32] sample_weight = check_array( sample_weight, accept_sparse=False, - ensure_2d=False, dtype=dtype, order=order + ensure_2d=False, dtype=dtype, order="C" ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") From 3fa5f73c1a5540ff85bd55450744cd6b67f0fb56 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 15 Jul 2019 13:02:45 +0200 Subject: [PATCH 24/34] Fix typo --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 931be8f4192d6..8a4462e371a33 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1000,7 +1000,7 @@ def _check_sample_weight(sample_weight, X, dtype=None): Returns ------- sample_weight : ndarray, shape (n_samples,) - Validated sample weights. They are guaranteed to be "C" contiguous. + Validated sample weight. It is guaranteed to be "C" contiguous. 
""" n_samples = _num_samples(X) From 08e204fa9ece0155919bee9a00713c17a642020a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 15 Jul 2019 14:07:40 +0200 Subject: [PATCH 25/34] Fix tests --- sklearn/linear_model/logistic.py | 2 +- sklearn/linear_model/sag.py | 3 +-- sklearn/utils/tests/test_validation.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py index c087fb7038c0c..10a4d32e51275 100644 --- a/sklearn/linear_model/logistic.py +++ b/sklearn/linear_model/logistic.py @@ -827,7 +827,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, # and check length # Otherwise set them to 1 for all examples sample_weight = _check_sample_weight(sample_weight, X, - dtype=X.dtype, order='C') + dtype=X.dtype) # If class_weights is a dict (provided by the user), the weights # are assigned to the original labels. If it is "balanced", then diff --git a/sklearn/linear_model/sag.py b/sklearn/linear_model/sag.py index 0a092447a49b1..fa02c7a4a0ef8 100644 --- a/sklearn/linear_model/sag.py +++ b/sklearn/linear_model/sag.py @@ -252,8 +252,7 @@ def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., beta=0., n_classes = int(y.max()) + 1 if loss == 'multinomial' else 1 # initialization - sample_weight = _check_sample_weight(sample_weight, X, - dtype=X.dtype, order='C') + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) if 'coef' in warm_start_mem.keys(): coef_init = warm_start_mem['coef'] diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index f30e1a1b2ada7..2ef58aa8df4a6 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -858,7 +858,7 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val, def test_check_sample_weight(): - # check order="C" parameter + # check array order sample_weight = np.ones(10)[::2] assert not 
sample_weight.flags["C_CONTIGUOUS"] sample_weight = _check_sample_weight(sample_weight, X=np.ones((5, 1))) From 44d99c1c06d6bddf9e4dcc0c9845fba184ffbd87 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 15 Jul 2019 15:08:26 +0200 Subject: [PATCH 26/34] Address review --- sklearn/linear_model/ridge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index e111d2061514b..4600b56f8d89d 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -429,7 +429,7 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', " %d != %d" % (n_samples, n_samples_)) if has_sw: - sample_weight = _check_sample_weight(sample_weight, y) + sample_weight = _check_sample_weight(sample_weight, X) if solver not in ['sag', 'saga']: # SAG supports sample_weight directly. For other solvers, From 3fc9d1aaf9b715c48600e465147a5b76b3510265 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 10:43:13 +0200 Subject: [PATCH 27/34] Attempt to fix 32bit / 64bit comparison test --- sklearn/linear_model/ridge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index 4600b56f8d89d..cc3b6a518add5 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -429,7 +429,7 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', " %d != %d" % (n_samples, n_samples_)) if has_sw: - sample_weight = _check_sample_weight(sample_weight, X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) if solver not in ['sag', 'saga']: # SAG supports sample_weight directly. 
For other solvers, From 22e107036d793b3d8c8be06b8fe598ed91865ebf Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 10:48:50 +0200 Subject: [PATCH 28/34] Check that float32 dtype is preserved --- sklearn/utils/tests/test_validation.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 2ef58aa8df4a6..e932e5cdaf7f7 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -877,6 +877,12 @@ def test_check_sample_weight(): match="Sample weights must be 1D array or scalar"): _check_sample_weight(np.ones((2, 4)), X=np.ones((2, 2))) + # float32 dtype is preserved + X = np.ones((5, 2)) + sample_weight = np.ones(5, dtype=np.float32) + sample_weight = _check_sample_weight(sample_weight, X) + assert sample_weight.dtype == np.float32 + # int dtype will be converted to float64 instead X = np.ones((5, 2), dtype=np.int) sample_weight = _check_sample_weight(None, X, dtype=X.dtype) From 561bb6ae6905d22cfa56980f4361de17d178185a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 18 Jul 2019 16:27:22 +0200 Subject: [PATCH 29/34] Update sklearn/utils/validation.py Co-Authored-By: Guillaume Lemaitre --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 8a4462e371a33..4aefcf82ec247 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -983,7 +983,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): def _check_sample_weight(sample_weight, X, dtype=None): - """Validate sample weights + """Validate sample weights. 
Parameters ---------- From 71ecf65ec85638f7819a64d894a01acaf69867cc Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 18 Jul 2019 16:29:46 +0200 Subject: [PATCH 30/34] Lint --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 4aefcf82ec247..d130a6083c143 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -983,7 +983,7 @@ def check_scalar(x, name, target_type, min_val=None, max_val=None): def _check_sample_weight(sample_weight, X, dtype=None): - """Validate sample weights. + """Validate sample weights. Parameters ---------- From e244ad5a75349cc64fa8f2f5f74771f4a97a17b0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 19 Jul 2019 10:14:53 +0200 Subject: [PATCH 31/34] Update sklearn/utils/validation.py Co-Authored-By: Nicolas Hug --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index d130a6083c143..3a79754513237 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1024,6 +1024,6 @@ def _check_sample_weight(sample_weight, X, dtype=None): raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (n_samples,): - raise ValueError("samples_weight.shape == {}, expected {}!" + raise ValueError("sample_weight.shape == {}, expected {}!" 
.format(sample_weight.shape, (n_samples,))) return sample_weight From 13f9decda73f91453d94699f41fa8ac44cb46513 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 19 Jul 2019 10:22:41 +0200 Subject: [PATCH 32/34] Improve docstring wording --- sklearn/utils/validation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 3a79754513237..abf51eef8f487 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -994,8 +994,11 @@ def _check_sample_weight(sample_weight, X, dtype=None): Input data. dtype: dtype - dtype of the validated `sample_weight`. Note that if `dtype` is not - one of `float32`, `float64`, the output will be of dtype `float64`. + dtype of the validated `sample_weight`. + If None, and the input `sample_weight` is an array, the dtype of the + input is preserved; otherwise an array with the default numpy dtype + is allocated. If `dtype` is not one of `float32`, `float64`, + `None`, the output will be of dtype `float64`. 
Returns ------- From 9cccaf609be562b3b7445d7ea9d2df282a22aed8 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 19 Jul 2019 10:42:36 +0200 Subject: [PATCH 33/34] Fix tests --- sklearn/cluster/tests/test_k_means.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 3043bfd238d92..362b0a9145fca 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -909,7 +909,7 @@ def test_sample_weight_length(): # check that an error is raised when passing sample weights # with an incompatible shape km = KMeans(n_clusters=n_clusters, random_state=42) - msg = r'samples_weight.shape == \(2,\), expected \(100,\)' + msg = r'sample_weight.shape == \(2,\), expected \(100,\)' with pytest.raises(ValueError, match=msg): km.fit(X, sample_weight=np.ones(2)) From fb22cfc9dc0870a6269c247e6a97b992c03b4fc2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 19 Jul 2019 16:56:59 +0200 Subject: [PATCH 34/34] Nicolas's comments --- sklearn/utils/tests/test_validation.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index e932e5cdaf7f7..2789a59344008 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -870,13 +870,18 @@ def test_check_sample_weight(): # check numbers input sample_weight = _check_sample_weight(2.0, X=np.ones((5, 2))) - assert_allclose(sample_weight, 2*np.ones(5)) + assert_allclose(sample_weight, 2 * np.ones(5)) # check wrong number of dimensions with pytest.raises(ValueError, match="Sample weights must be 1D array or scalar"): _check_sample_weight(np.ones((2, 4)), X=np.ones((2, 2))) + # check incorrect n_samples + msg = r"sample_weight.shape == \(4,\), expected \(2,\)!" 
+ with pytest.raises(ValueError, match=msg): + _check_sample_weight(np.ones(4), X=np.ones((2, 2))) + # float32 dtype is preserved X = np.ones((5, 2)) sample_weight = np.ones(5, dtype=np.float32)