From ac19cd65239f7da3fbbc90b8800d9511f8ffa6aa Mon Sep 17 00:00:00 2001
From: Maxwell Aladago <maxwell.aladago@gmail.com>
Date: Thu, 15 Aug 2019 17:03:43 -0400
Subject: [PATCH 1/5] tests for stronger validation for linear regression
 weights

---
 sklearn/linear_model/base.py            |  6 ++---
 sklearn/linear_model/tests/test_base.py | 34 +++++++++++++++++++++++
 sklearn/utils/validation.py             | 36 +++++++++++++------------
 3 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index d2af98d07ac09..95554fa9aa37a 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -27,7 +27,7 @@
 from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin,
                     MultiOutputMixin)
 from ..utils import check_array, check_X_y
-from ..utils.validation import FLOAT_DTYPES
+from ..utils.validation import FLOAT_DTYPES, _check_sample_weight
 from ..utils import check_random_state
 from ..utils.extmath import safe_sparse_dot
 from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
@@ -467,8 +467,8 @@ def fit(self, X, y, sample_weight=None):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                          y_numeric=True, multi_output=True)
 
-        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
-            raise ValueError("Sample weights must be 1D array or scalar")
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X, X.dtype)
 
         X, y, X_offset, y_offset, X_scale = self._preprocess_data(
             X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py
index 2836a63859a10..95bccffa5c3f7 100644
--- a/sklearn/linear_model/tests/test_base.py
+++ b/sklearn/linear_model/tests/test_base.py
@@ -93,6 +93,40 @@ def test_linear_regression_sample_weights():
                 assert_almost_equal(inter1, coefs2[0])
 
 
+def test_sample_weights():
+    X = [[1], [2]]
+    Y = [1, 2]
+
+    reg = LinearRegression()
+    reg.fit(X, Y)
+    # test should pass if sample weight is None
+    assert_array_almost_equal(reg.coef_, [0])
+    assert_array_almost_equal(reg.intercept_, [0])
+    assert_array_almost_equal(reg.predict(X), [0])
+
+    # improperly defined sample weight
+    sample_weight = ["1", "pi"]
+    np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight)
+
+    # sample weights which can be converted to numeric types passes
+    # if this behavior is not desirable, we can raise an exception
+    sample_weight = ["1", "1"]
+    reg.fit(X, Y, sample_weight)
+    assert_array_almost_equal(reg.coef_, [0])
+    assert_array_almost_equal(reg.intercept_, [0])
+    assert_array_almost_equal(reg.predict(X), [0])
+
+    # sample wight with few samples than X should raise an ValueError
+    sample_weight = [2]
+    reg.fit(X, Y, sample_weight)
+    np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight)
+
+    # sample weights greater than 1D raises ValueError
+    sample_weight = [[1, 1]]
+    reg.fit(X, Y, sample_weight)
+    np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight)
+
+
 def test_raises_value_error_if_sample_weights_greater_than_1d():
     # Sample weights must be either scalar or 1D
 
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 5de790701aaaa..d615c7137455e 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1025,27 +1025,29 @@ def _check_sample_weight(sample_weight, X, dtype=None):
     """
     n_samples = _num_samples(X)
 
-    if dtype is not None and dtype not in [np.float32, np.float64]:
+    if dtype is  None:
+        dtype = [np.float64, np.float32]
+    elif dtype not in [np.float32, np.float64]:
         dtype = np.float64
 
-    if sample_weight is None or isinstance(sample_weight, numbers.Number):
-        if sample_weight is None:
-            sample_weight = np.ones(n_samples, dtype=dtype)
-        else:
-            sample_weight = np.full(n_samples, sample_weight,
+    if sample_weight is None:
+        sample_weight = np.ones(n_samples, dtype=dtype)
+    elif isinstance(sample_weight, numbers.Number):
+        sample_weight = np.full(n_samples, sample_weight,
                                     dtype=dtype)
     else:
-        if dtype is None:
-            dtype = [np.float64, np.float32]
-        sample_weight = check_array(
-                sample_weight, accept_sparse=False,
-                ensure_2d=False, dtype=dtype, order="C"
-        )
-        if sample_weight.ndim != 1:
-            raise ValueError("Sample weights must be 1D array or scalar")
-
-        if sample_weight.shape != (n_samples,):
-            raise ValueError("sample_weight.shape == {}, expected {}!"
+        try:
+            sample_weight = np.array(sample_weight, dtype=dtype)
+        except ValueError as e:
+            e.args = e.args[0] + ". sample weights must a scalar or" \
+                              " a 1D array numeric types"
+            raise
+
+    if sample_weight.ndim != 1:
+        raise ValueError("Sample weights must be 1D array or scalar")
+
+    if sample_weight.shape != (n_samples,):
+        raise ValueError("sample_weight.shape == {}, expected {}!"
                              .format(sample_weight.shape, (n_samples,)))
     return sample_weight
 

From 4c25185cc43c12a900026d34972ebc9561e2042b Mon Sep 17 00:00:00 2001
From: Maxwell Aladago <maxwell.aladago@gmail.com>
Date: Wed, 21 Aug 2019 03:38:56 -0400
Subject: [PATCH 2/5] refactored validation and tested against all estimators

---
 sklearn/ensemble/forest.py              |  4 +--
 sklearn/linear_model/tests/test_base.py | 34 -------------------------
 sklearn/utils/estimator_checks.py       | 11 ++++++++
 sklearn/utils/validation.py             | 23 +++++++----------
 4 files changed, 22 insertions(+), 50 deletions(-)

diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
index 911b16b67df5f..15fef6214cbab 100644
--- a/sklearn/ensemble/forest.py
+++ b/sklearn/ensemble/forest.py
@@ -60,7 +60,7 @@ class calls the ``fit`` method of each sub-estimator on random samples
 from .base import BaseEnsemble, _partition_estimators
 from ..utils.fixes import parallel_helper, _joblib_parallel_args
 from ..utils.multiclass import check_classification_targets
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, _check_sample_weight
 
 
 __all__ = ["RandomForestClassifier",
@@ -243,7 +243,7 @@ def fit(self, X, y, sample_weight=None):
         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
         if sample_weight is not None:
-            sample_weight = check_array(sample_weight, ensure_2d=False)
+            sample_weight = _check_sample_weight(sample_weight, X)
         if issparse(X):
             # Pre-sort indices to avoid that each individual tree of the
             # ensemble sorts the indices.
diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py
index 95bccffa5c3f7..2836a63859a10 100644
--- a/sklearn/linear_model/tests/test_base.py
+++ b/sklearn/linear_model/tests/test_base.py
@@ -93,40 +93,6 @@ def test_linear_regression_sample_weights():
                 assert_almost_equal(inter1, coefs2[0])
 
 
-def test_sample_weights():
-    X = [[1], [2]]
-    Y = [1, 2]
-
-    reg = LinearRegression()
-    reg.fit(X, Y)
-    # test should pass if sample weight is None
-    assert_array_almost_equal(reg.coef_, [0])
-    assert_array_almost_equal(reg.intercept_, [0])
-    assert_array_almost_equal(reg.predict(X), [0])
-
-    # improperly defined sample weight
-    sample_weight = ["1", "pi"]
-    np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight)
-
-    # sample weights which can be converted to numeric types passes
-    # if this behavior is not desirable, we can raise an exception
-    sample_weight = ["1", "1"]
-    reg.fit(X, Y, sample_weight)
-    assert_array_almost_equal(reg.coef_, [0])
-    assert_array_almost_equal(reg.intercept_, [0])
-    assert_array_almost_equal(reg.predict(X), [0])
-
-    # sample wight with few samples than X should raise an ValueError
-    sample_weight = [2]
-    reg.fit(X, Y, sample_weight)
-    np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight)
-
-    # sample weights greater than 1D raises ValueError
-    sample_weight = [[1, 1]]
-    reg.fit(X, Y, sample_weight)
-    np.testing.assert_raises(ValueError, reg.fit, X, Y, sample_weight)
-
-
 def test_raises_value_error_if_sample_weights_greater_than_1d():
     # Sample weights must be either scalar or 1D
 
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 23109e32a4549..acd76199887f7 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -11,6 +11,7 @@
 from scipy import sparse
 from scipy.stats import rankdata
 import joblib
+import pytest
 
 from . import IS_PYPY
 from .testing import assert_raises, _get_args
@@ -636,6 +637,16 @@ def check_sample_weights_invariance(name, estimator_orig):
                       1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int'))
         y = enforce_estimator_tags_y(estimator1, y)
 
+        # sample weights greater than 1D raises ValueError
+        sample_weight = [[1, 2]]
+        with pytest.raises(ValueError):
+            estimator2.fit(X, y, sample_weight=sample_weight)
+
+        # sample wight with few samples than X raises ValueError
+        sample_weight = [[1, 1]]
+        with pytest.raises(ValueError):
+            estimator2.fit(X, y, sample_weight=sample_weight)
+
         estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y)))
         estimator2.fit(X, y=y, sample_weight=None)
 
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index d615c7137455e..13e60aaffc371 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1025,23 +1025,18 @@ def _check_sample_weight(sample_weight, X, dtype=None):
     """
     n_samples = _num_samples(X)
 
-    if dtype is  None:
-        dtype = [np.float64, np.float32]
-    elif dtype not in [np.float32, np.float64]:
+    if dtype not in [np.float32, np.float64]:
         dtype = np.float64
 
-    if sample_weight is None:
-        sample_weight = np.ones(n_samples, dtype=dtype)
-    elif isinstance(sample_weight, numbers.Number):
-        sample_weight = np.full(n_samples, sample_weight,
+    if sample_weight is None or isinstance(sample_weight, numbers.Number):
+        if sample_weight is None:
+            sample_weight = np.ones(n_samples, dtype=dtype)
+        elif isinstance(sample_weight, numbers.Number):
+            sample_weight = np.full(n_samples, sample_weight,
                                     dtype=dtype)
-    else:
-        try:
-            sample_weight = np.array(sample_weight, dtype=dtype)
-        except ValueError as e:
-            e.args = e.args[0] + ". sample weights must a scalar or" \
-                              " a 1D array numeric types"
-            raise
+        return sample_weight
+
+    sample_weight = np.array(sample_weight, dtype=dtype)
 
     if sample_weight.ndim != 1:
         raise ValueError("Sample weights must be 1D array or scalar")

From 7b807785b7f863cb4171cbbb070b493d91296937 Mon Sep 17 00:00:00 2001
From: Maxwell Aladago <maxwell.aladago@gmail.com>
Date: Wed, 21 Aug 2019 08:46:12 -0400
Subject: [PATCH 3/5] added checks to calibration.py

---
 sklearn/calibration.py       | 8 ++++----
 sklearn/linear_model/base.py | 6 +++---
 sklearn/utils/validation.py  | 3 +++
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
index b88a8b8eb37ef..6568c83a53526 100644
--- a/sklearn/calibration.py
+++ b/sklearn/calibration.py
@@ -23,6 +23,7 @@
 from .preprocessing import label_binarize, LabelBinarizer
 from .utils import check_X_y, check_array, indexable, column_or_1d
 from .utils.validation import check_is_fitted, check_consistent_length
+from .utils.validation import _check_sample_weight
 from .isotonic import IsotonicRegression
 from .svm import LinearSVC
 from .model_selection import check_cv
@@ -155,6 +156,9 @@ def fit(self, X, y, sample_weight=None):
         else:
             base_estimator = self.base_estimator
 
+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(sample_weight, X)
+
         if self.cv == "prefit":
             calibrated_classifier = _CalibratedClassifier(
                 base_estimator, method=self.method)
@@ -172,12 +176,8 @@ def fit(self, X, y, sample_weight=None):
                 warnings.warn("%s does not support sample_weight. Samples"
                               " weights are only used for the calibration"
                               " itself." % estimator_name)
-                sample_weight = check_array(sample_weight, ensure_2d=False)
                 base_estimator_sample_weight = None
             else:
-                if sample_weight is not None:
-                    sample_weight = check_array(sample_weight, ensure_2d=False)
-                    check_consistent_length(y, sample_weight)
                 base_estimator_sample_weight = sample_weight
             for train, test in cv.split(X, y):
                 this_estimator = clone(base_estimator)
diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index 95554fa9aa37a..4cc7af51c243f 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -34,7 +34,7 @@
 from ..utils.fixes import sparse_lsqr
 from ..utils.seq_dataset import ArrayDataset32, CSRDataset32
 from ..utils.seq_dataset import ArrayDataset64, CSRDataset64
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted,  _check_sample_weight
 from ..preprocessing.data import normalize as f_normalize
 
 # TODO: bayesian_ridge_regression and bayesian_regression_ard
@@ -118,8 +118,8 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
     centered. This function also systematically makes y consistent with X.dtype
     """
 
-    if isinstance(sample_weight, numbers.Number):
-        sample_weight = None
+    if sample_weight is not None:
+        sample_weight = _check_sample_weight(sample_weight, X, np.float64)
 
     if check_input:
         X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 13e60aaffc371..afd9339f4c6c8 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1025,6 +1025,9 @@ def _check_sample_weight(sample_weight, X, dtype=None):
     """
     n_samples = _num_samples(X)
 
+    if hasattr(sample_weight, "dtype"):
+        dtype = sample_weight.dtype
+
     if dtype not in [np.float32, np.float64]:
         dtype = np.float64
 

From e679df87e51d8c8b3ce7dccf7ea11912f5bcc2ae Mon Sep 17 00:00:00 2001
From: Maxwell Aladago <maxwell.aladago@gmail.com>
Date: Wed, 21 Aug 2019 08:50:45 -0400
Subject: [PATCH 4/5] flake8 issue

---
 sklearn/linear_model/base.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index 4cc7af51c243f..e77c05e17d8d0 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -14,7 +14,6 @@
 # License: BSD 3 clause
 
 from abc import ABCMeta, abstractmethod
-import numbers
 import warnings
 
 import numpy as np
@@ -27,7 +26,7 @@
 from ..base import (BaseEstimator, ClassifierMixin, RegressorMixin,
                     MultiOutputMixin)
 from ..utils import check_array, check_X_y
-from ..utils.validation import FLOAT_DTYPES, _check_sample_weight
+from ..utils.validation import FLOAT_DTYPES
 from ..utils import check_random_state
 from ..utils.extmath import safe_sparse_dot
 from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale

From bdeb2c21e91cfdadccbf9f5ac3d9ccc674609c4d Mon Sep 17 00:00:00 2001
From: Maxwell Aladago <maxwell.aladago@gmail.com>
Date: Fri, 23 Aug 2019 08:50:31 -0400
Subject: [PATCH 5/5] removing tests for all estimators

---
 sklearn/linear_model/base.py           |  2 +-
 sklearn/utils/estimator_checks.py      | 11 -----------
 sklearn/utils/tests/test_validation.py |  6 ++++++
 sklearn/utils/validation.py            | 20 ++++++++++++++++++--
 4 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index e77c05e17d8d0..8b651be1e709c 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -118,7 +118,7 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
     """
 
     if sample_weight is not None:
-        sample_weight = _check_sample_weight(sample_weight, X, np.float64)
+        sample_weight = _check_sample_weight(sample_weight, X)
 
     if check_input:
         X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index acd76199887f7..23109e32a4549 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -11,7 +11,6 @@
 from scipy import sparse
 from scipy.stats import rankdata
 import joblib
-import pytest
 
 from . import IS_PYPY
 from .testing import assert_raises, _get_args
@@ -637,16 +636,6 @@ def check_sample_weights_invariance(name, estimator_orig):
                       1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int'))
         y = enforce_estimator_tags_y(estimator1, y)
 
-        # sample weights greater than 1D raises ValueError
-        sample_weight = [[1, 2]]
-        with pytest.raises(ValueError):
-            estimator2.fit(X, y, sample_weight=sample_weight)
-
-        # sample wight with few samples than X raises ValueError
-        sample_weight = [[1, 1]]
-        with pytest.raises(ValueError):
-            estimator2.fit(X, y, sample_weight=sample_weight)
-
         estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y)))
         estimator2.fit(X, y=y, sample_weight=None)
 
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 0f7ffe9a3e4f0..f2113208ab51e 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -912,6 +912,12 @@ def test_check_sample_weight():
     sample_weight = _check_sample_weight(None, X, dtype=X.dtype)
     assert sample_weight.dtype == np.float64
 
+    # wrongly formated sample_weight
+    sample_weight = np.array(["1", "pi", "e"])
+    err_msg = "could not convert string to float: 'pi'"
+    with pytest.raises(ValueError, match=err_msg):
+        _check_sample_weight(sample_weight, X)
+
 
 @pytest.mark.parametrize("toarray", [
     np.array, sp.csr_matrix, sp.csc_matrix])
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index afd9339f4c6c8..bffc3422e7f4e 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1025,6 +1025,13 @@ def _check_sample_weight(sample_weight, X, dtype=None):
     """
     n_samples = _num_samples(X)
 
+    # this check is needed to ensure that we don't change the dtype of
+    # of sample_weight if it's already np.float32.
+    # since sample_weight can be a list or an array, we first
+    # need to verify that it has a dtype attribute before the check.
+    # if dtype is None or any other type besides np.float32, np.float64
+    # is given.
+
     if hasattr(sample_weight, "dtype"):
         dtype = sample_weight.dtype
 
@@ -1039,14 +1046,23 @@ def _check_sample_weight(sample_weight, X, dtype=None):
                                     dtype=dtype)
         return sample_weight
 
+    # at this point, sample_weight is either a list or
+    # an array. These checks will validate that the dtype
+    # of the returned sample_weight is either np.float32 or
+    # np.float64. If sample weight contained elements which
+    # cannot be passed safely to the above types, the
+    # following line will raise a ValueError
     sample_weight = np.array(sample_weight, dtype=dtype)
 
+    # sample_weights must be 1-D arrays
     if sample_weight.ndim != 1:
         raise ValueError("Sample weights must be 1D array or scalar")
 
-    if sample_weight.shape != (n_samples,):
+    # and must have the same number of elements
+    # as X
+    if sample_weight.shape[0] != n_samples:
         raise ValueError("sample_weight.shape == {}, expected {}!"
-                             .format(sample_weight.shape, (n_samples,)))
+                         .format(sample_weight.shape, (n_samples, )))
     return sample_weight