Merge pull request #5431 from hlin117/nan-targets · johannah/scikit-learn@744d161 · GitHub
[go: up one dir, main page]

Skip to content

Commit 744d161

Browse files
committed
Merge pull request scikit-learn#5431 from hlin117/nan-targets
[MRG + 2] Add check to regression models to raise error when targets are NaN
2 parents 3c988d5 + f725485 commit 744d161

File tree

6 files changed

+38
-12
lines changed

6 files changed

+38
-12
lines changed

sklearn/ensemble/forest.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,8 @@ def fit(self, X, y, sample_weight=None):
209209
Returns self.
210210
"""
211211
# Validate or convert input data
212-
X = check_array(X, dtype=DTYPE, accept_sparse="csc")
212+
X = check_array(X, accept_sparse="csc", dtype=DTYPE)
213+
y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
213214
if issparse(X):
214215
# Pre-sort indices to avoid that each individual tree of the
215216
# ensemble sorts the indices.

sklearn/linear_model/coordinate_descent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1632,7 +1632,7 @@ def fit(self, X, y):
16321632
# X and y must be of type float64
16331633
X = check_array(X, dtype=np.float64, order='F',
16341634
copy=self.copy_X and self.fit_intercept)
1635-
y = np.asarray(y, dtype=np.float64)
1635+
y = check_array(y, dtype=np.float64, ensure_2d=False)
16361636

16371637
if hasattr(self, 'l1_ratio'):
16381638
model_str = 'ElasticNet'

sklearn/svm/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from ..base import BaseEstimator, ClassifierMixin
1111
from ..preprocessing import LabelEncoder
1212
from ..multiclass import _ovr_decision_function
13-
from ..utils import check_array, check_random_state, column_or_1d
13+
from ..utils import check_array, check_random_state, column_or_1d, check_X_y
1414
from ..utils import compute_class_weight, deprecated
1515
from ..utils.extmath import safe_sparse_dot
1616
from ..utils.validation import check_is_fitted
@@ -147,7 +147,7 @@ def fit(self, X, y, sample_weight=None):
147147
raise TypeError("Sparse precomputed kernels are not supported.")
148148
self._sparse = sparse and not callable(self.kernel)
149149

150-
X = check_array(X, accept_sparse='csr', dtype=np.float64, order='C')
150+
X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
151151
y = self._validate_targets(y)
152152

153153
sample_weight = np.asarray([]

sklearn/tree/tree.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from ..base import RegressorMixin
2929
from ..externals import six
3030
from ..feature_selection.from_model import _LearntSelectorMixin
31-
from ..utils import check_array
31+
from ..utils import check_array, check_X_y
3232
from ..utils import check_random_state
3333
from ..utils import compute_sample_weight
3434
from ..utils.multiclass import check_classification_targets
@@ -151,6 +151,7 @@ def fit(self, X, y, sample_weight=None, check_input=True,
151151
random_state = check_random_state(self.random_state)
152152
if check_input:
153153
X = check_array(X, dtype=DTYPE, accept_sparse="csc")
154+
y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
154155
if issparse(X):
155156
X.sort_indices()
156157

sklearn/utils/estimator_checks.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,26 @@ def _yield_classifier_checks(name, Classifier):
131131
if 'class_weight' in Classifier().get_params().keys():
132132
yield check_class_weight_classifiers
133133

134+
def check_supervised_y_no_nan(name, Estimator):
135+
# Checks that the Estimator targets are not NaN.
136+
137+
rng = np.random.RandomState(888)
138+
X = rng.randn(10, 5)
139+
y = np.ones(10) * np.inf
140+
y = multioutput_estimator_convert_y_2d(name, y)
141+
142+
errmsg = "Input contains NaN, infinity or a value too large for " \
143+
"dtype('float64')."
144+
try:
145+
Estimator().fit(X, y)
146+
except ValueError as e:
147+
if str(e) != errmsg:
148+
raise ValueError("Estimator {0} raised warning as expected, but "
149+
"does not match expected error message" \
150+
.format(name))
151+
else:
152+
raise ValueError("Estimator {0} should have raised error on fitting "
153+
"array y with NaN value.".format(name))
134154

135155
def _yield_regressor_checks(name, Regressor):
136156
# TODO: test with intercept
@@ -141,6 +161,7 @@ def _yield_regressor_checks(name, Regressor):
141161
yield check_estimators_partial_fit_n_features
142162
yield check_regressors_no_decision_function
143163
yield check_supervised_y_2d
164+
yield check_supervised_y_no_nan
144165
if name != 'CCA':
145166
# check that the regressor handles int input
146167
yield check_regressors_int
@@ -207,10 +228,10 @@ def check_estimator(Estimator):
207228
Parameters
208229
----------
209230
Estimator : class
210-
Class to check.
231+
Class to check. Estimator is a class object (not an instance).
211232
212233
"""
213-
name = Estimator.__class__.__name__
234+
name = Estimator.__name__
214235
check_parameters_default_constructible(name, Estimator)
215236
for check in _yield_all_checks(name, Estimator):
216237
check(name, Estimator)
@@ -695,6 +716,7 @@ def check_estimators_empty_data_messages(name, Estimator):
695716

696717

697718
def check_estimators_nan_inf(name, Estimator):
719+
# Checks that Estimator X's do not contain NaN or inf.
698720
rnd = np.random.RandomState(0)
699721
X_train_finite = rnd.uniform(size=(10, 3))
700722
X_train_nan = rnd.uniform(size=(10, 3))
@@ -1431,9 +1453,8 @@ def param_filter(p):
14311453
def multioutput_estimator_convert_y_2d(name, y):
14321454
# Estimators in mono_output_task_error raise ValueError if y is of 1-D
14331455
# Convert into a 2-D y for those estimators.
1434-
if name in (['MultiTaskElasticNetCV', 'MultiTaskLassoCV',
1435-
'MultiTaskLasso', 'MultiTaskElasticNet']):
1436-
return y[:, np.newaxis]
1456+
if "MultiTask" in name:
1457+
return np.reshape(y, (-1, 1))
14371458
return y
14381459

14391460

@@ -1445,7 +1466,7 @@ def check_non_transformer_estimators_n_iter(name, estimator,
14451466
X, y_ = iris.data, iris.target
14461467

14471468
if multi_output:
1448-
y_ = y_[:, np.newaxis]
1469+
y_ = np.reshape(y_, (-1, 1))
14491470

14501471
set_random_state(estimator, 0)
14511472
if name == 'AffinityPropagation':

sklearn/utils/tests/test_estimator_checks.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from sklearn.utils.estimator_checks import check_estimator
99
from sklearn.utils.estimator_checks import check_estimators_unfitted
1010
from sklearn.ensemble import AdaBoostClassifier
11+
from sklearn.linear_model import MultiTaskElasticNet
1112
from sklearn.utils.validation import check_X_y, check_array
1213

1314

@@ -75,7 +76,8 @@ def test_check_estimator():
7576
msg = "Estimator doesn't check for NaN and inf in predict"
7677
assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
7778
# check for sparse matrix input handling
78-
msg = "Estimator type doesn't seem to fail gracefully on sparse data"
79+
name = NoSparseClassifier.__name__
80+
msg = "Estimator " + name + " doesn't seem to fail gracefully on sparse data"
7981
# the check for sparse input handling prints to the stdout,
8082
# instead of raising an error, so as not to remove the original traceback.
8183
# that means we need to jump through some hoops to catch it.
@@ -92,6 +94,7 @@ def test_check_estimator():
9294

9395
# doesn't error on actual estimator
9496
check_estimator(AdaBoostClassifier)
97+
check_estimator(MultiTaskElasticNet)
9598

9699

97100
def test_check_estimators_unfitted():

0 commit comments

Comments
 (0)
0