make check_array convert object to float. · scikit-learn/scikit-learn@b018097 · GitHub
[go: up one dir, main page]

Skip to content

Commit b018097

Browse files
committed
make check_array convert object to float.
fix dtype check, add test. unfriend all multi-output estimators on facebook. try to fix what is happening to y (by doing nothing to y) make test work... Make everything accept object y or say "invalid label" fix multioutput linear models add test for sensible error message.
1 parent c5f7bd1 commit b018097

File tree

14 files changed

+99
-44
lines changed

14 files changed

+99
-44
lines changed

sklearn/ensemble/gradient_boosting.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
from __future__ import print_function
2424
from __future__ import division
2525
from abc import ABCMeta, abstractmethod
26-
from warnings import warn
2726
from time import time
2827

2928
import numbers
@@ -1135,10 +1134,12 @@ def feature_importances_(self):
11351134

11361135
def _validate_y(self, y):
11371136
self.n_classes_ = 1
1138-
1137+
if y.dtype is np.dtype(object):
1138+
y = y.astype(np.float)
11391139
# Default implementation
11401140
return y
11411141

1142+
11421143
class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin):
11431144
"""Gradient Boosting for classification.
11441145

sklearn/gaussian_process/gaussian_process.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111

1212
from ..base import BaseEstimator, RegressorMixin
1313
from ..metrics.pairwise import manhattan_distances
14-
from ..utils import check_random_state, check_array, check_consistent_length
15-
from ..utils.validation import check_is_fitted
14+
from ..utils import check_random_state, check_array, check_X_y
15+
from ..utils.validation import check_is_fitted
1616
from . import regression_models as regression
1717
from . import correlation_models as correlation
1818

@@ -264,12 +264,10 @@ def fit(self, X, y):
264264
self.random_state = check_random_state(self.random_state)
265265

266266
# Force data to 2D numpy.array
267-
X = check_array(X)
268-
y = np.asarray(y)
267+
X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
269268
self.y_ndim_ = y.ndim
270269
if y.ndim == 1:
271270
y = y[:, np.newaxis]
272-
check_consistent_length(X, y)
273271

274272
# Check shapes of DOE & observations
275273
n_samples, n_features = X.shape
@@ -882,7 +880,7 @@ def _check_params(self, n_samples=None):
882880
"or array of length n_samples.")
883881

884882
# Check optimizer
885-
if not self.optimizer in self._optimizer_types:
883+
if self.optimizer not in self._optimizer_types:
886884
raise ValueError("optimizer should be one of %s"
887885
% self._optimizer_types)
888886

sklearn/linear_model/base.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from ..externals import six
2626
from ..externals.joblib import Parallel, delayed
2727
from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
28-
from ..utils import as_float_array, check_array
28+
from ..utils import as_float_array, check_array, check_X_y
2929
from ..utils.extmath import safe_sparse_dot
3030
from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
3131
from ..utils.fixes import sparse_lsqr
@@ -370,8 +370,8 @@ def fit(self, X, y, n_jobs=1):
370370
n_jobs_ = n_jobs
371371
else:
372372
n_jobs_ = self.n_jobs
373-
X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
374-
y = np.asarray(y)
373+
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
374+
y_numeric=True, multi_output=True)
375375

376376
X, y, X_mean, y_mean, X_std = self._center_data(
377377
X, y, self.fit_intercept, self.normalize, self.copy_X)

sklearn/linear_model/bayes.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def fit(self, X, y):
132132
-------
133133
self : returns an instance of self.
134134
"""
135-
X, y = check_X_y(X, y, dtype=np.float)
135+
X, y = check_X_y(X, y, dtype=np.float, y_numeric=True)
136136
X, y, X_mean, y_mean, X_std = self._center_data(
137137
X, y, self.fit_intercept, self.normalize, self.copy_X)
138138
n_samples, n_features = X.shape
@@ -342,7 +342,7 @@ def fit(self, X, y):
342342
-------
343343
self : returns an instance of self.
344344
"""
345-
X, y = check_X_y(X, y, dtype=np.float)
345+
X, y = check_X_y(X, y, dtype=np.float, y_numeric=True)
346346

347347
n_samples, n_features = X.shape
348348
coef_ = np.zeros(n_features)

sklearn/linear_model/coordinate_descent.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,6 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
386386
if selection not in ['random', 'cyclic']:
387387
raise ValueError("selection should be either random or cyclic.")
388388
random = (selection == 'random')
389-
models = []
390389

391390
if not multi_output:
392391
coefs = np.empty((n_features, n_alphas), dtype=np.float64)
@@ -615,7 +614,7 @@ def fit(self, X, y):
615614

616615
X, y = check_X_y(X, y, accept_sparse='csc', dtype=np.float64,
617616
order='F', copy=self.copy_X and self.fit_intercept,
618-
multi_output=True)
617+
multi_output=True, y_numeric=True)
619618

620619
X, y, X_mean, y_mean, X_std, precompute, Xy = \
621620
_pre_fit(X, y, None, self.precompute, self.normalize,

sklearn/linear_model/least_angle.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
from .base import LinearModel
2323
from ..base import RegressorMixin
24-
from ..utils import arrayfuncs, as_float_array, check_array, check_X_y
24+
from ..utils import arrayfuncs, as_float_array, check_X_y
2525
from ..cross_validation import _check_cv as check_cv
2626
from ..utils import ConvergenceWarning
2727
from ..externals.joblib import Parallel, delayed
@@ -419,7 +419,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500,
419419
for ii in idx:
420420
for i in range(ii, n_active):
421421
indices[i], indices[i + 1] = indices[i + 1], indices[i]
422-
Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i+1])
422+
Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1])
423423
Gram[:, i], Gram[:, i + 1] = swap(Gram[:, i],
424424
Gram[:, i + 1])
425425

@@ -586,8 +586,7 @@ def fit(self, X, y, Xy=None):
586586
self : object
587587
returns an instance of self.
588588
"""
589-
X = check_array(X)
590-
y = np.asarray(y)
589+
X, y = check_X_y(X, y, y_numeric=True, multi_output=True)
591590
n_features = X.shape[1]
592591

593592
X, y, X_mean, y_mean, X_std = self._center_data(X, y,
@@ -1262,8 +1261,7 @@ def fit(self, X, y, copy_X=True):
12621261
returns an instance of self.
12631262
"""
12641263
self.fit_path = True
1265-
X = check_array(X)
1266-
y = np.asarray(y)
1264+
X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
12671265

12681266
X, y, Xmean, ymean, Xstd = LinearModel._center_data(
12691267
X, y, self.fit_intercept, self.normalize, self.copy_X)

sklearn/linear_model/logistic.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
529529
"dual=False, got dual=%s" % dual)
530530
# Preprocessing.
531531
X = check_array(X, accept_sparse='csr', dtype=np.float64)
532-
y = check_array(y, ensure_2d=False, copy=copy)
532+
y = check_array(y, ensure_2d=False, copy=copy, dtype=None)
533533
_, n_features = X.shape
534534
check_consistent_length(X, y)
535535
classes = np.unique(y)
@@ -1313,7 +1313,7 @@ def fit(self, X, y):
13131313
"the primal form.")
13141314

13151315
X = check_array(X, accept_sparse='csr', dtype=np.float64)
1316-
y = check_array(y, ensure_2d=False)
1316+
y = check_array(y, ensure_2d=False, dtype=None)
13171317

13181318
if self.multi_class not in ['ovr', 'multinomial']:
13191319
raise ValueError("multi_class backend should be either "

sklearn/linear_model/omp.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -609,8 +609,7 @@ def fit(self, X, y):
609609
self : object
610610
returns an instance of self.
611611
"""
612-
X = check_array(X)
613-
y = np.asarray(y)
612+
X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
614613
n_features = X.shape[1]
615614

616615
X, y, X_mean, y_mean, X_std, Gram, Xy = \
@@ -805,7 +804,7 @@ def fit(self, X, y):
805804
self : object
806805
returns an instance of self.
807806
"""
808-
X, y = check_X_y(X, y)
807+
X, y = check_X_y(X, y, y_numeric=True)
809808
cv = check_cv(self.cv, X, y, classifier=False)
810809
max_iter = (min(max(int(0.1 * X.shape[1]), 5), X.shape[1])
811810
if not self.max_iter

sklearn/linear_model/randomized_l1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def fit(self, X, y):
8888
self : object
8989
Returns an instance of self.
9090
"""
91-
X, y = check_X_y(X, y, ['csr', 'csc', 'coo'])
91+
X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], y_numeric=True)
9292
X = as_float_array(X, copy=False)
9393
n_samples, n_features = X.shape
9494

sklearn/linear_model/ridge.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,8 @@ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
365365
self.solver = solver
366366

367367
def fit(self, X, y, sample_weight=None):
368-
X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float, multi_output=True)
368+
X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float,
369+
multi_output=True, y_numeric=True)
369370

370371
if ((sample_weight is not None) and
371372
np.atleast_1d(sample_weight).ndim > 1):
@@ -732,7 +733,8 @@ def fit(self, X, y, sample_weight=None):
732733
-------
733734
self : Returns self.
734735
"""
735-
X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float, multi_output=True)
736+
X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float,
737+
multi_output=True, y_numeric=True)
736738

737739
n_samples, n_features = X.shape
738740

sklearn/preprocessing/label.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -464,7 +464,7 @@ def label_binarize(y, classes, neg_label=0, pos_label=1,
464464
if not isinstance(y, list):
465465
# XXX Workaround that will be removed when list of list format is
466466
# dropped
467-
y = check_array(y, accept_sparse='csr', ensure_2d=False)
467+
y = check_array(y, accept_sparse='csr', ensure_2d=False, dtype=None)
468468
if neg_label >= pos_label:
469469
raise ValueError("neg_label={0} must be strictly less than "
470470
"pos_label={1}.".format(neg_label, pos_label))

sklearn/tests/test_common.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from sklearn.externals.six.moves import zip
1717
from sklearn.utils.testing import assert_false, clean_warning_registry
1818
from sklearn.utils.testing import all_estimators
19-
from sklearn.utils.testing import assert_greater
19+
from sklearn.utils.testing import assert_greater
2020
from sklearn.utils.testing import assert_in
2121
from sklearn.utils.testing import SkipTest
2222
from sklearn.utils.testing import ignore_warnings
@@ -28,6 +28,7 @@
2828
from sklearn.cross_validation import train_test_split
2929
from sklearn.linear_model.base import LinearClassifierMixin
3030
from sklearn.utils.estimator_checks import (
31+
check_dtype_object,
3132
check_parameters_default_constructible,
3233
check_regressors_classifiers_sparse_data,
3334
check_transformer,
@@ -88,12 +89,13 @@ def test_non_meta_estimators():
8889
estimators = all_estimators(type_filter=['classifier', 'regressor',
8990
'transformer', 'cluster'])
9091
for name, Estimator in estimators:
92+
if name not in CROSS_DECOMPOSITION:
93+
yield check_dtype_object, name, Estimator
9194
if name not in CROSS_DECOMPOSITION + ['Imputer']:
9295
# Test that all estimators check their input for NaN's and infs
9396
yield check_estimators_nan_inf, name, Estimator
9497

95-
if (name not in ['CCA', '_CCA', 'PLSCanonical', 'PLSRegression',
96-
'PLSSVD', 'GaussianProcess']):
98+
if (name not in CROSS_DECOMPOSITION + ['GaussianProcess']):
9799
# FIXME!
98100
# in particular GaussianProcess!
99101
yield check_estimators_overwrite_params, name, Estimator

sklearn/utils/estimator_checks.py

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from sklearn.utils.testing import assert_greater
2323
from sklearn.utils.testing import SkipTest
2424
from sklearn.utils.testing import check_skip_travis
25+
from sklearn.utils.testing import assert_raise_message
2526

2627
from sklearn.base import (clone, ClusterMixin, ClassifierMixin, RegressorMixin,
2728
TransformerMixin)
@@ -146,6 +147,39 @@ def check_regressors_classifiers_sparse_data(name, Estimator):
146147
raise
147148

148149

150+
def check_dtype_object(name, Estimator):
151+
rng = np.random.RandomState(0)
152+
X = rng.rand(40, 10).astype(object)
153+
y = (X[:, 0] * 4).astype(np.int)
154+
y = multioutput_estimator_convert_y_2d(name, y)
155+
with warnings.catch_warnings():
156+
estimator = Estimator()
157+
set_fast_parameters(estimator)
158+
159+
if is_supervised(estimator):
160+
estimator.fit(X, y)
161+
else:
162+
estimator.fit(X)
163+
if hasattr(estimator, "predict"):
164+
estimator.predict(X)
165+
166+
if hasattr(estimator, "transform"):
167+
estimator.transform(X)
168+
169+
if is_supervised(estimator):
170+
try:
171+
estimator.fit(X, y.astype(object))
172+
except Exception as e:
173+
if "Unknown label type" not in str(e):
174+
raise
175+
176+
X[0, 0] = {'foo': 'bar'}
177+
if is_supervised(estimator):
178+
assert_raise_message(TypeError, "string or a number", estimator.fit, X, y)
179+
else:
180+
assert_raise_message(TypeError, "string or a number", estimator.fit, X)
181+
182+
149183
def check_transformer(name, Transformer):
150184
X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
151185
random_state=0, n_features=2, cluster_std=0.1)
@@ -567,7 +601,7 @@ def check_estimators_unfitted(name, Estimator):
567601
est = Estimator()
568602

569603
assert_raises(NotFittedError, est.predict, X)
570-
604+
571605
if hasattr(est, 'predict'):
572606
assert_raises(NotFittedError, est.predict, X)
573607

@@ -576,7 +610,7 @@ def check_estimators_unfitted(name, Estimator):
576610

577611
if hasattr(est, 'predict_proba'):
578612
assert_raises(NotFittedError, est.predict_proba, X)
579-
613+
580614
if hasattr(est, 'predict_log_proba'):
581615
assert_raises(NotFittedError, est.predict_log_proba, X)
582616

@@ -991,7 +1025,7 @@ def multioutput_estimator_convert_y_2d(name, y):
9911025
return y
9921026

9931027

994-
def check_non_transformer_estimators_n_iter(name, estimator,
1028+
def check_non_transformer_estimators_n_iter(name, estimator,
9951029
multi_output=False):
9961030
# Check if all iterative solvers, run for more than one iteratiom
9971031

0 commit comments

Comments
 (0)
0