diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index c49769567f960..0901d6159630e 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -377,6 +377,10 @@ API changes summary
    - `thresh` parameter is deprecated in favor of new `tol` parameter in
      :class:`GMM`. See `Enhancements` section for details. By `Hervé
      Bredin`_.
 
+   - Estimators will treat input with dtype object as numeric when possible.
+     By `Andreas Müller`_.
+
+
 .. _changes_0_15_2:
diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py
index 820638c5befe5..a073ae6061b1a 100644
--- a/sklearn/ensemble/gradient_boosting.py
+++ b/sklearn/ensemble/gradient_boosting.py
@@ -126,7 +126,7 @@ class in the training data.
     """
     def fit(self, X, y, sample_weight=None):
         if sample_weight is None:
-            sample_weight = np.ones_like(y, dtype=np.float)
+            sample_weight = np.ones_like(y, dtype=np.float64)
         class_counts = bincount(y, weights=sample_weight)
         self.priors = class_counts / class_counts.sum()
@@ -1146,7 +1146,8 @@ def feature_importances_(self):
 
     def _validate_y(self, y):
         self.n_classes_ = 1
-
+        if y.dtype.kind == 'O':
+            y = y.astype(np.float64)
         # Default implementation
         return y
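The `_validate_y` change above is the regression-side half of the object-dtype handling: targets that arrive boxed as dtype object are cast to float64 before training. A minimal sketch of that cast in plain NumPy, with made-up values:

    import numpy as np

    # Object-dtype regression target holding ordinary numbers.
    y = np.array([1.0, 2.5, 3.0], dtype=object)

    # The same cast the patched _validate_y applies: object -> float64.
    if y.dtype.kind == 'O':
        y = y.astype(np.float64)

    print(y.dtype)  # float64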
diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py
index 300749eb52392..d1507f98414f2 100644
--- a/sklearn/gaussian_process/gaussian_process.py
+++ b/sklearn/gaussian_process/gaussian_process.py
@@ -11,8 +11,8 @@
 from ..base import BaseEstimator, RegressorMixin
 from ..metrics.pairwise import manhattan_distances
-from ..utils import check_random_state, check_array, check_consistent_length
-from ..utils.validation import check_is_fitted
+from ..utils import check_random_state, check_array, check_X_y
+from ..utils.validation import check_is_fitted
 from . import regression_models as regression
 from . import correlation_models as correlation
@@ -264,12 +264,10 @@ def fit(self, X, y):
         self.random_state = check_random_state(self.random_state)
 
         # Force data to 2D numpy.array
-        X = check_array(X)
-        y = np.asarray(y)
+        X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
         self.y_ndim_ = y.ndim
         if y.ndim == 1:
             y = y[:, np.newaxis]
-        check_consistent_length(X, y)
 
         # Check shapes of DOE & observations
         n_samples, n_features = X.shape
@@ -883,7 +881,7 @@ def _check_params(self, n_samples=None):
                              "or array of length n_samples.")
 
         # Check optimizer
-        if not self.optimizer in self._optimizer_types:
+        if self.optimizer not in self._optimizer_types:
             raise ValueError("optimizer should be one of %s" %
                              self._optimizer_types)
diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index b2c44793ae0cd..ca4c36e1098b1 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -25,7 +25,7 @@
 from ..externals import six
 from ..externals.joblib import Parallel, delayed
 from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
-from ..utils import as_float_array, check_array
+from ..utils import as_float_array, check_array, check_X_y
 from ..utils.extmath import safe_sparse_dot
 from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
 from ..utils.fixes import sparse_lsqr
@@ -372,8 +372,8 @@ def fit(self, X, y, n_jobs=1):
             n_jobs_ = n_jobs
         else:
             n_jobs_ = self.n_jobs
-        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
-        y = np.asarray(y)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
+                         y_numeric=True, multi_output=True)
 
         X, y, X_mean, y_mean, X_std = self._center_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X)
diff --git a/sklearn/linear_model/bayes.py b/sklearn/linear_model/bayes.py
index f37c77c9d5285..2f40648527adf 100644
--- a/sklearn/linear_model/bayes.py
+++ b/sklearn/linear_model/bayes.py
@@ -132,7 +132,7 @@ def fit(self, X, y):
         -------
         self : returns an instance of self.
         """
-        X, y = check_X_y(X, y, dtype=np.float)
+        X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
         X, y, X_mean, y_mean, X_std = self._center_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X)
         n_samples, n_features = X.shape
@@ -342,7 +342,7 @@ def fit(self, X, y):
         -------
         self : returns an instance of self.
         """
-        X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
+        X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
         n_samples, n_features = X.shape
         coef_ = np.zeros(n_features)
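All of the regression `fit` methods above now funnel through `check_X_y(..., y_numeric=True)`. Assuming this patch is applied, a sketch of what that call does with object-dtype input (the data is made up):

    import numpy as np
    from sklearn.utils.validation import check_X_y

    # Numeric data boxed as dtype object.
    X = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=object)
    y = np.array([0.5, 1.5], dtype=object)

    # Under the patched defaults, X falls under dtype="numeric" and is
    # converted to float64; y_numeric=True casts the object-dtype y as well.
    X, y = check_X_y(X, y, y_numeric=True)
    print(X.dtype, y.dtype)  # float64 float64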
""" - X, y = check_X_y(X, y, dtype=np.float) + X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True) n_samples, n_features = X.shape coef_ = np.zeros(n_features) diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index 0d0b2e895137b..e0df1f50847e3 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -627,7 +627,7 @@ def fit(self, X, y): X, y = check_X_y(X, y, accept_sparse='csc', dtype=np.float64, order='F', copy=self.copy_X and self.fit_intercept, - multi_output=True) + multi_output=True, y_numeric=True) X, y, X_mean, y_mean, X_std, precompute, Xy = \ _pre_fit(X, y, None, self.precompute, self.normalize, diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index c92a0f1841656..f2c352ff353ff 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -21,7 +21,7 @@ from .base import LinearModel from ..base import RegressorMixin -from ..utils import arrayfuncs, as_float_array, check_array, check_X_y +from ..utils import arrayfuncs, as_float_array, check_X_y from ..cross_validation import _check_cv as check_cv from ..utils import ConvergenceWarning from ..externals.joblib import Parallel, delayed @@ -422,7 +422,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, for ii in idx: for i in range(ii, n_active): indices[i], indices[i + 1] = indices[i + 1], indices[i] - Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i+1]) + Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1]) Gram[:, i], Gram[:, i + 1] = swap(Gram[:, i], Gram[:, i + 1]) @@ -589,8 +589,7 @@ def fit(self, X, y, Xy=None): self : object returns an instance of self. """ - X = check_array(X) - y = np.asarray(y) + X, y = check_X_y(X, y, y_numeric=True, multi_output=True) n_features = X.shape[1] X, y, X_mean, y_mean, X_std = self._center_data(X, y, @@ -1268,8 +1267,7 @@ def fit(self, X, y, copy_X=True): returns an instance of self. """ self.fit_path = True - X = check_array(X) - y = np.asarray(y) + X, y = check_X_y(X, y, multi_output=True, y_numeric=True) X, y, Xmean, ymean, Xstd = LinearModel._center_data( X, y, self.fit_intercept, self.normalize, self.copy_X) diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py index 4c783ebb9382d..69d442a84ee12 100644 --- a/sklearn/linear_model/logistic.py +++ b/sklearn/linear_model/logistic.py @@ -529,7 +529,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, "dual=False, got dual=%s" % dual) # Preprocessing. X = check_array(X, accept_sparse='csr', dtype=np.float64) - y = check_array(y, ensure_2d=False, copy=copy) + y = check_array(y, ensure_2d=False, copy=copy, dtype=None) _, n_features = X.shape check_consistent_length(X, y) classes = np.unique(y) @@ -1318,7 +1318,7 @@ def fit(self, X, y): "the primal form.") X = check_array(X, accept_sparse='csr', dtype=np.float64) - y = check_array(y, ensure_2d=False) + y = check_array(y, ensure_2d=False, dtype=None) if self.multi_class not in ['ovr', 'multinomial']: raise ValueError("multi_class backend should be either " diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index 91b9997a08286..c45d80bf1c686 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -609,8 +609,7 @@ def fit(self, X, y): self : object returns an instance of self. 
""" - X = check_array(X) - y = np.asarray(y) + X, y = check_X_y(X, y, multi_output=True, y_numeric=True) n_features = X.shape[1] X, y, X_mean, y_mean, X_std, Gram, Xy = \ @@ -805,7 +804,7 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, y_numeric=True) X = as_float_array(X, copy=False, force_all_finite=False) cv = check_cv(self.cv, X, y, classifier=False) max_iter = (min(max(int(0.1 * X.shape[1]), 5), X.shape[1]) diff --git a/sklearn/linear_model/randomized_l1.py b/sklearn/linear_model/randomized_l1.py index e6863848e994d..04a3a2ee87dd7 100644 --- a/sklearn/linear_model/randomized_l1.py +++ b/sklearn/linear_model/randomized_l1.py @@ -88,7 +88,7 @@ def fit(self, X, y): self : object Returns an instance of self. """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo']) + X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], y_numeric=True) X = as_float_array(X, copy=False) n_samples, n_features = X.shape diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index 6e5216090cb9f..9cf6ed43077e0 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -378,7 +378,8 @@ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, self.solver = solver def fit(self, X, y, sample_weight=None): - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float, multi_output=True) + X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float, + multi_output=True, y_numeric=True) if ((sample_weight is not None) and np.atleast_1d(sample_weight).ndim > 1): @@ -743,7 +744,8 @@ def fit(self, X, y, sample_weight=None): ------- self : Returns self. """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float, multi_output=True) + X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float, + multi_output=True, y_numeric=True) n_samples, n_features = X.shape diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 75455e50211da..7a3593ca48de8 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -464,7 +464,7 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, if not isinstance(y, list): # XXX Workaround that will be removed when list of list format is # dropped - y = check_array(y, accept_sparse='csr', ensure_2d=False) + y = check_array(y, accept_sparse='csr', ensure_2d=False, dtype=None) if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " "pos_label={1}.".format(neg_label, pos_label)) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 42ba6cdb4ce8a..19163c50e91bc 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -29,6 +29,7 @@ from sklearn.cross_validation import train_test_split from sklearn.linear_model.base import LinearClassifierMixin from sklearn.utils.estimator_checks import ( + check_dtype_object, check_parameters_default_constructible, check_estimator_sparse_data, check_estimators_dtypes, @@ -96,6 +97,7 @@ def test_non_meta_estimators(): if name not in CROSS_DECOMPOSITION: yield check_estimators_dtypes, name, Estimator yield check_fit_score_takes_y, name, Estimator + yield check_dtype_object, name, Estimator if name not in CROSS_DECOMPOSITION + ['SpectralEmbedding']: # SpectralEmbedding is non-deterministic, diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 2ca110a17d6c0..9c5467f955786 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -23,6 +23,7 @@ from 
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 2ca110a17d6c0..9c5467f955786 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -23,6 +23,7 @@
 from sklearn.utils.testing import SkipTest
 from sklearn.utils.testing import check_skip_travis
 from sklearn.utils.testing import ignore_warnings
+from sklearn.utils.testing import assert_raise_message
 
 from sklearn.base import clone, ClassifierMixin
 from sklearn.metrics import accuracy_score, adjusted_rand_score, f1_score
@@ -149,6 +150,33 @@ def check_estimator_sparse_data(name, Estimator):
             raise
 
 
+def check_dtype_object(name, Estimator):
+    # check that estimators treat dtype object as numeric if possible
+    rng = np.random.RandomState(0)
+    X = rng.rand(40, 10).astype(object)
+    y = (X[:, 0] * 4).astype(np.int)
+    y = multioutput_estimator_convert_y_2d(name, y)
+    with warnings.catch_warnings():
+        estimator = Estimator()
+    set_fast_parameters(estimator)
+
+    estimator.fit(X, y)
+    if hasattr(estimator, "predict"):
+        estimator.predict(X)
+
+    if hasattr(estimator, "transform"):
+        estimator.transform(X)
+
+    try:
+        estimator.fit(X, y.astype(object))
+    except Exception as e:
+        if "Unknown label type" not in str(e):
+            raise
+
+    X[0, 0] = {'foo': 'bar'}
+    assert_raise_message(TypeError, "string or a number", estimator.fit, X, y)
+
+
 def check_transformer(name, Transformer):
     X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                       random_state=0, n_features=2, cluster_std=0.1)
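Unrolled for a single estimator, the contract `check_dtype_object` enforces looks roughly like this, assuming the patched validation is in place (data and estimator choice are illustrative):

    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    X = rng.rand(40, 10).astype(object)   # numeric values boxed as objects
    y = (X[:, 0] * 4).astype(np.int64)

    est = Ridge().fit(X, y)               # accepted: X is cast to float64
    est.predict(X)

    X[0, 0] = {'foo': 'bar'}              # a value float() cannot handle
    try:
        est.fit(X, y)
    except TypeError as e:
        print(e)  # "... must be a string or a number ..."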
Expected <= 2" % @@ -311,15 +322,17 @@ def check_array(array, accept_sparse=None, dtype=None, order=None, copy=False, return array -def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False, +def check_X_y(X, y, accept_sparse=None, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, - ensure_min_features=1): + ensure_min_features=1, y_numeric=False): """Input validation for standard estimators. Checks X and y for consistent length, enforces X 2d and y 1d. Standard input checks are only applied to y. For multi-label y, set multi_output=True to allow 2d and sparse y. + If the dtype of X is object, attempt converting to float, + raising on failure. Parameters ---------- @@ -335,8 +348,9 @@ def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False, If the input is sparse but not in the allowed format, it will be converted to the first listed format. - dtype : string, type or None (default=none) + dtype : string, type or None (default="numeric") Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. order : 'F', 'C' or None (default=None) Whether an array will be forced to be fortran or c-style. @@ -367,6 +381,9 @@ def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False, (columns). The default value of 1 rejects empty datasets. This check is only enforced when ``ensure_2d`` is True and ``allow_nd`` is False. + y_numeric : boolean (default=False) + Whether to ensure that y has a numeric type. If dtype of y is object, + it is converted to float64. Should only be used for regression algorithms. Returns ------- @@ -377,10 +394,12 @@ def check_X_y(X, y, accept_sparse=None, dtype=None, order=None, copy=False, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features) if multi_output: - y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False) + y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, dtype=None) else: y = column_or_1d(y, warn=True) _assert_all_finite(y) + if y_numeric and y.dtype.kind == 'O': + y = y.astype(np.float64) check_consistent_length(X, y) @@ -520,7 +539,7 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, def check_is_fitted(estimator, attributes, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. - Checks if the estimator is fitted by verifying the presence of + Checks if the estimator is fitted by verifying the presence of "all_or_any" of the passed attributes and raises a NotFittedError with the given message.