-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
[MRG + 2] FIX dtypes to conform to the stricter type cast rules of numpy 1.10 #5398
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4f4c359
affa122
21fab6e
cac1d28
47f19e9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,7 +14,7 @@ | |
import numpy as np | ||
from scipy import linalg | ||
from ..utils import arpack | ||
from ..utils.validation import check_is_fitted | ||
from ..utils.validation import check_is_fitted, FLOAT_DTYPES | ||
|
||
__all__ = ['PLSCanonical', 'PLSRegression', 'PLSSVD'] | ||
|
||
|
@@ -375,14 +375,14 @@ def transform(self, X, Y=None, copy=True): | |
x_scores if Y is not given, (x_scores, y_scores) otherwise. | ||
""" | ||
check_is_fitted(self, 'x_mean_') | ||
X = check_array(X, copy=copy) | ||
X = check_array(X, copy=copy, dtype=FLOAT_DTYPES) | ||
# Normalize | ||
X -= self.x_mean_ | ||
X /= self.x_std_ | ||
# Apply rotation | ||
x_scores = np.dot(X, self.x_rotations_) | ||
if Y is not None: | ||
Y = check_array(Y, ensure_2d=False, copy=copy) | ||
Y = check_array(Y, ensure_2d=False, copy=copy, dtype=FLOAT_DTYPES) | ||
if Y.ndim == 1: | ||
Y = Y.reshape(-1, 1) | ||
Y -= self.y_mean_ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @GaelVaroquaux @ogrisel @lesteve I am also wondering if instead of explicitly typecasting
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This would (?) preserve int as it is without typecasting it when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This would (?) preserve int as it is without typecasting it when y_mean_ is
also int!
Yes, but doing linear algebra on ints is meaningless. The fact that no
errors were raised before is actually a problem in my opinion.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay!! |
||
|
@@ -410,7 +410,7 @@ def predict(self, X, copy=True): | |
be an issue in high dimensional space. | ||
""" | ||
check_is_fitted(self, 'x_mean_') | ||
X = check_array(X, copy=copy) | ||
X = check_array(X, copy=copy, dtype=FLOAT_DTYPES) | ||
# Normalize | ||
X -= self.x_mean_ | ||
X /= self.x_std_ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
from ..utils import check_array, as_float_array, check_random_state | ||
from ..utils.extmath import fast_dot | ||
from ..utils.validation import check_is_fitted | ||
from ..utils.validation import FLOAT_DTYPES | ||
|
||
__all__ = ['fastica', 'FastICA'] | ||
|
||
|
@@ -261,7 +262,7 @@ def my_g(x): | |
fun_args = {} if fun_args is None else fun_args | ||
# make interface compatible with other decompositions | ||
# a copy is required only for non whitened data | ||
X = check_array(X, copy=whiten).T | ||
X = check_array(X, copy=whiten, dtype=FLOAT_DTYPES).T | ||
|
||
alpha = fun_args.get('alpha', 1.0) | ||
if not 1 <= alpha <= 2: | ||
|
@@ -540,7 +541,7 @@ def transform(self, X, y=None, copy=True): | |
""" | ||
check_is_fitted(self, 'mixing_') | ||
|
||
X = check_array(X, copy=copy) | ||
X = check_array(X, copy=copy, dtype=FLOAT_DTYPES) | ||
if self.whiten: | ||
X -= self.mean_ | ||
|
||
|
@@ -563,8 +564,7 @@ def inverse_transform(self, X, copy=True): | |
""" | ||
check_is_fitted(self, 'mixing_') | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is that not check_array here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and why would we want to copy? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Was there an error here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so even if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh! So I must've done this pre-emptively I think... I'll check against master once to confirm! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have replaced with
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we set |
||
if copy: | ||
X = X.copy() | ||
X = check_array(X, copy=(copy and self.whiten), dtype=FLOAT_DTYPES) | ||
X = fast_dot(X, self.mixing_.T) | ||
if self.whiten: | ||
X += self.mean_ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1074,6 +1074,8 @@ def fit(self, X, y): | |
""" | ||
self.fit_path = True | ||
X, y = check_X_y(X, y, y_numeric=True) | ||
X = as_float_array(X, copy=self.copy_X) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why? why not dtypes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is that for memory efficiency? Then at least y was already converted, right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You mean There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, wait, I thought There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually we could squash the three lines into
Can I do that without any side effect? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. well it wouldn't convert y and it would upcast int32 to float64 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah okay!! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, if you think upcasting int32 to float64 is an issue, we should get it fixed everywhere. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
y = as_float_array(y, copy=self.copy_X) | ||
|
||
# init cross-validation generator | ||
cv = check_cv(self.cv, X, y, classifier=False) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -323,7 +323,7 @@ def transform(self, X): | |
""" | ||
check_is_fitted(self, 'scale_') | ||
|
||
X = check_array(X, copy=self.copy, ensure_2d=False) | ||
X = check_array(X, copy=self.copy, ensure_2d=False, dtype=FLOAT_DTYPES) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's good! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 😕 is that not going to cast a int32 dtype into a float64: import numpy as np
from sklearn.utils.validation import check_array, FLOAT_DTYPES
check_array(np.arange(10, dtype='i4'), dtype=FLOAT_DTYPES).dtype returns dtype('float64'). On the other hand as_float_array seems to do the right thing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 😕 is that not going to cast a int32 dtype into a float64:
Correct, but at least it won't cast a float32 into a float64, which is
the most important thing to avoid.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks loic! will fix it!! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You get an array of twice the memory in an 'int32' cast to 'float64' case but this may not happen that much in practice I guess. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You get an array of twice the memory in an 'int32' cast to 'float64' case but
this may not happen that much in practice I guess.
In principle, I agree with you. But here we are trying to get a hot fix
and not to delay the release. Addressing your comment would require
developping new validation code.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure that would be too difficult to do. But I also don't know if its high up in the priority list. We could fix that up if someone files an issue saying that their memory crashes after feeding in integer data to some model. |
||
if X.ndim == 1: | ||
warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning) | ||
|
||
|
@@ -341,7 +341,7 @@ def inverse_transform(self, X): | |
""" | ||
check_is_fitted(self, 'scale_') | ||
|
||
X = check_array(X, copy=self.copy, ensure_2d=False) | ||
X = check_array(X, copy=self.copy, ensure_2d=False, dtype=FLOAT_DTYPES) | ||
if X.ndim == 1: | ||
warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning) | ||
|
||
|
@@ -1068,14 +1068,14 @@ class PolynomialFeatures(BaseEstimator, TransformerMixin): | |
[4, 5]]) | ||
>>> poly = PolynomialFeatures(2) | ||
>>> poly.fit_transform(X) | ||
array([[ 1, 0, 1, 0, 0, 1], | ||
[ 1, 2, 3, 4, 6, 9], | ||
[ 1, 4, 5, 16, 20, 25]]) | ||
array([[ 1., 0., 1., 0., 0., 1.], | ||
[ 1., 2., 3., 4., 6., 9.], | ||
[ 1., 4., 5., 16., 20., 25.]]) | ||
>>> poly = PolynomialFeatures(interaction_only=True) | ||
>>> poly.fit_transform(X) | ||
array([[ 1, 0, 1, 0], | ||
[ 1, 2, 3, 6], | ||
[ 1, 4, 5, 20]]) | ||
array([[ 1., 0., 1., 0.], | ||
[ 1., 2., 3., 6.], | ||
[ 1., 4., 5., 20.]]) | ||
|
||
Attributes | ||
---------- | ||
|
@@ -1149,7 +1149,7 @@ def transform(self, X, y=None): | |
""" | ||
check_is_fitted(self, ['n_input_features_', 'n_output_features_']) | ||
|
||
X = check_array(X) | ||
X = check_array(X, dtype=FLOAT_DTYPES) | ||
n_samples, n_features = X.shape | ||
|
||
if n_features != self.n_input_features_: | ||
|
@@ -1313,7 +1313,7 @@ def binarize(X, threshold=0.0, copy=True): | |
|
||
Parameters | ||
---------- | ||
XX : {array-like, sparse matrix}, shape [n_samples, n_features] | ||
X : {array-like, sparse matrix}, shape [n_samples, n_features] | ||
The data to binarize, element by element. | ||
scipy.sparse matrices should be in CSR or CSC format to avoid an | ||
un-necessary copy. | ||
|
@@ -1438,7 +1438,7 @@ def fit(self, K, y=None): | |
------- | ||
self : returns an instance of self. | ||
""" | ||
K = check_array(K) | ||
K = check_array(K, dtype=FLOAT_DTYPES) | ||
n_samples = K.shape[0] | ||
self.K_fit_rows_ = np.sum(K, axis=0) / n_samples | ||
self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples | ||
|
@@ -1461,9 +1461,7 @@ def transform(self, K, y=None, copy=True): | |
""" | ||
check_is_fitted(self, 'K_fit_all_') | ||
|
||
K = check_array(K) | ||
if copy: | ||
K = K.copy() | ||
K = check_array(K, copy=copy, dtype=FLOAT_DTYPES) | ||
|
||
K_pred_cols = (np.sum(K, axis=1) / | ||
self.K_fit_rows_.shape[0])[:, np.newaxis] | ||
|
@@ -1503,7 +1501,7 @@ def add_dummy_feature(X, value=1.0): | |
array([[ 1., 0., 1.], | ||
[ 1., 1., 0.]]) | ||
""" | ||
X = check_array(X, accept_sparse=['csc', 'csr', 'coo']) | ||
X = check_array(X, accept_sparse=['csc', 'csr', 'coo'], dtype=FLOAT_DTYPES) | ||
n_samples, n_features = X.shape | ||
shape = (n_samples, n_features + 1) | ||
if sparse.issparse(X): | ||
|
@@ -1558,7 +1556,7 @@ def _transform_selected(X, transform, selected="all", copy=True): | |
if selected == "all": | ||
return transform(X) | ||
|
||
X = check_array(X, accept_sparse='csc', copy=copy) | ||
X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES) | ||
|
||
if len(selected) == 0: | ||
return X | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
|
||
import numpy as np | ||
from scipy import sparse | ||
|
||
|
@@ -12,7 +13,7 @@ | |
from sklearn import grid_search | ||
from sklearn import tree | ||
from sklearn.random_projection import sparse_random_matrix | ||
|
||
|
||
def _check_statistics(X, X_true, | ||
strategy, statistics, missing_values): | ||
|
@@ -121,6 +122,18 @@ def test_imputation_mean_median_only_zero(): | |
statistics_median, 0) | ||
|
||
|
||
def safe_median(arr, *args, **kwargs):
    """Median that is safe on empty input.

    np.median([]) raises a TypeError for numpy >= 1.10.1, so return
    np.nan for an empty array/sequence instead of delegating to numpy.
    Extra positional/keyword arguments are forwarded to np.median.
    """
    # ndarrays expose .size; plain sequences fall back to len().
    length = arr.size if hasattr(arr, 'size') else len(arr)
    return np.nan if length == 0 else np.median(arr, *args, **kwargs)
|
||
|
||
def safe_mean(arr, *args, **kwargs):
    """Mean that is safe on empty input.

    np.mean([]) raises a RuntimeWarning for numpy >= 1.10.1, so return
    np.nan for an empty array/sequence instead of delegating to numpy.
    Extra positional/keyword arguments are forwarded to np.mean.
    """
    # ndarrays expose .size; plain sequences fall back to len().
    length = arr.size if hasattr(arr, 'size') else len(arr)
    return np.nan if length == 0 else np.mean(arr, *args, **kwargs)
|
||
|
||
def test_imputation_mean_median(): | ||
# Test imputation using the mean and median strategies, when | ||
# missing_values != 0. | ||
|
@@ -134,9 +147,9 @@ def test_imputation_mean_median(): | |
values = np.arange(1, shape[0]+1) | ||
values[4::2] = - values[4::2] | ||
|
||
tests = [("mean", "NaN", lambda z, v, p: np.mean(np.hstack((z, v)))), | ||
tests = [("mean", "NaN", lambda z, v, p: safe_mean(np.hstack((z, v)))), | ||
("mean", 0, lambda z, v, p: np.mean(v)), | ||
("median", "NaN", lambda z, v, p: np.median(np.hstack((z, v)))), | ||
("median", "NaN", lambda z, v, p: safe_median(np.hstack((z, v)))), | ||
("median", 0, lambda z, v, p: np.median(v))] | ||
|
||
for strategy, test_missing_values, true_value_fun in tests: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@lesteve @ogrisel Is this better than
*
since we know which version we are testing against?