PERF: Prevent using copy in astype if not needed #4575

Closed · jrings wants to merge 6 commits
Changes from all commits
2 changes: 1 addition & 1 deletion sklearn/cluster/k_means_.py
@@ -770,7 +770,7 @@ def _check_test_data(self, X):
warnings.warn("Got data type %s, converted to float "
"to avoid overflows" % X.dtype,
RuntimeWarning, stacklevel=2)
- X = X.astype(np.float)
+ X = astype(X, np.float, copy=False)

return X

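Note: the `astype` helper these call sites import from `sklearn.utils.fixes` is not shown in this diff. A minimal sketch of the behavior the call sites rely on (an illustrative reconstruction, not the actual fixes.py source) could look like this:

    import numpy as np

    def astype(array, dtype, copy=True):
        # If the target dtype already matches and the caller permits
        # aliasing, return the input unchanged: no allocation at all.
        if array.dtype == np.dtype(dtype) and not copy:
            return array
        # Otherwise fall back to .astype(), which always allocates and
        # works for both NumPy arrays and scipy sparse matrices.
        return array.astype(dtype)

With this shape, `astype(X, np.float, copy=False)` is a no-op whenever `X` is already a float array, which is precisely the copy this PR's title is about avoiding.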
1 change: 0 additions & 1 deletion sklearn/datasets/base.py
@@ -23,7 +23,6 @@

from ..utils import check_random_state


class Bunch(dict):
"""Container object for datasets

5 changes: 3 additions & 2 deletions sklearn/datasets/twenty_newsgroups.py
@@ -50,6 +50,7 @@
from .base import Bunch
from .base import load_files
from ..utils import check_random_state
+ from ..utils.fixes import astype
from ..feature_extraction.text import CountVectorizer
from ..preprocessing import normalize
from ..externals import joblib, six
@@ -345,8 +346,8 @@ def fetch_20newsgroups_vectorized(subset="train", remove=(), data_home=None):

# the data is stored as int16 for compactness
# but normalize needs floats
- X_train = X_train.astype(np.float64)
- X_test = X_test.astype(np.float64)
+ X_train = astype(X_train, np.float64, copy=False)
+ X_test = astype(X_test, np.float64, copy=False)
normalize(X_train, copy=False)
normalize(X_test, copy=False)

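The 20newsgroups matrices are large sparse arrays stored as int16, so the first cast to float64 has to allocate either way; `copy=False` pays off on inputs that already arrive as floats. A small demonstration (using the real helper where available, otherwise the sketch shown after the first file):

    import numpy as np
    import scipy.sparse as sp

    try:  # compat helper from this era of scikit-learn (removed later)
        from sklearn.utils.fixes import astype
    except ImportError:
        def astype(array, dtype, copy=True):
            if array.dtype == np.dtype(dtype) and not copy:
                return array
            return array.astype(dtype)

    # Hypothetical stand-in for the int16 20newsgroups matrix.
    X = sp.rand(100, 50, density=0.1, format='csr').astype(np.int16)

    X_f = astype(X, np.float64, copy=False)  # dtypes differ: allocates anyway
    assert X_f is not X
    assert astype(X_f, np.float64, copy=False) is X_f  # dtype matches: free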
4 changes: 2 additions & 2 deletions sklearn/ensemble/gradient_boosting.py
@@ -37,7 +37,7 @@
from ..utils import check_random_state, check_array, check_X_y, column_or_1d
from ..utils import check_consistent_length, deprecated
from ..utils.extmath import logsumexp
- from ..utils.fixes import expit, bincount
+ from ..utils.fixes import astype, expit, bincount
from ..utils.stats import _weighted_percentile
from ..utils.validation import check_is_fitted, NotFittedError
from ..externals import six
@@ -1180,7 +1180,7 @@ def feature_importances_(self):
def _validate_y(self, y):
self.n_classes_ = 1
if y.dtype.kind == 'O':
- y = y.astype(np.float64)
+ y = astype(y, np.float64, copy=False)
# Default implementation
return y

3 changes: 2 additions & 1 deletion sklearn/linear_model/stochastic_gradient.py
@@ -22,6 +22,7 @@
from ..externals import six

from .sgd_fast import plain_sgd, average_sgd
+ from ..utils.fixes import astype
from ..utils.seq_dataset import ArrayDataset, CSRDataset
from ..utils import compute_class_weight
from .sgd_fast import Hinge
@@ -867,7 +868,7 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate,
n_iter, sample_weight,
coef_init, intercept_init):
X, y = check_X_y(X, y, "csr", copy=False, order='C', dtype=np.float64)
- y = y.astype(np.float64)
+ y = astype(y, np.float64, copy=False)

n_samples, n_features = X.shape

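Note that `check_X_y(..., dtype=np.float64)` above enforces the dtype of `X` only, so the follow-up cast of `y` is still required; with the fixture it becomes free whenever `y` already arrives as float64. On NumPy >= 1.7 this is native `ndarray.astype` behavior, which the fixture presumably forwards to:

    import numpy as np

    y = np.array([0, 1, 1])                  # integer labels
    y64 = y.astype(np.float64, copy=False)   # int -> float: must allocate
    assert y64 is not y
    assert y64.astype(np.float64, copy=False) is y64  # same dtype: no copy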
5 changes: 3 additions & 2 deletions sklearn/manifold/locally_linear.py
@@ -10,6 +10,7 @@
from ..base import BaseEstimator, TransformerMixin
from ..utils import check_random_state, check_array
from ..utils.arpack import eigsh
+ from ..utils.fixes import astype
from ..utils.validation import check_is_fitted
from ..neighbors import NearestNeighbors

@@ -43,9 +44,9 @@ def barycenter_weights(X, Z, reg=1e-3):

n_samples, n_neighbors = X.shape[0], Z.shape[1]
if X.dtype.kind == 'i':
- X = X.astype(np.float)
+ X = astype(X, np.float, copy=False)
if Z.dtype.kind == 'i':
- Z = Z.astype(np.float)
+ Z = astype(Z, np.float, copy=False)
B = np.empty((n_samples, n_neighbors), dtype=X.dtype)
v = np.ones(n_neighbors, dtype=X.dtype)

9 changes: 6 additions & 3 deletions sklearn/manifold/spectral_embedding_.py
@@ -15,6 +15,7 @@
from ..externals import six
from ..utils import check_random_state, check_array, check_symmetric
from ..utils.extmath import _deterministic_vector_sign_flip
+ from ..utils.fixes import astype
from ..utils.graph import graph_laplacian
from ..utils.sparsetools import connected_components
from ..utils.arpack import eigsh
@@ -263,7 +264,8 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
# problem.
if not sparse.issparse(laplacian):
warnings.warn("AMG works better for sparse matrices")
- laplacian = laplacian.astype(np.float)  # lobpcg needs native floats
+ # lobpcg needs native floats
+ laplacian = astype(laplacian, np.float, copy=False)
laplacian = _set_diag(laplacian, 1)
ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
M = ml.aspreconditioner()
@@ -276,7 +278,8 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
raise ValueError

elif eigen_solver == "lobpcg":
- laplacian = laplacian.astype(np.float)  # lobpcg needs native floats
+ # lobpcg needs native floats
+ laplacian = astype(laplacian, np.float, copy=True)
if n_nodes < 5 * n_components + 1:
# see note above under arpack why lobpcg has problems with small
# number of nodes
@@ -287,7 +290,7 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
embedding = diffusion_map.T[:n_components] * dd
else:
# lobpcg needs native floats
- laplacian = laplacian.astype(np.float)
+ laplacian = astype(laplacian, np.float, copy=False)
laplacian = _set_diag(laplacian, 1)
# We increase the number of eigenvectors requested, as lobpcg
# doesn't behave well in low dimension
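Of the three conversions above, the standalone lobpcg branch keeps `copy=True` while the other two switch to `copy=False`; keeping the copy there preserves the original always-copy semantics of `.astype()` in a branch where dropping it was not shown to be safe. The trade-off with `copy=False` is aliasing: the result can be the very same object that went in, so later in-place edits (such as `_set_diag`) write through to it:

    import numpy as np

    a = np.zeros(3, dtype=np.float64)
    b = a.astype(np.float64, copy=False)  # dtype matches: b aliases a
    b[0] = 1.0
    print(a[0])  # 1.0 -- in-place edits flow back through the alias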
5 changes: 3 additions & 2 deletions sklearn/naive_bayes.py
@@ -27,6 +27,7 @@
from .preprocessing import label_binarize
from .utils import check_X_y, check_array
from .utils.extmath import safe_sparse_dot, logsumexp
+ from .utils.fixes import astype
from .utils.multiclass import _check_partial_fit_first_call
from .utils.fixes import in1d
from .utils.validation import check_is_fitted
@@ -473,7 +474,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
raise ValueError(msg % (X.shape[0], y.shape[0]))

# convert to float to support sample weight consistently
- Y = Y.astype(np.float64)
+ Y = astype(Y, np.float64, copy=False)
if sample_weight is not None:
Y *= check_array(sample_weight).T

@@ -522,7 +523,7 @@ def fit(self, X, y, sample_weight=None):

# convert to float to support sample weight consistently;
# this means we also don't have to cast X to floating point
- Y = Y.astype(np.float64)
+ Y = astype(Y, np.float64, copy=False)
if sample_weight is not None:
Y *= check_array(sample_weight).T

10 changes: 5 additions & 5 deletions sklearn/preprocessing/data.py
@@ -17,9 +17,9 @@
from ..utils import check_array
from ..utils import warn_if_not_float
from ..utils.extmath import row_norms
- from ..utils.fixes import (combinations_with_replacement as combinations_w_r,
-                            bincount)
- from ..utils.fixes import isclose
+ from ..utils.fixes import (astype,
+                            combinations_with_replacement as combinations_w_r,
+                            bincount, isclose)
from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
inplace_csr_row_normalize_l2)
from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis)
@@ -347,7 +347,7 @@ def fit(self, X, y=None):
X = check_array(X, accept_sparse='csr', copy=self.copy,
ensure_2d=False)
if warn_if_not_float(X, estimator=self):
- X = X.astype(np.float)
+ X = astype(X, np.float, copy=False)
if sparse.issparse(X):
if self.with_mean:
raise ValueError(
@@ -380,7 +380,7 @@ def transform(self, X, y=None, copy=None):
copy = copy if copy is not None else self.copy
X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False)
if warn_if_not_float(X, estimator=self):
- X = X.astype(np.float)
+ X = astype(X, np.float, copy=False)
if sparse.issparse(X):
if self.with_mean:
raise ValueError(
3 changes: 2 additions & 1 deletion sklearn/utils/graph.py
@@ -13,6 +13,7 @@
import numpy as np
from scipy import sparse

+ from .fixes import astype
from .graph_shortest_path import graph_shortest_path


@@ -113,7 +114,7 @@ def graph_laplacian(csgraph, normed=False, return_diag=False):

if normed and (np.issubdtype(csgraph.dtype, np.int)
or np.issubdtype(csgraph.dtype, np.uint)):
- csgraph = csgraph.astype(np.float)
+ csgraph = astype(csgraph, np.float, copy=False)

if sparse.isspmatrix(csgraph):
return _laplacian_sparse(csgraph, normed=normed,
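The normalized graph Laplacian needs floating-point arithmetic, so integer adjacency matrices are promoted; with the fixture, float inputs now pass through untouched. A usage sketch (assuming a scikit-learn checkout of this era; later releases delegate this to scipy.sparse.csgraph):

    import numpy as np
    from sklearn.utils.graph import graph_laplacian

    adjacency = np.array([[0, 1], [1, 0]])         # integer adjacency
    lap = graph_laplacian(adjacency, normed=True)  # promoted internally
    print(lap.dtype)                               # float64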
4 changes: 2 additions & 2 deletions sklearn/utils/random.py
@@ -8,7 +8,7 @@
import array

from sklearn.utils import check_random_state

+ from sklearn.utils.fixes import astype
from ._random import sample_without_replacement

__all__ = ['sample_without_replacement', 'choice']
@@ -238,7 +238,7 @@ def random_choice_csc(n_samples, classes, class_probability=None,
if classes[j].dtype.kind != 'i':
raise ValueError("class dtype %s is not supported" %
classes[j].dtype)
- classes[j] = classes[j].astype(int)
+ classes[j] = astype(classes[j], int, copy=False)

# use uniform distribution if no class_probability is given
if class_probability is None:
6 changes: 4 additions & 2 deletions sklearn/utils/validation.py
@@ -13,6 +13,7 @@
import scipy.sparse as sp

from ..externals import six
+ from .fixes import astype
from inspect import getargspec


@@ -250,7 +251,8 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
spmatrix = spmatrix.astype(dtype)
else:
# create new
- spmatrix = spmatrix.asformat(accept_sparse[0]).astype(dtype)
+ spmatrix = spmatrix.asformat(accept_sparse[0])
+ spmatrix = spmatrix.astype(dtype)
if force_all_finite:
if not hasattr(spmatrix, "data"):
warnings.warn("Can't check %s sparse matrix for nan or inf."
@@ -444,7 +446,7 @@ def check_X_y(X, y, accept_sparse=None, dtype="numeric", order=None, copy=False,
y = column_or_1d(y, warn=True)
_assert_all_finite(y)
if y_numeric and y.dtype.kind == 'O':
- y = y.astype(np.float64)
+ y = astype(y, np.float64, copy=False)

check_consistent_length(X, y)

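In `check_X_y` the guard `y.dtype.kind == 'O'` already guarantees the dtype differs from float64, so this particular call allocates either way; the change is for consistency with the other call sites. Usage sketch:

    import numpy as np
    from sklearn.utils.validation import check_X_y

    X = np.arange(6.0).reshape(3, 2)
    y = np.array([1.2, 0.5, 3.0], dtype=object)  # numeric, object dtype
    X, y = check_X_y(X, y, y_numeric=True)
    print(y.dtype)  # float64 -- the object-dtype path triggered the cast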