-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
[WIP] ENH/FIX Support large sparse matrices #9678
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9de3138
ecada4d
3651c17
9e43610
a049743
9e60f95
62268ba
7ad32c7
2d11dca
3588adf
a3fdc01
77281d7
7cd25f9
9591b37
1f530e0
0d00f4c
11690ef
14f0384
f9fe4b8
9f6e0f7
1d0a282
0648248
4e8b9a0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,7 +54,8 @@ | |
|
||
from sklearn.utils import shuffle | ||
from sklearn.utils.fixes import signature | ||
from sklearn.utils.validation import has_fit_parameter, _num_samples | ||
from sklearn.utils.validation import (has_fit_parameter, _num_samples, | ||
LARGE_SPARSE_SUPPORTED) | ||
from sklearn.preprocessing import StandardScaler | ||
from sklearn.datasets import load_iris, load_boston, make_blobs | ||
|
||
|
@@ -403,6 +404,40 @@ def pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel): | |
return X | ||
|
||
|
||
def _generate_sparse_matrix(X_csr): | ||
"""Generate matrices in multiple formats for CSR,CSC and COO matrices | ||
|
||
Parameters | ||
---------- | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove whitespace (also below Returns) |
||
X_csr: CSR Matrix | ||
Input matrix in CSR format | ||
|
||
Returns | ||
------- | ||
|
||
out: iter(Matrices) | ||
In format['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo', | ||
'coo_64', 'csc_64', 'csr_64'] | ||
""" | ||
|
||
for sparse_format in ['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo']: | ||
yield sparse_format, X_csr.asformat(sparse_format) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we add a Just to be sure that we are not taking in e.g. a CSR matrix, returning it as it is when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There should be a copy=True kwarg iirc |
||
|
||
if LARGE_SPARSE_SUPPORTED: | ||
# Generate large indices matrix only if its supported by scipy | ||
X_coo = X_csr.asformat('coo') | ||
X_coo.row = X_coo.row.astype('int64') | ||
X_coo.col = X_coo.col.astype('int64') | ||
yield "coo_64", X_coo | ||
|
||
for sparse_format in ['csc', 'csr']: | ||
X = X_csr.asformat(sparse_format) | ||
X.indices = X.indices.astype('int64') | ||
X.indptr = X.indptr.astype('int64') | ||
yield sparse_format + "_64", X | ||
|
||
|
||
def check_estimator_sparse_data(name, estimator_orig): | ||
|
||
rng = np.random.RandomState(0) | ||
|
@@ -415,8 +450,7 @@ def check_estimator_sparse_data(name, estimator_orig): | |
with ignore_warnings(category=DeprecationWarning): | ||
estimator = clone(estimator_orig) | ||
y = multioutput_estimator_convert_y_2d(estimator, y) | ||
for sparse_format in ['csr', 'csc', 'dok', 'lil', 'coo', 'dia', 'bsr']: | ||
X = X_csr.asformat(sparse_format) | ||
for matrix_format, X in _generate_sparse_matrix(X_csr): | ||
# catch deprecation warnings | ||
with ignore_warnings(category=(DeprecationWarning, FutureWarning)): | ||
if name in ['Scaler', 'StandardScaler']: | ||
|
@@ -435,12 +469,19 @@ def check_estimator_sparse_data(name, estimator_orig): | |
assert_equal(probs.shape, (X.shape[0], 4)) | ||
except (TypeError, ValueError) as e: | ||
if 'sparse' not in repr(e).lower(): | ||
print("Estimator %s doesn't seem to fail gracefully on " | ||
"sparse data: error message state explicitly that " | ||
"sparse input is not supported if this is not the case." | ||
% name) | ||
raise | ||
except Exception: | ||
if "64" in matrix_format: | ||
raise AssertionError("Estimator %s doesn't seem to " | ||
"support %s matrix yet, also it " | ||
"has not been handled gracefully" | ||
" by accept_large_sparse." | ||
% (name, matrix_format)) | ||
else: | ||
print("Estimator %s doesn't seem to fail gracefully on " | ||
"sparse data: error message state explicitly that " | ||
"sparse input is not supported if this is not" | ||
" the case." % name) | ||
raise | ||
except Exception as e: | ||
print("Estimator %s doesn't seem to fail gracefully on " | ||
"sparse data: it should raise a TypeError if sparse input " | ||
"is explicitly not supported." % name) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,6 @@ | |
import sys | ||
|
||
import numpy as np | ||
|
||
import scipy.sparse as sp | ||
|
||
from sklearn.externals.six.moves import cStringIO as StringIO | ||
|
@@ -24,7 +23,8 @@ | |
from sklearn.linear_model import MultiTaskElasticNet | ||
from sklearn.svm import SVC | ||
from sklearn.neighbors import KNeighborsRegressor | ||
from sklearn.utils.validation import check_X_y, check_array | ||
from sklearn.utils.validation import (check_X_y, check_array, | ||
LARGE_SPARSE_SUPPORTED) | ||
|
||
|
||
class CorrectNotFittedError(ValueError): | ||
|
@@ -161,6 +161,26 @@ def predict(self, X): | |
return np.zeros(X.shape[0]) | ||
|
||
|
||
class LargeSparseNotSetClassifier(BaseEstimator):
    """Deliberately faulty estimator used to exercise the estimator checks.

    ``fit`` passes ``accept_large_sparse=True`` to ``check_X_y`` yet raises
    a ``ValueError`` as soon as the validated sparse input carries int64
    index arrays, i.e. it advertises support it does not have.
    """

    def fit(self, X, y):
        """Validate ``X``/``y`` and reject sparse input with 64-bit indices.

        Parameters
        ----------
        X : array-like or sparse matrix
            Training data; CSR, CSC and COO sparse formats are accepted by
            the validation call.
        y : array-like
            Targets, validated with ``multi_output=True`` and
            ``y_numeric=True``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is a COO matrix whose ``row`` or ``col`` array is
            int64, or a CSC/CSR matrix whose ``indices`` or ``indptr``
            array is int64.
        """
        X, y = check_X_y(X, y,
                         accept_sparse=("csr", "csc", "coo"),
                         accept_large_sparse=True,
                         multi_output=True,
                         y_numeric=True)
        if sp.issparse(X):
            if X.getformat() == "coo":
                # Both coordinate arrays must be inspected; checking only
                # ``col`` would miss a matrix whose ``row`` alone is int64.
                if X.row.dtype == "int64" or X.col.dtype == "int64":
                    raise ValueError(
                        "Estimator doesn't support 64-bit indices")
            elif X.getformat() in ["csc", "csr"]:
                # NOTE(review): coverage reportedly never reaches this
                # branch -- presumably the calling check aborts on the
                # first 64-bit format it tries (coo_64); confirm.
                if X.indices.dtype == "int64" or X.indptr.dtype == "int64":
                    raise ValueError(
                        "Estimator doesn't support 64-bit indices")

        return self
|
||
|
||
def test_check_estimator(): | ||
# tests that the estimator actually fails on "bad" estimators. | ||
# not a complete test of all checks, which are very extensive. | ||
|
@@ -235,6 +255,15 @@ def test_check_estimator(): | |
sys.stdout = old_stdout | ||
assert_true(msg in string_buffer.getvalue()) | ||
|
||
# Large indices test on bad estimator | ||
msg = ('Estimator LargeSparseNotSetClassifier doesn\'t seem to support ' | ||
r'\S{3}_64 matrix yet, also it has not been handled gracefully by ' | ||
'accept_large_sparse.') | ||
# only supported by scipy version more than 0.14.0 | ||
if LARGE_SPARSE_SUPPORTED: | ||
assert_raises_regex(AssertionError, msg, check_estimator, | ||
LargeSparseNotSetClassifier) | ||
|
||
# doesn't error on actual estimator | ||
check_estimator(AdaBoostClassifier) | ||
check_estimator(AdaBoostClassifier()) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know it's a private function, but this could be reformulated to,