[WIP] ENH/FIX Support large sparse matrices by kdhingra307 · Pull Request #9678 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[WIP] ENH/FIX Support large sparse matrices #9678

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9de3138
Fix to Issue #9545, it includes a new param accept_large_sparse to ch…
Sep 3, 2017
ecada4d
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
kdhingra307 Nov 22, 2017
3651c17
Cleared Flake8 issues and also chained if loop issue, test cases on t…
kdhingra307 Dec 27, 2017
9e43610
Changed Naming Conventions
kdhingra307 Dec 27, 2017
a049743
test cases added, flake should not give any error now
kdhingra307 Dec 30, 2017
9e60f95
More checks
kdhingra307 Jan 3, 2018
62268ba
Appended Estimator file with int64 matrices
kdhingra307 Jan 17, 2018
7ad32c7
Checked flake and scipy errors
kdhingra307 Jan 19, 2018
2d11dca
Merged- latest changes
kdhingra307 Jan 19, 2018
3588adf
Estimator Check now supports int32 and int64 based indices, _large_sp…
kdhingra307 Feb 6, 2018
a3fdc01
Merged to base
kdhingra307 Feb 6, 2018
77281d7
Merged to latest remote
kdhingra307 Feb 8, 2018
7cd25f9
Dummy Estimator added, also SAG 64bit indices error case handled
kdhingra307 Feb 9, 2018
9591b37
Merge branch 'master' of http://github.com/scikit-learn/scikit-learn …
Feb 11, 2018
1f530e0
Added scipy check, along with tuple of matrix format
Feb 11, 2018
0d00f4c
Removed extra parentheses and used LooseVersion instead of NumpyVersion
Feb 11, 2018
11690ef
Added check for unsupported scipy version in validation.py along with…
Feb 12, 2018
14f0384
modified sparse version check for test_validation
Feb 12, 2018
f9fe4b8
Reverted all of the unnecessary changes
Feb 13, 2018
9f6e0f7
Reverted unnecessary changes, which got missed in previous commit
Feb 13, 2018
1d0a282
sorted flake8 issue in nmf.py
Feb 13, 2018
0648248
Revised whole algorithms from ground up plus more flatter now
Feb 13, 2018
4e8b9a0
Changed few norms, default case still needed to be changed
kdhingra307 Mar 3, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions sklearn/decomposition/nmf.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ def _fit_coordinate_descent(X, W, H, tol=1e-4, max_iter=200, l1_reg_W=0,
"""
# so W and Ht are both in C order in memory
Ht = check_array(H.T, order='C')
X = check_array(X, accept_sparse='csr')
X = check_array(X, accept_sparse='csr', accept_large_sparse=True)

rng = check_random_state(random_state)

Expand Down Expand Up @@ -972,7 +972,8 @@ def non_negative_factorization(X, W=None, H=None, n_components=None,
factorization with the beta-divergence. Neural Computation, 23(9).
"""

X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)
X = check_array(X, accept_sparse=('csr', 'csc'),
dtype=float, accept_large_sparse=True)
check_non_negative(X, "NMF (input X)")
beta_loss = _check_string_param(solver, regularization, beta_loss, init)

Expand Down Expand Up @@ -1225,7 +1226,8 @@ def fit_transform(self, X, y=None, W=None, H=None):
W : array, shape (n_samples, n_components)
Transformed data.
"""
X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)
X = check_array(X, accept_sparse=('csr', 'csc'),
dtype=float, accept_large_sparse=True)

W, H, n_iter_ = non_negative_factorization(
X=X, W=W, H=H, n_components=self.n_components, init=self.init,
Expand Down
5 changes: 3 additions & 2 deletions sklearn/decomposition/truncated_svd.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,8 @@ def fit_transform(self, X, y=None):
X_new : array, shape (n_samples, n_components)
Reduced version of X. This will always be a dense array.
"""
X = check_array(X, accept_sparse=['csr', 'csc'])
X = check_array(X, accept_sparse=[
'csr', 'csc'], accept_large_sparse=True)
random_state = check_random_state(self.random_state)

if self.algorithm == "arpack":
Expand Down Expand Up @@ -207,7 +208,7 @@ def transform(self, X):
X_new : array, shape (n_samples, n_components)
Reduced version of X. This will always be a dense array.
"""
X = check_array(X, accept_sparse='csr')
X = check_array(X, accept_sparse='csr', accept_large_sparse=True)
return safe_sparse_dot(X, self.components_.T)

def inverse_transform(self, X):
Expand Down
8 changes: 5 additions & 3 deletions sklearn/linear_model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,8 @@ def fit(self, X, y):
def _decision_function(self, X):
check_is_fitted(self, "coef_")

X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
X = check_array(X, accept_sparse=[
'csr', 'csc', 'coo'], accept_large_sparse=True)
return safe_sparse_dot(X, self.coef_.T,
dense_output=True) + self.intercept_

Expand Down Expand Up @@ -297,7 +298,7 @@ class would be predicted.
raise NotFittedError("This %(name)s instance is not fitted "
"yet" % {'name': type(self).__name__})

X = check_array(X, accept_sparse='csr')
X = check_array(X, accept_sparse='csr', accept_large_sparse=True)

n_features = self.coef_.shape[1]
if X.shape[1] != n_features:
Expand Down Expand Up @@ -479,7 +480,8 @@ def fit(self, X, y, sample_weight=None):

n_jobs_ = self.n_jobs
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
y_numeric=True, multi_output=True)
y_numeric=True, multi_output=True,
accept_large_sparse=True)

if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
raise ValueError("Sample weights must be 1D array or scalar")
Expand Down
8 changes: 6 additions & 2 deletions sklearn/linear_model/logistic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1216,8 +1216,12 @@ def fit(self, X, y, sample_weight=None):
else:
_dtype = np.float64

X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype,
order="C")
if self.solver not in ['sag', 'saga']:
X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype,
order="C", accept_large_sparse=True)
else:
X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype,
order="C")
check_classification_targets(y)
self.classes_ = np.unique(y)
n_samples, n_features = X.shape
Expand Down
3 changes: 2 additions & 1 deletion sklearn/preprocessing/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1443,7 +1443,8 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False):
raise ValueError("'%d' is not a supported axis" % axis)

X = check_array(X, sparse_format, copy=copy,
estimator='the normalize function', dtype=FLOAT_DTYPES)
estimator='the normalize function', dtype=FLOAT_DTYPES,
accept_large_sparse=True)
if axis == 0:
X = X.T

Expand Down
12 changes: 9 additions & 3 deletions sklearn/svm/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ..utils import column_or_1d, check_X_y
from ..utils import compute_class_weight
from ..utils.extmath import safe_sparse_dot
from ..utils.validation import check_is_fitted
from ..utils.validation import check_is_fitted, _check_large_sparse
from ..utils.multiclass import check_classification_targets
from ..externals import six
from ..exceptions import ConvergenceWarning
Expand Down Expand Up @@ -144,7 +144,8 @@ def fit(self, X, y, sample_weight=None):
raise TypeError("Sparse precomputed kernels are not supported.")
self._sparse = sparse and not callable(self.kernel)

X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
X, y = check_X_y(X, y, dtype=np.float64,
order='C', accept_sparse='csr')
y = self._validate_targets(y)

sample_weight = np.asarray([]
Expand Down Expand Up @@ -188,7 +189,8 @@ def fit(self, X, y, sample_weight=None):
self.shape_fit_ = X.shape

# In binary case, we need to flip the sign of coef, intercept and
# decision function. Use self._intercept_ and self._dual_coef_ internally.
# decision function. Use self._intercept_ and self._dual_coef_
# internally.
self._intercept_ = self.intercept_.copy()
self._dual_coef_ = self.dual_coef_
if self._impl in ['c_svc', 'nu_svc'] and len(self.classes_) == 2:
Expand Down Expand Up @@ -864,6 +866,10 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,
libsvm_sparse.set_verbosity_wrap(verbose)
liblinear.set_verbosity_wrap(verbose)

# Liblinear doesn't support 64bit sparse matrix indices yet
if sp.issparse(X):
_check_large_sparse(X)

# LibLinear wants targets as doubles, even for classification
y_ind = np.asarray(y_ind, dtype=np.float64).ravel()
if sample_weight is None:
Expand Down
59 changes: 50 additions & 9 deletions sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@

from sklearn.utils import shuffle
from sklearn.utils.fixes import signature
from sklearn.utils.validation import has_fit_parameter, _num_samples
from sklearn.utils.validation import (has_fit_parameter, _num_samples,
LARGE_SPARSE_SUPPORTED)
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris, load_boston, make_blobs

Expand Down Expand Up @@ -403,6 +404,40 @@ def pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel):
return X


def _generate_sparse_matrix(X_csr):
"""Generate matrices in multiple formats for CSR,CSC and COO matrices
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know it's a private function, but this could be reformulated to,

Generate 32 and 64 bit indexed sparse matrices in CSR, CSC and COO format


Parameters
----------

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove whitespace (also below Returns)

X_csr: CSR Matrix
Input matrix in CSR format

Returns
-------

out: iter(Matrices)
In format['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo',
'coo_64', 'csc_64', 'csr_64']
"""

for sparse_format in ['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo']:
yield sparse_format, X_csr.asformat(sparse_format)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add a X_csr.copy().asformat here (and on L430) ?

Just to be sure that we are not taking in e.g. a CSR matrix, returning it as it is when sparse_format="csr" that get's then in place modified by some estimator and so the next iteration of sparse_format="csc" would use this modified version.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be a copy=True kwarg iirc


if LARGE_SPARSE_SUPPORTED:
# Generate large indices matrix only if its supported by scipy
X_coo = X_csr.asformat('coo')
X_coo.row = X_coo.row.astype('int64')
X_coo.col = X_coo.col.astype('int64')
yield "coo_64", X_coo

for sparse_format in ['csc', 'csr']:
X = X_csr.asformat(sparse_format)
X.indices = X.indices.astype('int64')
X.indptr = X.indptr.astype('int64')
yield sparse_format + "_64", X


def check_estimator_sparse_data(name, estimator_orig):

rng = np.random.RandomState(0)
Expand All @@ -415,8 +450,7 @@ def check_estimator_sparse_data(name, estimator_orig):
with ignore_warnings(category=DeprecationWarning):
estimator = clone(estimator_orig)
y = multioutput_estimator_convert_y_2d(estimator, y)
for sparse_format in ['csr', 'csc', 'dok', 'lil', 'coo', 'dia', 'bsr']:
X = X_csr.asformat(sparse_format)
for matrix_format, X in _generate_sparse_matrix(X_csr):
# catch deprecation warnings
with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
if name in ['Scaler', 'StandardScaler']:
Expand All @@ -435,12 +469,19 @@ def check_estimator_sparse_data(name, estimator_orig):
assert_equal(probs.shape, (X.shape[0], 4))
except (TypeError, ValueError) as e:
if 'sparse' not in repr(e).lower():
print("Estimator %s doesn't seem to fail gracefully on "
"sparse data: error message state explicitly that "
"sparse input is not supported if this is not the case."
% name)
raise
except Exception:
if "64" in matrix_format:
raise AssertionError("Estimator %s doesn't seem to "
"support %s matrix yet, also it "
"has not been handled gracefully"
" by accept_large_sparse."
% (name, matrix_format))
else:
print("Estimator %s doesn't seem to fail gracefully on "
"sparse data: error message state explicitly that "
"sparse input is not supported if this is not"
" the case." % name)
raise
except Exception as e:
print("Estimator %s doesn't seem to fail gracefully on "
"sparse data: it should raise a TypeError if sparse input "
"is explicitly not supported." % name)
Expand Down
33 changes: 31 additions & 2 deletions sklearn/utils/tests/test_estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import sys

import numpy as np

import scipy.sparse as sp

from sklearn.externals.six.moves import cStringIO as StringIO
Expand All @@ -24,7 +23,8 @@
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import check_X_y, check_array
from sklearn.utils.validation import (check_X_y, check_array,
LARGE_SPARSE_SUPPORTED)


class CorrectNotFittedError(ValueError):
Expand Down Expand Up @@ -161,6 +161,26 @@ def predict(self, X):
return np.zeros(X.shape[0])


class LargeSparseNotSetClassifier(BaseEstimator):
    """Dummy estimator that claims large-sparse support but rejects it.

    ``fit`` validates its input with ``accept_large_sparse=True`` and then
    raises a ValueError as soon as a COO, CSC or CSR matrix carries 64-bit
    indices. ``test_check_estimator`` uses it to verify that
    ``check_estimator`` reports such an estimator with an AssertionError.
    """

    def fit(self, X, y):
        """Validate ``X`` and ``y``; raise ValueError on 64-bit indices.

        Returns
        -------
        self
        """
        X, y = check_X_y(X, y,
                         accept_sparse=("csr", "csc", "coo"),
                         accept_large_sparse=True,
                         multi_output=True,
                         y_numeric=True)
        if sp.issparse(X):
            if X.getformat() == "coo":
                # Both index arrays of a COO matrix have to be inspected.
                if X.row.dtype == "int64" or X.col.dtype == "int64":
                    raise ValueError(
                        "Estimator doesn't support 64-bit indices")
            elif X.getformat() in ["csc", "csr"]:
                if X.indices.dtype == "int64" or X.indptr.dtype == "int64":
                    # NOTE(review): coverage reports this raise as never
                    # executed - presumably the "coo_64" input already makes
                    # the sparse-data check fail before any CSC/CSR 64-bit
                    # matrix is tried; confirm.
                    raise ValueError(
                        "Estimator doesn't support 64-bit indices")

        return self


def test_check_estimator():
# tests that the estimator actually fails on "bad" estimators.
# not a complete test of all checks, which are very extensive.
Expand Down Expand Up @@ -235,6 +255,15 @@ def test_check_estimator():
sys.stdout = old_stdout
assert_true(msg in string_buffer.getvalue())

# Large indices test on bad estimator
msg = ('Estimator LargeSparseNotSetClassifier doesn\'t seem to support '
r'\S{3}_64 matrix yet, also it has not been handled gracefully by '
'accept_large_sparse.')
# 64-bit indices are only supported by scipy 0.14.0 or above
if LARGE_SPARSE_SUPPORTED:
assert_raises_regex(AssertionError, msg, check_estimator,
LargeSparseNotSetClassifier)

# doesn't error on actual estimator
check_estimator(AdaBoostClassifier)
check_estimator(AdaBoostClassifier())
Expand Down
40 changes: 39 additions & 1 deletion sklearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import pytest
import numpy as np
import scipy.sparse as sp
from scipy import __version__ as scipy_version

from sklearn.utils.testing import assert_true, assert_false, assert_equal
from sklearn.utils.testing import assert_raises
Expand All @@ -35,7 +36,8 @@
check_is_fitted,
check_consistent_length,
assert_all_finite,
check_memory
check_memory,
LARGE_SPARSE_SUPPORTED
)
import sklearn

Expand Down Expand Up @@ -427,6 +429,42 @@ def test_check_array_accept_sparse_no_exception():
check_array(X_csr, accept_sparse=('csr',))


def test_check_array_accept_large_sparse_no_exception():
    """check_array accepts 64-bit indexed CSR if accept_large_sparse=True."""
    if not LARGE_SPARSE_SUPPORTED:
        # Report the test as skipped instead of silently passing.
        pytest.skip("scipy does not support 64-bit sparse indices")

    # When large sparse are allowed
    X = sp.rand(10, 1000, format='csr')
    X.indices = X.indices.astype('int64')
    X.indptr = X.indptr.astype('int64')
    check_array(X, accept_large_sparse=True, accept_sparse=True)


def test_check_array_accept_large_sparse_raise_exception():
    """check_array rejects 64-bit indexed CSR if accept_large_sparse=False."""
    if not LARGE_SPARSE_SUPPORTED:
        # Report the test as skipped instead of silently passing.
        pytest.skip("scipy does not support 64-bit sparse indices")

    # When large sparse are not allowed
    X = sp.rand(10, 1000, format='csr')
    X.indices = X.indices.astype('int64')
    X.indptr = X.indptr.astype('int64')
    msg = ("Only sparse matrices with 32-bit integer indices"
           " are accepted. Got int64 indices.")
    assert_raise_message(ValueError, msg,
                         check_array, X, accept_sparse=True,
                         accept_large_sparse=False)


def test_check_array_large_indices_non_supported_scipy_version():
    """check_array rejects 64-bit indices when scipy cannot handle them."""
    if LARGE_SPARSE_SUPPORTED:
        # Only meaningful on old scipy; report as skipped rather than
        # silently passing everywhere else.
        pytest.skip("installed scipy supports 64-bit sparse indices")

    # Large indices should not be allowed for scipy<0.14.0
    X = sp.rand(10, 1000, format='csr')
    X.indices = X.indices.astype('int64')
    X.indptr = X.indptr.astype('int64')
    msg = ("Scipy version %s does not support large"
           " indices, please upgrade your scipy"
           " to 0.14.0 or above" % scipy_version)

    assert_raise_message(ValueError, msg, check_array,
                         X, accept_sparse='csc')


def test_check_array_min_samples_and_features_messages():
# empty list is considered 2D by default:
msg = "0 feature(s) (shape=(1, 0)) while a minimum of 1 is required."
Expand Down
Loading
0