scikit-learn · kdhingra307 · Sep 3, 2017 · Nov 22, 2017 · Dec 27, 2017 · Dec 27, 2017
diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py
@@ -474,7 +474,7 @@ def _fit_coordinate_descent(X, W, H, tol=1e-4, max_iter=200, l1_reg_W=0,
     """
     # so W and Ht are both in C order in memory
     Ht = check_array(H.T, order='C')
-    X = check_array(X, accept_sparse='csr')
+    X = check_array(X, accept_sparse='csr', accept_large_sparse=True)
 
     rng = check_random_state(random_state)
 
@@ -972,7 +972,8 @@ def non_negative_factorization(X, W=None, H=None, n_components=None,
     factorization with the beta-divergence. Neural Computation, 23(9).
     """
 
-    X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)
+    X = check_array(X, accept_sparse=('csr', 'csc'),
+                    dtype=float, accept_large_sparse=True)
     check_non_negative(X, "NMF (input X)")
     beta_loss = _check_string_param(solver, regularization, beta_loss, init)
 
@@ -1225,7 +1226,8 @@ def fit_transform(self, X, y=None, W=None, H=None):
         W : array, shape (n_samples, n_components)
             Transformed data.
         """
-        X = check_array(X, accept_sparse=('csr', 'csc'), dtype=float)
+        X = check_array(X, accept_sparse=('csr', 'csc'),
+                        dtype=float, accept_large_sparse=True)
 
         W, H, n_iter_ = non_negative_factorization(
             X=X, W=W, H=H, n_components=self.n_components, init=self.init,

diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py
@@ -157,7 +157,8 @@ def fit_transform(self, X, y=None):
         X_new : array, shape (n_samples, n_components)
             Reduced version of X. This will always be a dense array.
         """
-        X = check_array(X, accept_sparse=['csr', 'csc'])
+        X = check_array(X, accept_sparse=[
+                        'csr', 'csc'], accept_large_sparse=True)
         random_state = check_random_state(self.random_state)
 
         if self.algorithm == "arpack":
@@ -207,7 +208,7 @@ def transform(self, X):
         X_new : array, shape (n_samples, n_components)
             Reduced version of X. This will always be a dense array.
         """
-        X = check_array(X, accept_sparse='csr')
+        X = check_array(X, accept_sparse='csr', accept_large_sparse=True)
         return safe_sparse_dot(X, self.components_.T)
 
     def inverse_transform(self, X):

diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
@@ -236,7 +236,8 @@ def fit(self, X, y):
     def _decision_function(self, X):
         check_is_fitted(self, "coef_")
 
-        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
+        X = check_array(X, accept_sparse=[
+                        'csr', 'csc', 'coo'], accept_large_sparse=True)
         return safe_sparse_dot(X, self.coef_.T,
                                dense_output=True) + self.intercept_
 
@@ -297,7 +298,7 @@ class would be predicted.
             raise NotFittedError("This %(name)s instance is not fitted "
                                  "yet" % {'name': type(self).__name__})
 
-        X = check_array(X, accept_sparse='csr')
+        X = check_array(X, accept_sparse='csr', accept_large_sparse=True)
 
         n_features = self.coef_.shape[1]
         if X.shape[1] != n_features:
@@ -479,7 +480,8 @@ def fit(self, X, y, sample_weight=None):
 
         n_jobs_ = self.n_jobs
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
-                         y_numeric=True, multi_output=True)
+                         y_numeric=True, multi_output=True,
+                         accept_large_sparse=True)
 
         if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
             raise ValueError("Sample weights must be 1D array or scalar")

diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py
@@ -1216,8 +1216,12 @@ def fit(self, X, y, sample_weight=None):
         else:
             _dtype = np.float64
 
-        X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype,
-                         order="C")
+        if self.solver not in ['sag', 'saga']:
+            X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype,
+                             order="C", accept_large_sparse=True)
+        else:
+            X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype,
+                             order="C")
         check_classification_targets(y)
         self.classes_ = np.unique(y)
         n_samples, n_features = X.shape

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
@@ -1443,7 +1443,8 @@ def normalize(X, norm='l2', axis=1, copy=True, return_norm=False):
         raise ValueError("'%d' is not a supported axis" % axis)
 
     X = check_array(X, sparse_format, copy=copy,
-                    estimator='the normalize function', dtype=FLOAT_DTYPES)
+                    estimator='the normalize function', dtype=FLOAT_DTYPES,
+                    accept_large_sparse=True)
     if axis == 0:
         X = X.T
 

diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py
@@ -14,7 +14,7 @@
 from ..utils import column_or_1d, check_X_y
 from ..utils import compute_class_weight
 from ..utils.extmath import safe_sparse_dot
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, _check_large_sparse
 from ..utils.multiclass import check_classification_targets
 from ..externals import six
 from ..exceptions import ConvergenceWarning
@@ -144,7 +144,8 @@ def fit(self, X, y, sample_weight=None):
             raise TypeError("Sparse precomputed kernels are not supported.")
         self._sparse = sparse and not callable(self.kernel)
 
-        X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
+        X, y = check_X_y(X, y, dtype=np.float64,
+                         order='C', accept_sparse='csr')
         y = self._validate_targets(y)
 
         sample_weight = np.asarray([]
@@ -188,7 +189,8 @@ def fit(self, X, y, sample_weight=None):
         self.shape_fit_ = X.shape
 
         # In binary case, we need to flip the sign of coef, intercept and
-        # decision function. Use self._intercept_ and self._dual_coef_ internally.
+        # decision function. Use self._intercept_ and self._dual_coef_
+        # internally.
         self._intercept_ = self.intercept_.copy()
         self._dual_coef_ = self.dual_coef_
         if self._impl in ['c_svc', 'nu_svc'] and len(self.classes_) == 2:
@@ -864,6 +866,10 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,
     libsvm_sparse.set_verbosity_wrap(verbose)
     liblinear.set_verbosity_wrap(verbose)
 
+    # Liblinear doesn't support 64bit sparse matrix indices yet
+    if sp.issparse(X):
+        _check_large_sparse(X)
+
     # LibLinear wants targets as doubles, even for classification
     y_ind = np.asarray(y_ind, dtype=np.float64).ravel()
     if sample_weight is None:

diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -54,7 +54,8 @@
 
 from sklearn.utils import shuffle
 from sklearn.utils.fixes import signature
-from sklearn.utils.validation import has_fit_parameter, _num_samples
+from sklearn.utils.validation import (has_fit_parameter, _num_samples,
+                                      LARGE_SPARSE_SUPPORTED)
 from sklearn.preprocessing import StandardScaler
 from sklearn.datasets import load_iris, load_boston, make_blobs
 
@@ -403,6 +404,40 @@ def pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel):
     return X
 
 
+def _generate_sparse_matrix(X_csr):
+    """Yield X_csr converted to several sparse formats and index widths.
+
+    Parameters
+    ----------
+    X_csr : CSR matrix
+        Input matrix in CSR format. Not modified by this generator; the
+        64-bit variants are built on separate matrices.
+
+    Yields
+    ------
+    (name, matrix) : tuple of (str, sparse matrix)
+        name is one of 'dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo'
+        and, if LARGE_SPARSE_SUPPORTED, 'coo_64', 'csc_64', 'csr_64'.
+    """
+
+    for sparse_format in ['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo']:
+        yield sparse_format, X_csr.asformat(sparse_format)
+
+    if LARGE_SPARSE_SUPPORTED:
+        # Generate large indices matrix only if it is supported by scipy
+        X_coo = X_csr.asformat('coo')
+        X_coo.row = X_coo.row.astype('int64')
+        X_coo.col = X_coo.col.astype('int64')
+        yield "coo_64", X_coo
+
+        for sparse_format in ['csc', 'csr']:
+            # asformat('csr') returns X_csr itself, so copy before mutating
+            X = X_csr.asformat(sparse_format).copy()
+            X.indices = X.indices.astype('int64')
+            X.indptr = X.indptr.astype('int64')
+            yield sparse_format + "_64", X
+
+
 def check_estimator_sparse_data(name, estimator_orig):
 
     rng = np.random.RandomState(0)
@@ -415,8 +450,7 @@ def check_estimator_sparse_data(name, estimator_orig):
     with ignore_warnings(category=DeprecationWarning):
         estimator = clone(estimator_orig)
     y = multioutput_estimator_convert_y_2d(estimator, y)
-    for sparse_format in ['csr', 'csc', 'dok', 'lil', 'coo', 'dia', 'bsr']:
-        X = X_csr.asformat(sparse_format)
+    for matrix_format, X in _generate_sparse_matrix(X_csr):
         # catch deprecation warnings
         with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
             if name in ['Scaler', 'StandardScaler']:
@@ -435,12 +469,19 @@ def check_estimator_sparse_data(name, estimator_orig):
                 assert_equal(probs.shape, (X.shape[0], 4))
         except (TypeError, ValueError) as e:
             if 'sparse' not in repr(e).lower():
-                print("Estimator %s doesn't seem to fail gracefully on "
-                      "sparse data: error message state explicitly that "
-                      "sparse input is not supported if this is not the case."
-                      % name)
-                raise
-        except Exception:
+                if "64" in matrix_format:
+                    raise AssertionError("Estimator %s doesn't seem to "
+                                         "support %s matrix yet, also it "
+                                         "has not been handled gracefully"
+                                         " by accept_large_sparse."
+                                         % (name, matrix_format))
+                else:
+                    print("Estimator %s doesn't seem to fail gracefully on "
+                          "sparse data: error message state explicitly that "
+                          "sparse input is not supported if this is not"
+                          " the case." % name)
+                    raise
+        except Exception as e:
             print("Estimator %s doesn't seem to fail gracefully on "
                   "sparse data: it should raise a TypeError if sparse input "
                   "is explicitly not supported." % name)

diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
@@ -2,7 +2,6 @@
 import sys
 
 import numpy as np
-
 import scipy.sparse as sp
 
 from sklearn.externals.six.moves import cStringIO as StringIO
@@ -24,7 +23,8 @@
 from sklearn.linear_model import MultiTaskElasticNet
 from sklearn.svm import SVC
 from sklearn.neighbors import KNeighborsRegressor
-from sklearn.utils.validation import check_X_y, check_array
+from sklearn.utils.validation import (check_X_y, check_array,
+                                      LARGE_SPARSE_SUPPORTED)
 
 
 class CorrectNotFittedError(ValueError):
@@ -161,6 +161,26 @@ def predict(self, X):
         return np.zeros(X.shape[0])
 
 
+class LargeSparseNotSetClassifier(BaseEstimator):
+    # Bad estimator: validation lets 64-bit indices through, then fit()
+    # rejects them with a message that does not mention "sparse".
+    def fit(self, X, y):
+        X, y = check_X_y(X, y, accept_sparse=("csr", "csc", "coo"),
+                         accept_large_sparse=True, multi_output=True,
+                         y_numeric=True)
+        if sp.issparse(X):
+            if X.getformat() == "coo":
+                if X.row.dtype == "int64" or X.col.dtype == "int64":
+                    raise ValueError(
+                        "Estimator doesn't support 64-bit indices")
+            elif X.getformat() in ["csc", "csr"]:
+                if X.indices.dtype == "int64" or X.indptr.dtype == "int64":
+                    raise ValueError(
+                        "Estimator doesn't support 64-bit indices")
+
+        return self
+
+
 def test_check_estimator():
     # tests that the estimator actually fails on "bad" estimators.
     # not a complete test of all checks, which are very extensive.
@@ -235,6 +255,15 @@ def test_check_estimator():
         sys.stdout = old_stdout
     assert_true(msg in string_buffer.getvalue())
 
+    # Large indices test on bad estimator
+    msg = ('Estimator LargeSparseNotSetClassifier doesn\'t seem to support '
+           r'\S{3}_64 matrix yet, also it has not been handled gracefully by '
+           'accept_large_sparse.')
+    # large sparse indices are only supported by scipy 0.14.0 or above
+    if LARGE_SPARSE_SUPPORTED:
+        assert_raises_regex(AssertionError, msg, check_estimator,
+                            LargeSparseNotSetClassifier)
+
     # doesn't error on actual estimator
     check_estimator(AdaBoostClassifier)
     check_estimator(AdaBoostClassifier())

diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
@@ -9,6 +9,7 @@
 import pytest
 import numpy as np
 import scipy.sparse as sp
+from scipy import __version__ as scipy_version
 
 from sklearn.utils.testing import assert_true, assert_false, assert_equal
 from sklearn.utils.testing import assert_raises
@@ -35,7 +36,8 @@
     check_is_fitted,
     check_consistent_length,
     assert_all_finite,
-    check_memory
+    check_memory,
+    LARGE_SPARSE_SUPPORTED
 )
 import sklearn
 
@@ -427,6 +429,42 @@ def test_check_array_accept_sparse_no_exception():
     check_array(X_csr, accept_sparse=('csr',))
 
 
+def test_check_array_accept_large_sparse_no_exception():
+    # int64-indexed CSR input must pass when large sparse is accepted
+    if LARGE_SPARSE_SUPPORTED:
+        X_large = sp.rand(10, 1000, format='csr')
+        X_large.indices = X_large.indices.astype('int64')
+        X_large.indptr = X_large.indptr.astype('int64')
+        check_array(X_large, accept_sparse=True, accept_large_sparse=True)
+
+
+def test_check_array_accept_large_sparse_raise_exception():
+    # 64-bit indices must be rejected when accept_large_sparse=False
+    if LARGE_SPARSE_SUPPORTED:
+        X = sp.rand(10, 1000, format='csr')
+        X.indices = X.indices.astype('int64')
+        X.indptr = X.indptr.astype('int64')
+        msg = ("Only sparse matrices with 32-bit integer indices"
+               " are accepted. Got int64 indices.")
+        assert_raise_message(ValueError, msg,
+                             check_array, X, accept_sparse=True,
+                             accept_large_sparse=False)
+
+
+def test_check_array_large_indices_non_supported_scipy_version():
+    # Large indices should not be allowed for scipy<0.14.0
+    if not LARGE_SPARSE_SUPPORTED:
+        X = sp.rand(10, 1000, format='csr')
+        X.indices = X.indices.astype('int64')
+        X.indptr = X.indptr.astype('int64')
+        msg = ("Scipy version %s does not support large"
+               " indices, please upgrade your scipy"
+               " to 0.14.0 or above" % scipy_version)
+
+        assert_raise_message(ValueError, msg, check_array,
+                             X, accept_sparse='csc')
+
+
 def test_check_array_min_samples_and_features_messages():
     # empty list is considered 2D by default:
     msg = "0 feature(s) (shape=(1, 0)) while a minimum of 1 is required."