diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index 1221571e1f658..3d96489c48a72 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -308,6 +308,10 @@ Changelog
   removes accents from strings that are in NFKD normalized form.
   :pr:`15100` by :user:`Daniel Grady `.
 
+- |Fix| Fixed a bug that caused :class:`feature_extraction.DictVectorizer` to raise
+  an `OverflowError` during the `transform` operation when producing a `scipy.sparse`
+  matrix on large input data. :pr:`15463` by :user:`Norvan Sahiner `.
+
 :mod:`sklearn.feature_selection`
 ................................
 
diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py
index 6e820f93a3b60..ca49263f57913 100644
--- a/sklearn/feature_extraction/_dict_vectorizer.py
+++ b/sklearn/feature_extraction/_dict_vectorizer.py
@@ -154,7 +154,7 @@ def _transform(self, X, fitting):
         X = [X] if isinstance(X, Mapping) else X
 
         indices = array("i")
-        indptr = array("i", [0])
+        indptr = [0]
         # XXX we could change values to an array.array as well, but it
         # would require (heuristic) conversion of dtype to typecode...
         values = []
@@ -182,7 +182,6 @@ def _transform(self, X, fitting):
             raise ValueError("Sample sequence X is empty.")
 
         indices = np.frombuffer(indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.intc)
         shape = (len(indptr) - 1, len(vocab))
         result_matrix = sp.csr_matrix((values, indices, indptr),