BUG Corrects tag in TfidfTransformer (#20919)

thomasjpfan · web-flow · commit cfc1695dec0c · 2021-09-03T11:08:21.000+02:00
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
@@ -1559,7 +1559,12 @@ def fit(self, X, y=None):
         self : object
             Fitted transformer.
         """
-        X = self._validate_data(X, accept_sparse=("csr", "csc"))
+        # large sparse data is not supported for 32bit platforms because
+        # _document_frequency uses np.bincount which works on arrays of
+        # dtype NPY_INTP which is int32 for 32bit platforms. See #20923
+        X = self._validate_data(
+            X, accept_sparse=("csr", "csc"), accept_large_sparse=not _IS_32BIT
+        )
         if not sp.issparse(X):
             X = sp.csr_matrix(X)
         dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64
@@ -1648,7 +1653,7 @@ def idf_(self, value):
         )
 
     def _more_tags(self):
-        return {"X_types": "sparse"}
+        return {"X_types": ["2darray", "sparse"]}
 
 
 class TfidfVectorizer(CountVectorizer):
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
@@ -37,6 +37,7 @@
 from sklearn.pipeline import make_pipeline
 
 from sklearn.utils import IS_PYPY
+from sklearn.utils._tags import _DEFAULT_TAGS, _safe_tags
 from sklearn.utils._testing import (
     SkipTest,
     set_random_state,
@@ -308,6 +309,21 @@ def test_search_cv(estimator, check, request):
         check(estimator)
 
 
+@pytest.mark.parametrize(
+    "estimator", _tested_estimators(), ids=_get_check_estimator_ids
+)
+def test_valid_tag_types(estimator):
+    """Check that every estimator tag has an expected type."""
+    tags = _safe_tags(estimator)
+
+    for name, tag in tags.items():
+        correct_tags = type(_DEFAULT_TAGS[name])
+        if name == "_xfail_checks":
+            # besides its default type, _xfail_checks may also be a dict
+            correct_tags = (correct_tags, dict)
+        assert isinstance(tag, correct_tags)
+
+
 @pytest.mark.parametrize(
     "estimator", _tested_estimators(), ids=_get_check_estimator_ids
 )