Revert "[MRG] ENH apply sparse_threshold even if all columns are sparse (scikit-learn#12304)"

Xing · Xing · commit fa3352e532d5 · 2019-04-28T15:16:38.000-04:00
This reverts commit 1dc7cc0.
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
@@ -34,10 +34,6 @@ Changelog
   columns with types not convertible to a numeric.
   :issue:`11912` by :user:`Adrin Jalali <adrinjalali>`.
 
-- |API| :class:`compose.ColumnTransformer` now applies the ``sparse_threshold``
-  even if all transformation results are sparse. :issue:`12304` by `Andreas
-  Müller`_.
-
 :mod:`sklearn.datasets`
 ............................
 
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -85,11 +85,12 @@ class ColumnTransformer(_BaseComposition, TransformerMixin):
         estimator must support `fit` and `transform`.
 
     sparse_threshold : float, default = 0.3
-        If the output of the different transfromers contains sparse matrices,
-        these will be stacked as a sparse matrix if the overall density is
-        lower than this value. Use ``sparse_threshold=0`` to always return
-        dense.  When the transformed output consists of all dense data, the
-        stacked result will be dense, and this keyword will be ignored.
+        If the transformed output consists of a mix of sparse and dense data,
+        it will be stacked as a sparse matrix if the density is lower than this
+        value. Use ``sparse_threshold=0`` to always return dense.
+        When the transformed output consists of all sparse or all dense data,
+        the stacked result will be sparse or dense, respectively, and this
+        keyword will be ignored.
 
     n_jobs : int or None, optional (default=None)
         Number of jobs to run in parallel.
@@ -455,7 +456,9 @@ def fit_transform(self, X, y=None):
         Xs, transformers = zip(*result)
 
         # determine if concatenated output will be sparse or not
-        if any(sparse.issparse(X) for X in Xs):
+        if all(sparse.issparse(X) for X in Xs):
+            self.sparse_output_ = True
+        elif any(sparse.issparse(X) for X in Xs):
             nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs)
             total = sum(X.shape[0] * X.shape[1] if sparse.issparse(X)
                         else X.size for X in Xs)
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
@@ -402,13 +402,13 @@ def test_column_transformer_sparse_threshold():
     X_array = np.array([['a', 'b'], ['A', 'B']], dtype=object).T
     # above data has sparsity of 4 / 8 = 0.5
 
-    # apply threshold even if all sparse
+    # if all sparse, keep sparse (even if density is above threshold)
     col_trans = ColumnTransformer([('trans1', OneHotEncoder(), [0]),
                                    ('trans2', OneHotEncoder(), [1])],
                                   sparse_threshold=0.2)
     res = col_trans.fit_transform(X_array)
-    assert not sparse.issparse(res)
-    assert not col_trans.sparse_output_
+    assert sparse.issparse(res)
+    assert col_trans.sparse_output_
 
     # mixed -> sparsity of (4 + 2) / 8 = 0.75
     for thres in [0.75001, 1]: