BUG: DataFrameGroupBy with numeric_only and empty non-numeric data

pandas-dev · jreback · Jun 2, 2021 · May 6, 2021 · May 6, 2021 · May 5, 2021
commit 2357da03bb1617a14696dd03f714208c35551571
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -516,7 +516,7 @@ def group_add(add_t[:, ::1] out,
                 val = values[i, j]
 
                 # not nan
-                if val == val:
+                if not checknull(val):
                     nobs[lab, j] += 1
 
                     if nobs[lab, j] == 1:

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -67,10 +67,7 @@
     validate_func_kwargs,
 )
 from pandas.core.apply import GroupByApply
-from pandas.core.base import (
-    DataError,
-    SpecificationError,
-)
+from pandas.core.base import SpecificationError
 import pandas.core.common as com
 from pandas.core.construction import create_series_with_explicit_dtype
 from pandas.core.frame import DataFrame
@@ -516,16 +513,9 @@ def _cython_transform(
 
         obj = self._selected_obj
 
-        is_numeric = is_numeric_dtype(obj.dtype)
-        if numeric_only and not is_numeric:
-            raise DataError("No numeric types to aggregate")
-
-        try:
-            result = self.grouper._cython_operation(
-                "transform", obj._values, how, axis, **kwargs
-            )
-        except (NotImplementedError, TypeError):
-            raise DataError("No numeric types to aggregate")
+        result = self.grouper._cython_operation(
+            "transform", obj._values, how, axis, **kwargs
+        )
 
         return obj._constructor(result, index=self.obj.index, name=obj.name)
 
@@ -1064,7 +1054,6 @@ def _cython_agg_general(
         # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy
 
         data: Manager2D = self._get_data_to_aggregate()
-        orig = data
 
         if numeric_only:
             data = data.get_numeric_data(copy=False)
@@ -1087,9 +1076,6 @@ def array_func(values: ArrayLike) -> ArrayLike:
         #  continue and exclude the block
         new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
 
-        if not len(new_mgr) and len(orig):
-            # If the original Manager was already empty, no need to raise
-            raise DataError("No numeric types to aggregate")
         if len(new_mgr) < len(data):
             warnings.warn(
                 f"Dropping invalid columns in {type(self).__name__}.{how} "

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1339,20 +1339,12 @@ def _agg_general(
 
         with group_selection_context(self):
             # try a cython aggregation if we can
-            result = None
-            try:
-                result = self._cython_agg_general(
-                    how=alias,
-                    alt=npfunc,
-                    numeric_only=numeric_only,
-                    min_count=min_count,
-                )
-            except DataError:
-                pass
-
-            # apply a non-cython aggregation
-            if result is None:
-                result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
+            result = self._cython_agg_general(
+                how=alias,
+                alt=npfunc,
+                numeric_only=numeric_only,
+                min_count=min_count,
+            )
             return result.__finalize__(self.obj, method="groupby")
 
     def _agg_py_fallback(

diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -128,8 +128,9 @@ def test_groupby_aggregation_multi_level_column():
         columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]),
     )
 
-    result = df.groupby(level=1, axis=1).sum()
-    expected = DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]})
+    gb = df.groupby(level=1, axis=1)
+    result = gb.sum(numeric_only=False)
+    expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]})
 
     tm.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
@@ -18,7 +18,6 @@
     bdate_range,
 )
 import pandas._testing as tm
-from pandas.core.groupby.groupby import DataError
 
 
 @pytest.mark.parametrize(
@@ -98,9 +97,9 @@ def test_cython_agg_nothing_to_agg():
 
     frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
 
-    msg = "No numeric types to aggregate"
-    with pytest.raises(DataError, match=msg):
-        frame[["b"]].groupby(frame["a"]).mean()
+    result = frame[["b"]].groupby(frame["a"]).mean()
+    expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
+    tm.assert_frame_equal(result, expected)
 
 
 def test_cython_agg_nothing_to_agg_with_dates():

diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
@@ -433,15 +433,22 @@ def test_agg_over_numpy_arrays():
         ],
         columns=["category", "arraydata"],
     )
-    result = df.groupby("category").agg(sum)
+    gb = df.groupby("category")
 
     expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
     expected_index = Index([1, 2], name="category")
     expected_column = ["arraydata"]
     expected = DataFrame(expected_data, index=expected_index, columns=expected_column)
 
+    alt = gb.sum(numeric_only=False)
+    tm.assert_frame_equal(alt, expected)
+
+    result = gb.agg("sum", numeric_only=False)
     tm.assert_frame_equal(result, expected)
 
+    # FIXME: the original version of this test called `gb.agg(sum)`
+    #  and that raises TypeError if `numeric_only=False` is passed
+
 
 @pytest.mark.parametrize("as_period", [True, False])
 def test_agg_tzaware_non_datetime_result(as_period):
@@ -524,9 +531,14 @@ def test_sum_uint64_overflow():
     )
 
     expected.index.name = 0
-    result = df.groupby(0).sum()
+    result = df.groupby(0).sum(numeric_only=False)
     tm.assert_frame_equal(result, expected)
 
+    # out column is non-numeric, so with numeric_only=True it is dropped
+    result2 = df.groupby(0).sum(numeric_only=True)
+    expected2 = expected[[]]
+    tm.assert_frame_equal(result2, expected2)
+
 
 @pytest.mark.parametrize(
     "structure, expected",

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -638,7 +638,7 @@ def test_as_index_select_column():
 def test_groupby_as_index_select_column_sum_empty_df():
     # GH 35246
     df = DataFrame(columns=["A", "B", "C"])
-    left = df.groupby(by="A", as_index=False)["B"].sum()
+    left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False)
     assert type(left) is DataFrame
     assert left.to_dict() == {"A": {}, "B": {}}
 
@@ -1861,6 +1861,49 @@ def get_result():
                         get_result()
 
                     return
+    else:
+        # ie. DataFrameGroupBy
+        if op in ["prod", "sum"]:
+            # ops that require more than just ordered-ness
+            if method != "apply":
+                # FIXME: apply goes through different code path
+                if df.dtypes[0].kind == "M":
+                    # GH#41291
+                    # datetime64 -> prod and sum are invalid
+                    result = get_result()
+
+                    # with numeric_only=True, these are dropped, and we get
+                    # an empty DataFrame back
+                    expected = df.set_index(keys)[[]]
+                    tm.assert_equal(result, expected)
+                    return
+
+                elif isinstance(values, Categorical):
+                    # GH#41291
+                    # Categorical doesn't implement sum or prod
+                    result = get_result()
+
+                    # with numeric_only=True, these are dropped, and we get
+                    # an empty DataFrame back
+                    expected = df.set_index(keys)[[]]
+                    if len(keys) != 1 and op == "prod":
+                        # TODO: why just prod and not sum?
+                        # Categorical is special without 'observed=True'
+                        lev = Categorical([0], dtype=values.dtype)
+                        mi = MultiIndex.from_product([lev, lev], names=["A", "B"])
+                        expected = DataFrame([], columns=[], index=mi)
+
+                    tm.assert_equal(result, expected)
+                    return
+
+                elif df.dtypes[0] == object:
+                    # FIXME: the test is actually wrong here, xref #41341
+                    result = get_result()
+                    # In this case we have list-of-list, will raise TypeError,
+                    # and subsequently be dropped as nuisance columns
+                    expected = df.set_index(keys)[[]]
+                    tm.assert_equal(result, expected)
+                    return
 
     result = get_result()
     expected = df.set_index(keys)[columns]
@@ -2313,12 +2356,17 @@ def test_groupby_all_nan_groups_drop():
 
 def test_groupby_empty_multi_column():
     # GH 15106
-    result = DataFrame(data=[], columns=["A", "B", "C"]).groupby(["A", "B"]).sum()
+    df = DataFrame(data=[], columns=["A", "B", "C"])
+    gb = df.groupby(["A", "B"])
+    result = gb.sum(numeric_only=False)
     expected = DataFrame(
         [], columns=["C"], index=MultiIndex([[], []], [[], []], names=["A", "B"])
     )
     tm.assert_frame_equal(result, expected)
 
+    result = gb.sum(numeric_only=True)
+    tm.assert_frame_equal(result, expected[[]])
+
 
 def test_groupby_filtered_df_std():
     # GH 16174