From 5247771ad658bc1416faf23e12c1c4d91d24d868 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 24 May 2024 22:20:33 +0000 Subject: [PATCH 1/3] fix: multi-index label in DataFrameGroupBy --- bigframes/core/groupby/__init__.py | 4 +++- .../pandas/core/groupby/__init__.py | 22 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 91c5e54d89..474256d112 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -340,7 +340,9 @@ def _agg_list(self, func: typing.Sequence) -> df.DataFrame: for f in func ] column_labels = [ - (col_id, f) for col_id in self._aggregated_columns() for f in func + (self._block.col_id_to_label[col_id], f) + for col_id in self._aggregated_columns() + for f in func ] agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index f9bedc2a7b..6011dbfe5b 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -1092,6 +1092,17 @@ def agg(self, func, **kwargs): [2 rows x 2 columns] + Multiple aggregations + + >>> df.groupby('A').agg(['min', 'max']) + B C + min max min max + A + 1 1 2 0.227877 0.362838 + 2 3 4 -0.56286 1.267767 + + [2 rows x 4 columns] + Args: func (function, str, list, dict or None): Function to use for aggregating the data. @@ -1140,6 +1151,17 @@ def aggregate(self, func, **kwargs): [2 rows x 2 columns] + Multiple aggregations + + >>> df.groupby('A').agg(['min', 'max']) + B C + min max min max + A + 1 1 2 0.227877 0.362838 + 2 3 4 -0.56286 1.267767 + + [2 rows x 4 columns] + Args: func (function, str, list, dict or None): Function to use for aggregating the data. From d1238f3eb336a78250e9d327ca4211a4e03e56df Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 28 May 2024 23:49:39 +0000 Subject: [PATCH 2/3] fix: correct index labels with multi-index DataFrameGroupBy --- bigframes/core/groupby/__init__.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 474256d112..1ce8ef3a15 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -339,9 +339,18 @@ def _agg_list(self, func: typing.Sequence) -> df.DataFrame: for col_id in self._aggregated_columns() for f in func ] + + aggregated_columns = pd.MultiIndex.from_tuples( + [ + self._block.col_id_to_label[col_id] + for col_id in self._aggregated_columns() + ], + names=[*self._block.column_labels.names], + ).to_frame(index=False) + column_labels = [ - (self._block.col_id_to_label[col_id], f) - for col_id in self._aggregated_columns() + tuple(col_id) + (f,) + for col_id in aggregated_columns.to_numpy() for f in func ] agg_block, _ = self._block.aggregate( From e3a8d9f47c5028037f1f955f2730c2f9148838de Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 29 May 2024 18:17:11 +0000 Subject: [PATCH 3/3] add tests --- bigframes/core/groupby/__init__.py | 32 ++++++++++++++++++++---------- tests/system/small/test_groupby.py | 17 ++++++++++++++++ 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 1ce8ef3a15..6e3a91cc1c 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -340,19 +340,29 @@ def _agg_list(self, func: typing.Sequence) -> df.DataFrame: for f in func ] - aggregated_columns = pd.MultiIndex.from_tuples( - [ - self._block.col_id_to_label[col_id] + if self._block.column_labels.nlevels > 1: + # Restructure MultiIndex for proper format: (idx1, idx2, func) + # rather than ((idx1, idx2), func). + aggregated_columns = pd.MultiIndex.from_tuples( + [ + self._block.col_id_to_label[col_id] + for col_id in self._aggregated_columns() + ], + names=[*self._block.column_labels.names], + ).to_frame(index=False) + + column_labels = [ + tuple(col_id) + (f,) + for col_id in aggregated_columns.to_numpy() + for f in func + ] + else: + column_labels = [ + (self._block.col_id_to_label[col_id], f) for col_id in self._aggregated_columns() - ], - names=[*self._block.column_labels.names], - ).to_frame(index=False) + for f in func + ] - column_labels = [ - tuple(col_id) + (f,) - for col_id in aggregated_columns.to_numpy() - for f in func - ] agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 02d9bf9725..b332d48574 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -144,6 +144,23 @@ def test_dataframe_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +def test_dataframe_groupby_agg_list_w_column_multi_index( + scalars_df_index, scalars_pandas_df_index +): + columns = ["int64_too", "string_col", "bool_col"] + multi_columns = pd.MultiIndex.from_tuples(zip(["a", "b", "a"], columns)) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.groupby(level=0).agg(["count", "min"]) + pd_result = pd_df.groupby(level=0).agg(["count", "min"]) + + bf_result_computed = bf_result.to_pandas() + pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + + @pytest.mark.parametrize( ("as_index"), [