diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 95d9aee996..f9896784bb 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -1375,10 +1375,16 @@ def aggregate(
     ) -> typing.Tuple[Block, typing.Sequence[str]]:
         """
         Apply aggregations to the block.
+
         Arguments:
             by_column_id: column id of the aggregation key; this is preserved through the transform and used as index.
             aggregations: input_column_id, operation tuples
             dropna: whether null keys should be dropped
+
+        Returns:
+            Tuple[Block, Sequence[str]]:
+                The first element is the grouped block. The second element
+                holds the column IDs corresponding to each applied aggregation.
         """
         if column_labels is None:
             column_labels = pd.Index(range(len(aggregations)))
diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py
index f9c98d320c..40e96f6f42 100644
--- a/bigframes/core/groupby/dataframe_group_by.py
+++ b/bigframes/core/groupby/dataframe_group_by.py
@@ -16,7 +16,7 @@
 import datetime
 import typing
-from typing import Literal, Optional, Sequence, Tuple, Union
+from typing import Iterable, Literal, Optional, Sequence, Tuple, Union
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -29,7 +29,7 @@
 from bigframes.core import log_adapter
 import bigframes.core.block_transforms as block_ops
 import bigframes.core.blocks as blocks
-from bigframes.core.groupby import aggs, series_group_by
+from bigframes.core.groupby import aggs, group_by, series_group_by
 import bigframes.core.ordering as order
 import bigframes.core.utils as utils
 import bigframes.core.validations as validations
@@ -54,6 +54,7 @@ def __init__(
         selected_cols: typing.Optional[typing.Sequence[str]] = None,
         dropna: bool = True,
         as_index: bool = True,
+        by_key_is_singular: bool = False,
     ):
         # TODO(tbergeron): Support more group-by expression types
         self._block = block
@@ -64,6 +65,9 @@
             )
         }
         self._by_col_ids = by_col_ids
+        self._by_key_is_singular = by_key_is_singular
+        if by_key_is_singular:
+            assert len(by_col_ids) == 1, "singular key should be exactly one group key"
         self._dropna = dropna
         self._as_index = as_index
 
@@ -163,6 +167,16 @@ def describe(self, include: None | Literal["all"] = None):
             )
         )
 
+    def __iter__(self) -> Iterable[Tuple[blocks.Label, df.DataFrame]]:
+        for group_keys, filtered_block in group_by.block_groupby_iter(
+            self._block,
+            by_col_ids=self._by_col_ids,
+            by_key_is_singular=self._by_key_is_singular,
+            dropna=self._dropna,
+        ):
+            filtered_df = df.DataFrame(filtered_block)
+            yield group_keys, filtered_df
+
     def size(self) -> typing.Union[df.DataFrame, series.Series]:
         agg_block, _ = self._block.aggregate_size(
             by_column_ids=self._by_col_ids,
diff --git a/bigframes/core/groupby/group_by.py b/bigframes/core/groupby/group_by.py
new file mode 100644
index 0000000000..f00ff7c0b0
--- /dev/null
+++ b/bigframes/core/groupby/group_by.py
@@ -0,0 +1,91 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
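+"""Shared helpers for iterating over the groups of a grouped object.
+
+``block_groupby_iter`` backs both ``DataFrameGroupBy.__iter__`` and
+``SeriesGroupBy.__iter__``; it yields (group key(s), filtered block) pairs.
+"""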
+
+from __future__ import annotations
+
+import functools
+from typing import Sequence
+
+import pandas as pd
+
+from bigframes.core import blocks
+from bigframes.core import expression as ex
+import bigframes.enums
+import bigframes.operations as ops
+
+
+def block_groupby_iter(
+    block: blocks.Block,
+    *,
+    by_col_ids: Sequence[str],
+    by_key_is_singular: bool,
+    dropna: bool,
+):
+    original_index_columns = block._index_columns
+    original_index_labels = block._index_labels
+    block = block.reset_index(
+        level=None,
+        # Keep the original index columns so they can be recovered.
+        drop=False,
+        allow_duplicates=True,
+        replacement=bigframes.enums.DefaultIndexKind.NULL,
+    ).set_index(
+        by_col_ids,
+        # Keep by_col_ids in place so the ordering doesn't change.
+        drop=False,
+        append=False,
+    )
+    block.cached(
+        force=True,
+        # All DataFrames will be filtered by by_col_ids, so force
+        # block.cached() to cluster by the new index by explicitly setting
+        # `session_aware=False`. This makes the per-group filters more
+        # efficient.
+        session_aware=False,
+    )
+    keys_block, _ = block.aggregate(by_col_ids, dropna=dropna)
+    for chunk in keys_block.to_pandas_batches():
+        # Convert to MultiIndex to make sure we get tuples,
+        # even for singular keys.
+        by_keys_index = chunk.index
+        if not isinstance(by_keys_index, pd.MultiIndex):
+            by_keys_index = pd.MultiIndex.from_frame(by_keys_index.to_frame())
+
+        for by_keys in by_keys_index:
+            filtered_block = (
+                # To ensure the cache is used, filter first, then reset the
+                # index before yielding the DataFrame.
+                block.filter(
+                    functools.reduce(
+                        ops.and_op.as_expr,
+                        (
+                            ops.eq_op.as_expr(by_col, ex.const(by_key))
+                            for by_col, by_key in zip(by_col_ids, by_keys)
+                        ),
+                    ),
+                ).set_index(
+                    original_index_columns,
+                    # We retained by_col_ids in the set_index call above,
+                    # so it's safe to drop the duplicates now.
+                    drop=True,
+                    append=False,
+                    index_labels=original_index_labels,
+                )
+            )
+
+            if by_key_is_singular:
+                yield by_keys[0], filtered_block
+            else:
+                yield by_keys, filtered_block
diff --git a/bigframes/core/groupby/series_group_by.py b/bigframes/core/groupby/series_group_by.py
index 1839180b0e..1f2632078d 100644
--- a/bigframes/core/groupby/series_group_by.py
+++ b/bigframes/core/groupby/series_group_by.py
@@ -16,7 +16,7 @@
 import datetime
 import typing
-from typing import Literal, Sequence, Union
+from typing import Iterable, Literal, Sequence, Tuple, Union
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -28,7 +28,7 @@
 from bigframes.core import log_adapter
 import bigframes.core.block_transforms as block_ops
 import bigframes.core.blocks as blocks
-from bigframes.core.groupby import aggs
+from bigframes.core.groupby import aggs, group_by
 import bigframes.core.ordering as order
 import bigframes.core.utils as utils
 import bigframes.core.validations as validations
@@ -52,6 +52,8 @@ def __init__(
         by_col_ids: typing.Sequence[str],
         value_name: blocks.Label = None,
         dropna=True,
+        *,
+        by_key_is_singular: bool = False,
     ):
         # TODO(tbergeron): Support more group-by expression types
         self._block = block
@@ -60,6 +62,10 @@
         self._value_name = value_name
         self._dropna = dropna  # Applies to aggregations but not windowing
+        self._by_key_is_singular = by_key_is_singular
+        if by_key_is_singular:
+            assert len(by_col_ids) == 1, "singular key should be exactly one group key"
+
 
     @property
     def _session(self) -> session.Session:
         return self._block.session
@@ -89,6 +95,19 @@ def describe(self, include: None | Literal["all"] = None):
             )
         ).droplevel(level=0, axis=1)
 
+    def __iter__(self) -> Iterable[Tuple[blocks.Label, series.Series]]:
+        for group_keys, filtered_block in group_by.block_groupby_iter(
+            self._block,
+            by_col_ids=self._by_col_ids,
+            by_key_is_singular=self._by_key_is_singular,
+            dropna=self._dropna,
+        ):
+            filtered_series = series.Series(
+                filtered_block.select_column(self._value_column)
+            )
+            filtered_series.name = self._value_name
+            yield group_keys, filtered_series
+
     def all(self) -> series.Series:
         return self._aggregate(agg_ops.all_op)
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index f4d968a336..ea5136f6f5 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -3913,11 +3913,17 @@ def _groupby_level(
         as_index: bool = True,
         dropna: bool = True,
     ):
+        if utils.is_list_like(level):
+            by_key_is_singular = False
+        else:
+            by_key_is_singular = True
+
         return groupby.DataFrameGroupBy(
             self._block,
             by_col_ids=self._resolve_levels(level),
             as_index=as_index,
             dropna=dropna,
+            by_key_is_singular=by_key_is_singular,
         )
 
     def _groupby_series(
@@ -3930,10 +3936,14 @@
         as_index: bool = True,
         dropna: bool = True,
     ):
+        # Pandas makes a distinction between groupby with a list of keys
+        # versus groupby with a single key in some methods, such as __iter__.
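+        # For example, df.groupby("a") yields scalar group keys, while
+        # df.groupby(["a"]) yields one-element tuples; the tests added in
+        # this change cover both shapes.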
         if not isinstance(by, bigframes.series.Series) and utils.is_list_like(by):
             by = list(by)
+            by_key_is_singular = False
         else:
             by = [typing.cast(typing.Union[blocks.Label, bigframes.series.Series], by)]
+            by_key_is_singular = True
 
         block = self._block
         col_ids: typing.Sequence[str] = []
@@ -3963,6 +3973,7 @@
             by_col_ids=col_ids,
             as_index=as_index,
             dropna=dropna,
+            by_key_is_singular=by_key_is_singular,
         )
 
     def abs(self) -> DataFrame:
diff --git a/bigframes/series.py b/bigframes/series.py
index da2f3f07c4..4e51181617 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -1854,12 +1854,18 @@ def _groupby_level(
         level: int | str | typing.Sequence[int] | typing.Sequence[str],
         dropna: bool = True,
     ) -> bigframes.core.groupby.SeriesGroupBy:
+        if utils.is_list_like(level):
+            by_key_is_singular = False
+        else:
+            by_key_is_singular = True
+
         return groupby.SeriesGroupBy(
             self._block,
             self._value_column,
             by_col_ids=self._resolve_levels(level),
             value_name=self.name,
             dropna=dropna,
+            by_key_is_singular=by_key_is_singular,
         )
 
     def _groupby_values(
@@ -1871,8 +1877,10 @@
     ) -> bigframes.core.groupby.SeriesGroupBy:
         if not isinstance(by, Series) and _is_list_like(by):
             by = list(by)
+            by_key_is_singular = False
         else:
             by = [typing.cast(typing.Union[blocks.Label, Series], by)]
+            by_key_is_singular = True
 
         block = self._block
         grouping_cols: typing.Sequence[str] = []
@@ -1904,6 +1912,7 @@
             by_col_ids=grouping_cols,
             value_name=self.name,
             dropna=dropna,
+            by_key_is_singular=by_key_is_singular,
         )
 
     def apply(
diff --git a/tests/unit/core/test_groupby.py b/tests/unit/core/test_groupby.py
new file mode 100644
index 0000000000..8df0e5344e
--- /dev/null
+++ b/tests/unit/core/test_groupby.py
@@ -0,0 +1,271 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+import pandas.testing
+import pytest
+
+import bigframes.core.utils as utils
+import bigframes.pandas as bpd
+
+pytest.importorskip("polars")
+pytest.importorskip("pandas", minversion="2.0.0")
+
+
+# All tests in this file require polars to be installed to pass.
+
+@pytest.fixture(scope="module")
+def polars_session():
+    from bigframes.testing import polars_session
+
+    return polars_session.TestSession()
+
+
+def test_groupby_df_iter_by_key_singular(polars_session):
+    pd_df = pd.DataFrame({"colA": ["a", "a", "b", "c", "c"], "colB": [1, 2, 3, 4, 5]})
+    bf_df = bpd.DataFrame(pd_df, session=polars_session)
+
+    for bf_group, pd_group in zip(bf_df.groupby("colA"), pd_df.groupby("colA")):  # type: ignore
+        bf_key, bf_group_df = bf_group
+        bf_result = bf_group_df.to_pandas()
+        pd_key, pd_result = pd_group
+        assert bf_key == pd_key
+        pandas.testing.assert_frame_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
+
+
+def test_groupby_df_iter_by_key_list(polars_session):
+    pd_df = pd.DataFrame({"colA": ["a", "a", "b", "c", "c"], "colB": [1, 2, 3, 4, 5]})
+    bf_df = bpd.DataFrame(pd_df, session=polars_session)
+
+    for bf_group, pd_group in zip(bf_df.groupby(["colA"]), pd_df.groupby(["colA"])):  # type: ignore
+        bf_key, bf_group_df = bf_group
+        bf_result = bf_group_df.to_pandas()
+        pd_key, pd_result = pd_group
+        assert bf_key == pd_key
+        pandas.testing.assert_frame_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
+
+
+def test_groupby_df_iter_by_key_list_multiple(polars_session):
+    pd_df = pd.DataFrame(
+        {
+            "colA": ["a", "a", "b", "c", "c"],
+            "colB": [1, 2, 3, 4, 5],
+            "colC": [True, False, True, False, True],
+        }
+    )
+    bf_df = bpd.DataFrame(pd_df, session=polars_session)
+
+    for bf_group, pd_group in zip(  # type: ignore
+        bf_df.groupby(["colA", "colB"]), pd_df.groupby(["colA", "colB"])
+    ):
+        bf_key, bf_group_df = bf_group
+        bf_result = bf_group_df.to_pandas()
+        pd_key, pd_result = pd_group
+        assert bf_key == pd_key
+        pandas.testing.assert_frame_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
+
+
+def test_groupby_df_iter_by_level_singular(polars_session):
+    pd_df = pd.DataFrame(
+        {"colA": ["a", "a", "b", "c", "c"], "colB": [1, 2, 3, 4, 5]}
+    ).set_index("colA")
+    bf_df = bpd.DataFrame(pd_df, session=polars_session)
+
+    for bf_group, pd_group in zip(bf_df.groupby(level=0), pd_df.groupby(level=0)):  # type: ignore
+        bf_key, bf_group_df = bf_group
+        bf_result = bf_group_df.to_pandas()
+        pd_key, pd_result = pd_group
+        assert bf_key == pd_key
+        pandas.testing.assert_frame_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
+
+
+def test_groupby_df_iter_by_level_list_one_item(polars_session):
+    pd_df = pd.DataFrame(
+        {"colA": ["a", "a", "b", "c", "c"], "colB": [1, 2, 3, 4, 5]}
+    ).set_index("colA")
+    bf_df = bpd.DataFrame(pd_df, session=polars_session)
+
+    for bf_group, pd_group in zip(bf_df.groupby(level=[0]), pd_df.groupby(level=[0])):  # type: ignore
+        bf_key, bf_group_df = bf_group
+        bf_result = bf_group_df.to_pandas()
+        pd_key, pd_result = pd_group
+
+        # In pandas 2.x, we get a warning from pandas: "Creating a Groupby
+        # object with a length-1 list-like level parameter will yield indexes
+        # as tuples in a future version. To keep indexes as scalars, create
+        # Groupby objects with a scalar level parameter instead."
+        if utils.is_list_like(pd_key):
+            assert bf_key == tuple(pd_key)
+        else:
+            assert bf_key == (pd_key,)
+        pandas.testing.assert_frame_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
+
+
+def test_groupby_df_iter_by_level_list_multiple(polars_session):
+    pd_df = pd.DataFrame(
+        {
+            "colA": ["a", "a", "b", "c", "c"],
+            "colB": [1, 2, 3, 4, 5],
+            "colC": [True, False, True, False, True],
+        }
+    ).set_index(["colA", "colB"])
+    bf_df = bpd.DataFrame(pd_df, session=polars_session)
+
+    for bf_group, pd_group in zip(  # type: ignore
+        bf_df.groupby(level=[0, 1]), pd_df.groupby(level=[0, 1])
+    ):
+        bf_key, bf_group_df = bf_group
+        bf_result = bf_group_df.to_pandas()
+        pd_key, pd_result = pd_group
+        assert bf_key == pd_key
+        pandas.testing.assert_frame_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
+
+
+def test_groupby_series_iter_by_level_singular(polars_session):
+    series_index = ["a", "a", "b"]
+    pd_series = pd.Series([1, 2, 3], index=series_index)
+    bf_series = bpd.Series(pd_series, session=polars_session)
+    bf_series.name = pd_series.name
+
+    for bf_group, pd_group in zip(  # type: ignore
+        bf_series.groupby(level=0), pd_series.groupby(level=0)
+    ):
+        bf_key, bf_group_series = bf_group
+        bf_result = bf_group_series.to_pandas()
+        pd_key, pd_result = pd_group
+        assert bf_key == pd_key
+        pandas.testing.assert_series_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
+
+
+def test_groupby_series_iter_by_level_list_one_item(polars_session):
+    series_index = ["a", "a", "b"]
+    pd_series = pd.Series([1, 2, 3], index=series_index)
+    bf_series = bpd.Series(pd_series, session=polars_session)
+    bf_series.name = pd_series.name
+
+    for bf_group, pd_group in zip(  # type: ignore
+        bf_series.groupby(level=[0]), pd_series.groupby(level=[0])
+    ):
+        bf_key, bf_group_series = bf_group
+        bf_result = bf_group_series.to_pandas()
+        pd_key, pd_result = pd_group
+
+        # In pandas 2.x, we get a warning from pandas: "Creating a Groupby
+        # object with a length-1 list-like level parameter will yield indexes
+        # as tuples in a future version. To keep indexes as scalars, create
+        # Groupby objects with a scalar level parameter instead."
+        if utils.is_list_like(pd_key):
+            assert bf_key == tuple(pd_key)
+        else:
+            assert bf_key == (pd_key,)
+        pandas.testing.assert_series_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
+
+
+def test_groupby_series_iter_by_level_list_multiple(polars_session):
+    pd_df = pd.DataFrame(
+        {
+            "colA": ["a", "a", "b", "c", "c"],
+            "colB": [1, 2, 3, 4, 5],
+            "colC": [True, False, True, False, True],
+        }
+    ).set_index(["colA", "colB"])
+    pd_series = pd_df["colC"]
+    bf_df = bpd.DataFrame(pd_df, session=polars_session)
+    bf_series = bf_df["colC"]
+
+    for bf_group, pd_group in zip(  # type: ignore
+        bf_series.groupby(level=[0, 1]), pd_series.groupby(level=[0, 1])
+    ):
+        bf_key, bf_group_df = bf_group
+        bf_result = bf_group_df.to_pandas()
+        pd_key, pd_result = pd_group
+        assert bf_key == pd_key
+        pandas.testing.assert_series_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
+
+
+def test_groupby_series_iter_by_series(polars_session):
+    pd_groups = pd.Series(["a", "a", "b"])
+    bf_groups = bpd.Series(pd_groups, session=polars_session)
+    pd_series = pd.Series([1, 2, 3])
+    bf_series = bpd.Series(pd_series, session=polars_session)
+    bf_series.name = pd_series.name
+
+    for bf_group, pd_group in zip(  # type: ignore
+        bf_series.groupby(bf_groups), pd_series.groupby(pd_groups)
+    ):
+        bf_key, bf_group_series = bf_group
+        bf_result = bf_group_series.to_pandas()
+        pd_key, pd_result = pd_group
+        assert bf_key == pd_key
+        pandas.testing.assert_series_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
+
+
+def test_groupby_series_iter_by_series_list_one_item(polars_session):
+    pd_groups = pd.Series(["a", "a", "b"])
+    bf_groups = bpd.Series(pd_groups, session=polars_session)
+    pd_series = pd.Series([1, 2, 3])
+    bf_series = bpd.Series(pd_series, session=polars_session)
+    bf_series.name = pd_series.name
+
+    for bf_group, pd_group in zip(  # type: ignore
+        bf_series.groupby([bf_groups]), pd_series.groupby([pd_groups])
+    ):
+        bf_key, bf_group_series = bf_group
+        bf_result = bf_group_series.to_pandas()
+        pd_key, pd_result = pd_group
+        assert bf_key == pd_key
+        pandas.testing.assert_series_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
+
+
+def test_groupby_series_iter_by_series_list_multiple(polars_session):
+    pd_group_a = pd.Series(["a", "a", "b", "c", "c"])
+    bf_group_a = bpd.Series(pd_group_a, session=polars_session)
+    pd_group_b = pd.Series([0, 0, 0, 1, 1])
+    bf_group_b = bpd.Series(pd_group_b, session=polars_session)
+    pd_series = pd.Series([1, 2, 3, 4, 5])
+    bf_series = bpd.Series(pd_series, session=polars_session)
+    bf_series.name = pd_series.name
+
+    for bf_group, pd_group in zip(  # type: ignore
+        bf_series.groupby([bf_group_a, bf_group_b]),
+        pd_series.groupby([pd_group_a, pd_group_b]),
+    ):
+        bf_key, bf_group_series = bf_group
+        bf_result = bf_group_series.to_pandas()
+        pd_key, pd_result = pd_group
+        assert bf_key == pd_key
+        pandas.testing.assert_series_equal(
+            bf_result, pd_result, check_dtype=False, check_index_type=False
+        )
diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py
index 306b65806b..1e39ec8f94 100644
--- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py
+++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py
@@ -1259,11 +1259,11 @@ def size(self):
 
         **Examples:**
 
-        For SeriesGroupBy:
-
         >>> import bigframes.pandas as bpd
         >>> bpd.options.display.progress_bar = None
 
+        For SeriesGroupBy:
+
         >>> lst = ['a', 'a', 'b']
         >>> ser = bpd.Series([1, 2, 3], index=lst)
         >>> ser
@@ -1301,6 +1301,74 @@
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def __iter__(self):
+        r"""
+        Groupby iterator.
+
+        This method provides an iterator over the groups created by the ``resample``
+        or ``groupby`` operation on the object. The method yields tuples where
+        the first element is the label (group key) corresponding to each group or
+        resampled bin, and the second element is the subset of the data that falls
+        within that group or bin.
+
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+
+        For SeriesGroupBy:
+
+        >>> lst = ["a", "a", "b"]
+        >>> ser = bpd.Series([1, 2, 3], index=lst)
+        >>> ser
+        a    1
+        a    2
+        b    3
+        dtype: Int64
+        >>> for x, y in ser.groupby(level=0):
+        ...     print(f"{x}\n{y}\n")
+        a
+        a    1
+        a    2
+        dtype: Int64
+
+        b
+        b    3
+        dtype: Int64
+
+        For DataFrameGroupBy:
+
+        >>> data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
+        >>> df = bpd.DataFrame(data, columns=["a", "b", "c"])
+        >>> df
+           a  b  c
+        0  1  2  3
+        1  1  5  6
+        2  7  8  9
+
+        [3 rows x 3 columns]
+        >>> for x, y in df.groupby(by=["a"]):
+        ...     print(f'{x}\n{y}\n')
+        (1,)
+           a  b  c
+        0  1  2  3
+        1  1  5  6
+
+        [2 rows x 3 columns]
+
+        (7,)
+           a  b  c
+        2  7  8  9
+
+        [1 rows x 3 columns]
+
+        Returns:
+            Iterable[Tuple[Label | Tuple, bigframes.pandas.Series | bigframes.pandas.DataFrame]]:
+                Generator yielding a (name, subsetted object) pair
+                for each group.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
 
 class SeriesGroupBy(GroupBy):
     def agg(self, func):
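
A minimal usage sketch of the iteration behavior this patch adds (the data and
column names below are illustrative only, in the style of the tests above):

    import bigframes.pandas as bpd

    df = bpd.DataFrame({"a": [1, 1, 7], "b": [2, 5, 8], "c": [3, 6, 9]})

    # A scalar group key yields scalar group labels, matching pandas.
    for key, group in df.groupby("a"):
        print(key)  # 1, then 7

    # A list of group keys yields tuple labels, matching pandas.
    for key, group in df.groupby(["a"]):
        print(key)  # (1,), then (7,)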