From 80a45b54f3003398a52bd8490e6de9635000be91 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 1 Nov 2023 22:40:53 +0000 Subject: [PATCH 1/3] feat: add __iter__, iterrows, itertuples, keys methods --- bigframes/dataframe.py | 18 +++++ bigframes/series.py | 6 ++ tests/system/small/test_dataframe.py | 50 ++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 79 +++++++++++++++++++ .../bigframes_vendored/pandas/core/generic.py | 31 +++++++- 5 files changed, 183 insertions(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 3369fb4868..07d3fb430a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -303,6 +303,9 @@ def __len__(self): rows, _ = self.shape return rows + def __iter__(self): + return iter(self.columns) + def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], @@ -1472,12 +1475,27 @@ def isin(self, values) -> DataFrame: f"isin(), you passed a [{type(values).__name__}]" ) + def keys(self) -> pandas.Index: + return self.columns + def items(self): column_ids = self._block.value_columns column_labels = self._block.column_labels for col_id, col_label in zip(column_ids, column_labels): yield col_label, bigframes.series.Series(self._block.select_column(col_id)) + def iterrows(self) -> Iterable[tuple[typing.Hashable, bigframes.series.Series]]: + for df in self.to_pandas_batches(): + for item in df.iterrows(): + yield item + + def itertuples( + self, index: bool = True, name: typing.Optional[str] = "Pandas" + ) -> Iterable[tuple[typing.Any, ...]]: + for df in self.to_pandas_batches(): + for item in df.itertuples(index=index, name=name): + yield item + def dropna( self, *, diff --git a/bigframes/series.py b/bigframes/series.py index 37d00d16f3..5e11211ae2 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -16,6 +16,7 @@ from __future__ import annotations +import itertools import numbers import textwrap import typing @@ -148,6 +149,11 @@ def _set_internal_query_job(self, query_job: bigquery.QueryJob): def __len__(self): return self.shape[0] + def __iter__(self) -> typing.Iterator: + return itertools.chain.from_iterable( + map(lambda x: x.index, self._block.to_pandas_batches()) + ) + def copy(self) -> Series: return Series(self._block) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index c96faa3526..e63f94953a 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -787,6 +787,56 @@ def test_apply_series_scalar_callable( pandas.testing.assert_series_equal(bf_result, pd_result) +def test_df_keys( + scalars_df_index, + scalars_pandas_df_index, +): + pandas.testing.assert_index_equal( + scalars_df_index.keys(), scalars_pandas_df_index.keys() + ) + + +def test_df_iter( + scalars_df_index, + scalars_pandas_df_index, +): + pandas.testing.assert_index_equal( + list(scalars_df_index), list(scalars_pandas_df_index) + ) + + +def test_iterrows( + scalars_df_index, + scalars_pandas_df_index, +): + for (bf_index, bf_series), (pd_index, pd_series) in zip( + scalars_df_index.iterrows(), scalars_pandas_df_index.iterrows() + ): + assert bf_index == pd_index + pandas.testing.assert_series_equal(bf_series, pd_series) + + +@pytest.mark.parametrize( + ( + "index", + "name", + ), + [ + ( + True, + "my_df", + ), + (False, None), + ], +) +def test_itertuples(scalars_df_index, index, name): + # Numeric has slightly different representation as a result of conversions. + bf_tuples = scalars_df_index.itertuples(index, name) + pd_tuples = scalars_df_index.to_pandas().itertuples(index, name) + for bf_tuple, pd_tuple in zip(bf_tuples, pd_tuples): + assert bf_tuple == pd_tuple + + def test_df_isin_list(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs values = ["Hello, World!", 55555, 2.51, pd.NA, True] diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 013d170114..92a3153467 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -947,6 +947,85 @@ def isin(self, values): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def keys(self): + """ + Get the 'info axis'. + + This is index for Series, columns for DataFrame. + + Returns: + Index: Info axis. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df.keys() + Index(['A', 'B'], dtype='object') + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def iterrows(self): + """ + Iterate over DataFrame rows as (index, Series) pairs. + + Yields: + a tuple (index, data) where data contains row values as a Series + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> index, row = next(df.iterrows()) + >>> index + 0 + >>> row + A 1 + B 4 + Name: 0, dtype: object + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def itertuples(self, index: bool = True, name: str | None = "Pandas"): + """ + Iterate over DataFrame rows as namedtuples. + + Args: + index (bool, default True): + If True, return the index as the first element of the tuple. + name (str or None, default "Pandas"): + The name of the returned namedtuples or None to return regular + tuples. + + Returns: + iterator: + An object to iterate over namedtuples for each row in the + DataFrame with the first field possibly being the index and + following fields being the column values. + + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> next(df.itertuples(name="Pair")) + Pair(Index=0, A=1, B=4) + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def items(self): """ Iterate over (column name, Series) pairs. diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 27d2e84537..127efe6a3d 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -1,7 +1,7 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/generic.py from __future__ import annotations -from typing import Literal, Optional +from typing import Iterator, Literal, Optional from bigframes import constants from third_party.bigframes_vendored.pandas.core import indexing @@ -35,6 +35,35 @@ def size(self) -> int: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __iter__(self) -> Iterator: + """ + Iterate over info axis. + + Returns + iterator: Info axis as iterator. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> for x in df: + ... print(x) + A + B + + >>> series = bpd.Series(["a", "b", "c"], index=[10, 20, 30]) + >>> for x in series: + ... print(x) + 10 + 20 + 30 + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ------------------------------------------------------------------------- # Unary Methods From 4f4bac44ea9ca5b046c239cce388d2b48f95afb2 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 2 Nov 2023 19:56:39 +0000 Subject: [PATCH 2/3] fix mypy and df_iter test --- bigframes/dataframe.py | 2 +- tests/system/small/test_dataframe.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 56123fffbf..1cbe4f5cb9 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1488,7 +1488,7 @@ def items(self): for col_id, col_label in zip(column_ids, column_labels): yield col_label, bigframes.series.Series(self._block.select_column(col_id)) - def iterrows(self) -> Iterable[tuple[typing.Hashable, bigframes.series.Series]]: + def iterrows(self) -> Iterable[tuple[typing.Any, bigframes.series.Series]]: for df in self.to_pandas_batches(): for item in df.iterrows(): yield item diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 5fb331a875..bd5930e508 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -816,9 +816,8 @@ def test_df_iter( scalars_df_index, scalars_pandas_df_index, ): - pandas.testing.assert_index_equal( - list(scalars_df_index), list(scalars_pandas_df_index) - ) + for bf_i, df_i in zip(scalars_df_index, scalars_pandas_df_index): + assert bf_i == df_i def test_iterrows( From fd52ad92319f1cf3689ce9123f073e189eabee3e Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 2 Nov 2023 21:13:14 +0000 Subject: [PATCH 3/3] fix mypy --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1cbe4f5cb9..04a5456e26 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1488,7 +1488,7 @@ def items(self): for col_id, col_label in zip(column_ids, column_labels): yield col_label, bigframes.series.Series(self._block.select_column(col_id)) - def iterrows(self) -> Iterable[tuple[typing.Any, bigframes.series.Series]]: + def iterrows(self) -> Iterable[tuple[typing.Any, pandas.Series]]: for df in self.to_pandas_batches(): for item in df.iterrows(): yield item