From 6c4d3a36c61a89fb4aa9bd209f7775944b83572e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a?=
Date: Tue, 1 Jul 2025 14:05:08 -0500
Subject: [PATCH] feat: `df.to_pandas_batches()` returns one empty DataFrame if `df` is empty

---
 bigframes/core/blocks.py                | 21 +++++++++++++++++++++
 tests/system/small/test_dataframe_io.py | 22 ++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 6d476cc795..dbbf9ee864 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -620,15 +620,36 @@ def to_pandas_batches(
             ordered=True,
             use_explicit_destination=allow_large_results,
         )
+
+        total_batches = 0
         for df in execute_result.to_pandas_batches(
             page_size=page_size, max_results=max_results
         ):
+            total_batches += 1
             self._copy_index_to_pandas(df)
             if squeeze:
                 yield df.squeeze(axis=1)
             else:
                 yield df
 
+        # To reduce the number of edge cases to consider when working with the
+        # results of this, always return at least one DataFrame. See:
+        # b/428918844.
+        if total_batches == 0:
+            df = pd.DataFrame(
+                {
+                    col: pd.Series([], dtype=self.expr.get_column_type(col))
+                    for col in itertools.chain(self.value_columns, self.index_columns)
+                }
+            )
+            self._copy_index_to_pandas(df)
+            # Honor `squeeze` exactly like the non-empty path above so the
+            # yielded type does not depend on whether any rows were returned.
+            if squeeze:
+                yield df.squeeze(axis=1)
+            else:
+                yield df
+
     def _copy_index_to_pandas(self, df: pd.DataFrame):
         """Set the index on pandas DataFrame to match this block.
 
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index afe3b53d6d..f738a32ec0 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -347,6 +347,28 @@ def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
     pd.testing.assert_series_equal(actual, expected)
 
 
+def test_to_pandas_batches_w_empty_dataframe(session):
+    """Verify to_pandas_batches() returns at least one DataFrame.
+
+    See b/428918844 for additional context.
+    """
+    empty = bpd.DataFrame(
+        {
+            "idx1": [],
+            "idx2": [],
+            "col1": pandas.Series([], dtype="string[pyarrow]"),
+            "col2": pandas.Series([], dtype="Int64"),
+        },
+        session=session,
+    ).set_index(["idx1", "idx2"], drop=True)
+
+    results = list(empty.to_pandas_batches())
+    assert len(results) == 1
+    assert list(results[0].index.names) == ["idx1", "idx2"]
+    assert list(results[0].columns) == ["col1", "col2"]
+    pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes)
+
+
 @pytest.mark.parametrize("allow_large_results", (True, False))
 def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
     """Verify to_pandas_batches() APIs returns the expected page size.