fix: ensure `page_size` works correctly in `to_pandas_batches` when `max_results` is not set by chelsea-lin · Pull Request #1588 · googleapis/python-bigquery-dataframes · GitHub
Merged
69 changes: 64 additions & 5 deletions bigframes/dataframe.py
@@ -1634,6 +1634,39 @@ def to_pandas(
) -> pandas.DataFrame | pandas.Series:
"""Write DataFrame to pandas DataFrame.

**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> df = bpd.DataFrame({'col': [4, 2, 2]})

Download the data from BigQuery and convert it into an in-memory pandas DataFrame.

>>> df.to_pandas()
col
0 4
1 2
2 2

Estimate job statistics without processing or downloading data by using `dry_run=True`.

>>> df.to_pandas(dry_run=True) # doctest: +SKIP
columnCount 1
columnDtypes {'col': Int64}
indexLevel 1
indexDtypes [Int64]
projectId bigframes-dev
location US
jobType QUERY
destinationTable {'projectId': 'bigframes-dev', 'datasetId': '_...
useLegacySql False
referencedTables None
totalBytesProcessed 0
cacheHit False
statementType SELECT
creationTime 2025-04-02 20:17:12.038000+00:00
dtype: object

Args:
max_download_size (int, default None):
Download size threshold in MB. If max_download_size is exceeded when downloading data
@@ -1666,9 +1699,6 @@ def to_pandas(
downsampled rows and all columns of this DataFrame. If dry_run is set, a pandas
Series containing dry run statistics will be returned.
"""

# TODO(orrbradford): Optimize this in future. Potentially some cases where we can return the stored query job

if dry_run:
dry_run_stats, dry_run_job = self._block._compute_dry_run(
max_download_size=max_download_size,
@@ -1702,11 +1732,40 @@ def to_pandas_batches(
page_size and max_results determine the size and number of batches,
see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result

**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> df = bpd.DataFrame({'col': [4, 3, 2, 2, 3]})

Iterate through the results in batches, limiting the total rows yielded
across all batches via `max_results`:

>>> for df_batch in df.to_pandas_batches(max_results=3):
... print(df_batch)
col
0 4
1 3
2 2

Alternatively, control the approximate size of each batch using `page_size`
and fetch batches manually using `next()`:

>>> it = df.to_pandas_batches(page_size=2)
>>> next(it)
col
0 4
1 3
>>> next(it)
col
2 2
3 2

Args:
page_size (int, default None):
The size of each batch.
The maximum number of rows of each batch. Non-positive values are ignored.
max_results (int, default None):
If given, only download this many rows at maximum.
The maximum total number of rows of all batches.
allow_large_results (bool, default None):
If not None, overrides the global setting to allow or disallow large query results
over the default size limit of 10 GB.
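The new `page_size`/`max_results` semantics mirror the parameters of `QueryJob.result()` linked in the docstring. Below is a minimal sketch of that underlying google-cloud-bigquery call, not part of this PR; the project and table names are placeholders.

```python
# Sketch of the underlying BigQuery client call that to_pandas_batches builds on.
# Project/dataset/table names are placeholders; only the page_size/max_results
# semantics are the point here.
from google.cloud import bigquery

client = bigquery.Client()
job = client.query("SELECT col FROM `my-project.my_dataset.my_table`")

# page_size caps the rows per page, max_results caps the total rows returned.
rows = job.result(page_size=2, max_results=3)
for page_df in rows.to_dataframe_iterable():
    print(len(page_df))  # at most 2 rows per page, 3 rows overall
```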
33 changes: 33 additions & 0 deletions bigframes/series.py
@@ -385,6 +385,39 @@ def to_pandas(
) -> pandas.Series:
"""Writes Series to pandas Series.

**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> s = bpd.Series([4, 3, 2])

Download the data from BigQuery and convert it into an in-memory pandas Series.

>>> s.to_pandas()
0 4
1 3
2 2
dtype: Int64

Estimate job statistics without processing or downloading data by using `dry_run=True`.

>>> s.to_pandas(dry_run=True) # doctest: +SKIP
columnCount 1
columnDtypes {None: Int64}
indexLevel 1
indexDtypes [Int64]
projectId bigframes-dev
location US
jobType QUERY
destinationTable {'projectId': 'bigframes-dev', 'datasetId': '_...
useLegacySql False
referencedTables None
totalBytesProcessed 0
cacheHit False
statementType SELECT
creationTime 2025-04-03 18:54:59.219000+00:00
dtype: object

Args:
max_download_size (int, default None):
Download size threshold in MB. If max_download_size is exceeded when downloading data
8 changes: 7 additions & 1 deletion bigframes/session/executor.py
@@ -264,7 +264,13 @@ def execute(

# Though we provide the read client, iterator may or may not use it based on what is efficient for the result
def iterator_supplier():
return iterator.to_arrow_iterable(bqstorage_client=self.bqstoragereadclient)
# Workaround issue fixed by: https://github.com/googleapis/python-bigquery/pull/2154
if iterator._page_size is not None or iterator.max_results is not None:
return iterator.to_arrow_iterable(bqstorage_client=None)
else:
return iterator.to_arrow_iterable(
bqstorage_client=self.bqstoragereadclient
)

if query_job:
size_bytes = self.bqclient.get_table(query_job.destination).num_bytes
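The executor change above sidesteps a bug where the BigQuery Storage read path ignores `page_size`/`max_results` until googleapis/python-bigquery#2154 is released. A standalone sketch of the same decision follows; the helper name and arguments are illustrative, not part of the PR.

```python
# Illustrative standalone version of the workaround in executor.py above.
# `iterator` is a google.cloud.bigquery.table.RowIterator; `bqstorage_client`
# is a BigQuery Storage read client. The helper name is hypothetical.
def arrow_batches(iterator, bqstorage_client):
    # When either batching knob is set, fall back to the REST iterator, which
    # honors page_size/max_results; the Storage API streams data in its own
    # chunk sizes and ignores them until the upstream fix lands.
    if iterator._page_size is not None or iterator.max_results is not None:
        return iterator.to_arrow_iterable(bqstorage_client=None)
    return iterator.to_arrow_iterable(bqstorage_client=bqstorage_client)
```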
2 changes: 1 addition & 1 deletion tests/system/large/test_dataframe_io.py
@@ -44,7 +44,7 @@ def test_to_pandas_batches_override_global_option(
page_size=500, max_results=1500, allow_large_results=True
)
)
assert len(w) == 2
assert len(w) == 1
assert issubclass(w[0].category, FutureWarning)
assert "The query result size has exceeded 10 GB." in str(w[0].message)

Expand Down