From b162224507381611d4bbe7b204185cc344db070a Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 3 Apr 2025 05:39:56 +0000
Subject: [PATCH 1/3] fix: page_size without max_results does not work with
 to_pandas_batches

---
 bigframes/dataframe.py        | 66 +++++++++++++++++++++++++++++++++--
 bigframes/session/executor.py |  8 ++++-
 2 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 2056c192ad..b28762c440 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1634,6 +1634,39 @@ def to_pandas(
     ) -> pandas.DataFrame | pandas.Series:
         """Write DataFrame to pandas DataFrame.

+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> df = bpd.DataFrame({'col': [4, 2, 2]})
+
+        Download the data from BigQuery and convert it into an in-memory pandas DataFrame.
+
+            >>> df.to_pandas()
+               col
+            0    4
+            1    2
+            2    2
+
+        Estimate job statistics without processing or downloading data by using `dry_run=True`.
+
+            >>> df.to_pandas(dry_run=True) # doctest: +SKIP
+            columnCount                                                            1
+            columnDtypes                                              {'col': Int64}
+            indexLevel                                                             1
+            indexDtypes                                                      [Int64]
+            projectId                                                  bigframes-dev
+            location                                                              US
+            jobType                                                            QUERY
+            destinationTable       {'projectId': 'bigframes-dev', 'datasetId': '_...
+            useLegacySql                                                       False
+            referencedTables                                                    None
+            totalBytesProcessed                                                    0
+            cacheHit                                                           False
+            statementType                                                     SELECT
+            creationTime                            2025-04-02 20:17:12.038000+00:00
+            dtype: object
+
         Args:
             max_download_size (int, default None):
                 Download size threshold in MB. If max_download_size is exceeded when downloading data
@@ -1702,11 +1735,40 @@ def to_pandas_batches(
             page_size and max_results determine the size and number of batches,
             see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result

+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> df = bpd.DataFrame({'col': [4, 3, 2, 2, 3]})
+
+        Iterate through the results in batches, limiting the total rows yielded
+        across all batches via `max_results`:
+
+            >>> for df_batch in df.to_pandas_batches(max_results=3):
+            ...     print(df_batch)
+               col
+            0    4
+            1    3
+            2    2
+
+        Alternatively, control the approximate size of each batch using `page_size`
+        and fetch batches manually using `next()`:
+
+            >>> it = df.to_pandas_batches(page_size=2)
+            >>> next(it)
+               col
+            0    4
+            1    3
+            >>> next(it)
+               col
+            2    2
+            3    2
+
         Args:
             page_size (int, default None):
-                The size of each batch.
+                The maximum number of rows of each batch. Non-positive values are ignored.
             max_results (int, default None):
-                If given, only download this many rows at maximum.
+                The maximum total number of rows of all batches.
             allow_large_results (bool, default None):
                 If not None, overrides the global setting to allow or disallow large query results
                 over the default size limit of 10 GB.
diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py
index aabbbdcf5d..150122b7dd 100644
--- a/bigframes/session/executor.py
+++ b/bigframes/session/executor.py
@@ -264,7 +264,13 @@ def execute(

         # Though we provide the read client, iterator may or may not use it based on what is efficient for the result
         def iterator_supplier():
-            return iterator.to_arrow_iterable(bqstorage_client=self.bqstoragereadclient)
+            # Workaround issue fixed by: https://github.com/googleapis/python-bigquery/pull/2154
+            if iterator._page_size is not None or iterator.max_results is not None:
+                return iterator.to_arrow_iterable(bqstorage_client=None)
+            else:
+                return iterator.to_arrow_iterable(
+                    bqstorage_client=self.bqstoragereadclient
+                )

         if query_job:
             size_bytes = self.bqclient.get_table(query_job.destination).num_bytes

From 806e762ad6b02c7d31830b0a641f78c9772d6d62 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 3 Apr 2025 18:57:15 +0000
Subject: [PATCH 2/3] add series.to_pandas code example

---
 bigframes/dataframe.py |  3 ---
 bigframes/series.py    | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index b28762c440..135522ebd5 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1699,9 +1699,6 @@ def to_pandas(
             downsampled rows and all columns of this DataFrame. If dry_run is
             set, a pandas Series containing dry run statistics will be returned.
         """
-
-        # TODO(orrbradford): Optimize this in future. Potentially some cases where we can return the stored query job
-
         if dry_run:
             dry_run_stats, dry_run_job = self._block._compute_dry_run(
                 max_download_size=max_download_size,
diff --git a/bigframes/series.py b/bigframes/series.py
index d2a3dcf78f..817aef0c2a 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -385,6 +385,39 @@ def to_pandas(
     ) -> pandas.Series:
         """Writes Series to pandas Series.

+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> s = bpd.Series([4, 3, 2])
+
+        Download the data from BigQuery and convert it into an in-memory pandas Series.
+
+            >>> s.to_pandas()
+            0    4
+            1    3
+            2    2
+            dtype: Int64
+
+        Estimate job statistics without processing or downloading data by using `dry_run=True`.
+
+            >>> s.to_pandas(dry_run=True) # doctest: +SKIP
+            columnCount                                                            1
+            columnDtypes                                               {None: Int64}
+            indexLevel                                                             1
+            indexDtypes                                                      [Int64]
+            projectId                                                  bigframes-dev
+            location                                                              US
+            jobType                                                            QUERY
+            destinationTable       {'projectId': 'bigframes-dev', 'datasetId': '_...
+            useLegacySql                                                       False
+            referencedTables                                                    None
+            totalBytesProcessed                                                    0
+            cacheHit                                                           False
+            statementType                                                     SELECT
+            creationTime                            2025-04-03 18:54:59.219000+00:00
+            dtype: object
+
         Args:
             max_download_size (int, default None):
                 Download size threshold in MB. If max_download_size is exceeded when downloading data
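Note: the dry-run path documented above returns the job statistics as a pandas Series instead of downloading any rows, so the estimate can be inspected programmatically before a full download. A minimal usage sketch, assuming the statistics keep the field names shown in the doctest output (totalBytesProcessed, cacheHit):

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None
    s = bpd.Series([4, 3, 2])

    # dry_run=True returns a pandas Series of job statistics; no rows are downloaded.
    stats = s.to_pandas(dry_run=True)
    print(stats["totalBytesProcessed"], stats["cacheHit"])

    # Fetch the real data only once the estimate looks acceptable.
    data = s.to_pandas()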
From 7e0d968f21203e6dd1cf7051041339b0abb88763 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 3 Apr 2025 19:01:00 +0000
Subject: [PATCH 3/3] fix test_to_pandas_batches_override_global_option:
 workaround solution can skip one warning message

---
 tests/system/large/test_dataframe_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/system/large/test_dataframe_io.py b/tests/system/large/test_dataframe_io.py
index 76a7001fe3..ee9daa4e31 100644
--- a/tests/system/large/test_dataframe_io.py
+++ b/tests/system/large/test_dataframe_io.py
@@ -44,7 +44,7 @@ def test_to_pandas_batches_override_global_option(
                 page_size=500, max_results=1500, allow_large_results=True
             )
         )
-        assert len(w) == 2
+        assert len(w) == 1
         assert issubclass(w[0].category, FutureWarning)
         assert "The query result size has exceeded 10 GB." in str(w[0].message)
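Taken together, these patches make page_size effective on its own: to_pandas_batches no longer needs max_results for batching to take effect. A minimal end-to-end sketch mirroring the new doctest (the toy frame is illustrative; per the updated docstring, each batch holds at most page_size rows):

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None
    df = bpd.DataFrame({"col": [4, 3, 2, 2, 3]})

    # page_size alone is now honored: each yielded pandas DataFrame holds at
    # most page_size rows, and iteration continues until the result is consumed.
    for batch in df.to_pandas_batches(page_size=2):
        print(len(batch), list(batch["col"]))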