diff --git a/bigframes/session/_io/bigquery/read_gbq_query.py b/bigframes/session/_io/bigquery/read_gbq_query.py index aed77615ce..b650266a0d 100644 --- a/bigframes/session/_io/bigquery/read_gbq_query.py +++ b/bigframes/session/_io/bigquery/read_gbq_query.py @@ -32,6 +32,28 @@ import bigframes.session +def should_return_query_results(query_job: bigquery.QueryJob) -> bool: + """Returns True if query_job is the kind of query we expect results from. + + If the query was DDL or DML, return some job metadata. See + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.statement_type + for possible statement types. Note that destination table does exist + for some DDL operations such as CREATE VIEW, but we don't want to + read from that. See internal issue b/444282709. + """ + + if query_job.statement_type == "SELECT": + return True + + if query_job.statement_type == "SCRIPT": + # Try to determine if the last statement is a SELECT. Alternatively, we + # could do a jobs.list request using query_job as the parent job and + # try to determine the statement type of the last child job. + return query_job.destination != query_job.ddl_target_table + + return False + + def create_dataframe_from_query_job_stats( query_job: Optional[bigquery.QueryJob], *, session: bigframes.session.Session ) -> dataframe.DataFrame: diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 49b1195235..94d8db6f36 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -42,6 +42,7 @@ from google.cloud import bigquery_storage_v1 import google.cloud.bigquery import google.cloud.bigquery as bigquery +import google.cloud.bigquery.table from google.cloud.bigquery_storage_v1 import types as bq_storage_types import pandas import pyarrow as pa @@ -1004,7 +1005,7 @@ def read_gbq_query( configuration=configuration, ) query_job_for_metrics = query_job - rows = None + rows: Optional[google.cloud.bigquery.table.RowIterator] = None else: job_config = typing.cast( bigquery.QueryJobConfig, @@ -1037,21 +1038,14 @@ def read_gbq_query( query_job=query_job_for_metrics, row_iterator=rows ) - # It's possible that there's no job and corresponding destination table. - # In this case, we must create a local node. + # It's possible that there's no job and therefore no corresponding + # destination table. In this case, we must create a local node. # # TODO(b/420984164): Tune the threshold for which we download to # local node. Likely there are a wide range of sizes in which it # makes sense to download the results beyond the first page, even if # there is a job and destination table available. - if ( - rows is not None - and destination is None - and ( - query_job_for_metrics is None - or query_job_for_metrics.statement_type == "SELECT" - ) - ): + if query_job_for_metrics is None and rows is not None: return bf_read_gbq_query.create_dataframe_from_row_iterator( rows, session=self._session, @@ -1059,22 +1053,43 @@ def read_gbq_query( columns=columns, ) - # If there was no destination table and we've made it this far, that - # means the query must have been DDL or DML. Return some job metadata, - # instead. - if not destination: + # We already checked rows, so if there's no destination table, then + # there are no results to return. + if destination is None: return bf_read_gbq_query.create_dataframe_from_query_job_stats( query_job_for_metrics, session=self._session, ) + # If the query was DDL or DML, return some job metadata. See + # https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.statement_type + # for possible statement types. Note that destination table does exist + # for some DDL operations such as CREATE VIEW, but we don't want to + # read from that. See internal issue b/444282709. + if ( + query_job_for_metrics is not None + and not bf_read_gbq_query.should_return_query_results(query_job_for_metrics) + ): + return bf_read_gbq_query.create_dataframe_from_query_job_stats( + query_job_for_metrics, + session=self._session, + ) + + # Speed up counts by getting counts from result metadata. + if rows is not None: + n_rows = rows.total_rows + elif query_job_for_metrics is not None: + n_rows = query_job_for_metrics.result().total_rows + else: + n_rows = None + return self.read_gbq_table( f"{destination.project}.{destination.dataset_id}.{destination.table_id}", index_col=index_col, columns=columns, use_cache=configuration["query"]["useQueryCache"], force_total_order=force_total_order, - n_rows=query_job.result().total_rows, + n_rows=n_rows, # max_results and filters are omitted because they are already # handled by to_query(), above. ) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 892f8c8898..38d66bceb2 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -430,18 +430,63 @@ def test_read_gbq_w_max_results( assert bf_result.shape[0] == max_results -def test_read_gbq_w_script_no_select(session, dataset_id: str): - ddl = f""" - CREATE TABLE `{dataset_id}.test_read_gbq_w_ddl` ( - `col_a` INT64, - `col_b` STRING - ); - - INSERT INTO `{dataset_id}.test_read_gbq_w_ddl` - VALUES (123, 'hello world'); - """ - df = session.read_gbq(ddl).to_pandas() - assert df["statement_type"][0] == "SCRIPT" +@pytest.mark.parametrize( + ("sql_template", "expected_statement_type"), + ( + pytest.param( + """ + CREATE OR REPLACE TABLE `{dataset_id}.test_read_gbq_w_ddl` ( + `col_a` INT64, + `col_b` STRING + ); + """, + "CREATE_TABLE", + id="ddl-create-table", + ), + pytest.param( + # From https://cloud.google.com/bigquery/docs/boosted-tree-classifier-tutorial + """ + CREATE OR REPLACE VIEW `{dataset_id}.test_read_gbq_w_create_view` + AS + SELECT + age, + workclass, + marital_status, + education_num, + occupation, + hours_per_week, + income_bracket, + CASE + WHEN MOD(functional_weight, 10) < 8 THEN 'training' + WHEN MOD(functional_weight, 10) = 8 THEN 'evaluation' + WHEN MOD(functional_weight, 10) = 9 THEN 'prediction' + END AS dataframe + FROM + `bigquery-public-data.ml_datasets.census_adult_income`; + """, + "CREATE_VIEW", + id="ddl-create-view", + ), + pytest.param( + """ + CREATE OR REPLACE TABLE `{dataset_id}.test_read_gbq_w_dml` ( + `col_a` INT64, + `col_b` STRING + ); + + INSERT INTO `{dataset_id}.test_read_gbq_w_dml` + VALUES (123, 'hello world'); + """, + "SCRIPT", + id="dml", + ), + ), +) +def test_read_gbq_w_script_no_select( + session, dataset_id: str, sql_template: str, expected_statement_type: str +): + df = session.read_gbq(sql_template.format(dataset_id=dataset_id)).to_pandas() + assert df["statement_type"][0] == expected_statement_type def test_read_gbq_twice_with_same_timestamp(session, penguins_table_id):