From 1bc248e6da1e6c0e7193a14136eba08c19eea8bc Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 30 Apr 2025 02:48:55 +0000 Subject: [PATCH 1/9] feat: add dry_run parameter to read_gbq() and read_gbq_query() --- bigframes/core/blocks.py | 70 +++---------------- bigframes/pandas/io/api.py | 73 +++++++++++++++++++- bigframes/session/__init__.py | 90 +++++++++++++++++++++++-- bigframes/session/dry_run_jobs.py | 104 +++++++++++++++++++++++++++++ bigframes/session/loader.py | 49 +++++++++++++- tests/system/small/test_session.py | 64 ++++++++++++++++++ 6 files changed, 382 insertions(+), 68 deletions(-) create mode 100644 bigframes/session/dry_run_jobs.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index cc3b70f8a8..d4e65c1495 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -22,7 +22,6 @@ from __future__ import annotations import ast -import copy import dataclasses import datetime import functools @@ -30,17 +29,7 @@ import random import textwrap import typing -from typing import ( - Any, - Iterable, - List, - Literal, - Mapping, - Optional, - Sequence, - Tuple, - Union, -) +from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple, Union import warnings import bigframes_vendored.constants as constants @@ -69,6 +58,7 @@ import bigframes.exceptions as bfe import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +from bigframes.session import dry_run_jobs # Type constraint for wherever column labels are used Label = typing.Hashable @@ -821,60 +811,18 @@ def _compute_dry_run( if sampling.enable_downsampling: raise NotImplementedError("Dry run with sampling is not supported") - index: List[Any] = [] - values: List[Any] = [] - - index.append("columnCount") - values.append(len(self.value_columns)) - index.append("columnDtypes") - values.append( - { - col: self.expr.get_column_type(self.resolve_label_exact_or_error(col)) - for col in self.column_labels - } - ) - - index.append("indexLevel") - values.append(self.index.nlevels) - index.append("indexDtypes") - values.append(self.index.dtypes) - expr = self._apply_value_keys_to_expr(value_keys=value_keys) query_job = self.session._executor.dry_run(expr, ordered) - job_api_repr = copy.deepcopy(query_job._properties) - - job_ref = job_api_repr["jobReference"] - for key, val in job_ref.items(): - index.append(key) - values.append(val) - - index.append("jobType") - values.append(job_api_repr["configuration"]["jobType"]) - - query_config = job_api_repr["configuration"]["query"] - for key in ("destinationTable", "useLegacySql"): - index.append(key) - values.append(query_config.get(key)) - - query_stats = job_api_repr["statistics"]["query"] - for key in ( - "referencedTables", - "totalBytesProcessed", - "cacheHit", - "statementType", - ): - index.append(key) - values.append(query_stats.get(key)) - index.append("creationTime") - values.append( - pd.Timestamp( - job_api_repr["statistics"]["creationTime"], unit="ms", tz="UTC" - ) + column_dtypes = { + col: self.expr.get_column_type(self.resolve_label_exact_or_error(col)) + for col in self.column_labels + } + return ( + dry_run_jobs.get_stats_with_dtypes(query_job, column_dtypes, self.index.dtypes), + query_job, ) - return pd.Series(values, index=index), query_job - def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None): expr = self._expr if value_keys is not None: diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index a119ff67b0..91229f1528 100644 --- 
a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -25,6 +25,7 @@ Literal, MutableSequence, Optional, + overload, Sequence, Tuple, Union, @@ -155,6 +156,38 @@ def read_json( read_json.__doc__ = inspect.getdoc(bigframes.session.Session.read_json) +@overload +def read_gbq( # type: ignore[overload-overlap] + query_or_table: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + configuration: Optional[Dict] = ..., + max_results: Optional[int] = ..., + filters: vendored_pandas_gbq.FiltersType = ..., + use_cache: Optional[bool] = ..., + col_order: Iterable[str] = ..., + dry_run: Literal[False] = ..., +) -> bigframes.dataframe.DataFrame: + ... + + +@overload +def read_gbq( + query_or_table: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + configuration: Optional[Dict] = ..., + max_results: Optional[int] = ..., + filters: vendored_pandas_gbq.FiltersType = ..., + use_cache: Optional[bool] = ..., + col_order: Iterable[str] = ..., + dry_run: Literal[True] = ..., +) -> pandas.Series: + ... + + def read_gbq( query_or_table: str, *, @@ -165,7 +198,8 @@ def read_gbq( filters: vendored_pandas_gbq.FiltersType = (), use_cache: Optional[bool] = None, col_order: Iterable[str] = (), -) -> bigframes.dataframe.DataFrame: + dry_run: bool = False, +) -> bigframes.dataframe.DataFrame | pandas.Series: _set_default_session_location_if_possible(query_or_table) return global_session.with_default_session( bigframes.session.Session.read_gbq, @@ -177,6 +211,7 @@ def read_gbq( filters=filters, use_cache=use_cache, col_order=col_order, + dry_run=dry_run, ) @@ -208,6 +243,38 @@ def read_gbq_object_table( ) +@overload +def read_gbq_query( # type: ignore[overload-overlap] + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + configuration: Optional[Dict] = ..., + max_results: Optional[int] = ..., + use_cache: Optional[bool] = ..., + col_order: Iterable[str] = ..., + filters: vendored_pandas_gbq.FiltersType = ..., + dry_run: Literal[False] = ..., +) -> bigframes.dataframe.DataFrame: + ... + + +@overload +def read_gbq_query( + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + configuration: Optional[Dict] = ..., + max_results: Optional[int] = ..., + use_cache: Optional[bool] = ..., + col_order: Iterable[str] = ..., + filters: vendored_pandas_gbq.FiltersType = ..., + dry_run: Literal[True] = ..., +) -> pandas.Series: + ... 
+ + def read_gbq_query( query: str, *, @@ -218,7 +285,8 @@ def read_gbq_query( use_cache: Optional[bool] = None, col_order: Iterable[str] = (), filters: vendored_pandas_gbq.FiltersType = (), -) -> bigframes.dataframe.DataFrame: + dry_run: bool = False, +) -> bigframes.dataframe.DataFrame | pandas.Series: _set_default_session_location_if_possible(query) return global_session.with_default_session( bigframes.session.Session.read_gbq_query, @@ -230,6 +298,7 @@ def read_gbq_query( use_cache=use_cache, col_order=col_order, filters=filters, + dry_run=dry_run, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 6379a6f2e8..8d1b1928bd 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -30,6 +30,7 @@ Literal, MutableSequence, Optional, + overload, Sequence, Tuple, Union, @@ -388,6 +389,38 @@ def close(self): self.bqclient, self.cloudfunctionsclient, self.session_id ) + @overload + def read_gbq( + self, + query_or_table: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + configuration: Optional[Dict] = ..., + max_results: Optional[int] = ..., + filters: third_party_pandas_gbq.FiltersType = ..., + use_cache: Optional[bool] = ..., + col_order: Iterable[str] = ..., + dry_run: Literal[False] = ..., + ) -> dataframe.DataFrame: + ... + + @overload + def read_gbq( + self, + query_or_table: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + configuration: Optional[Dict] = ..., + max_results: Optional[int] = ..., + filters: third_party_pandas_gbq.FiltersType = ..., + use_cache: Optional[bool] = ..., + col_order: Iterable[str] = ..., + dry_run: Literal[True] = ..., + ) -> pandas.Series: + ... + def read_gbq( self, query_or_table: str, @@ -399,8 +432,9 @@ def read_gbq( filters: third_party_pandas_gbq.FiltersType = (), use_cache: Optional[bool] = None, col_order: Iterable[str] = (), + dry_run: bool = False # Add a verify index argument that fails if the index is not unique. - ) -> dataframe.DataFrame: + ) -> dataframe.DataFrame | pandas.Series: # TODO(b/281571214): Generate prompt to show the progress of read_gbq. if columns and col_order: raise ValueError( @@ -410,7 +444,7 @@ def read_gbq( columns = col_order if bf_io_bigquery.is_query(query_or_table): - return self._loader.read_gbq_query( + return self._loader.read_gbq_query( # type: ignore # for dry_run overload query_or_table, index_col=index_col, columns=columns, @@ -419,6 +453,20 @@ def read_gbq( api_name="read_gbq", use_cache=use_cache, filters=filters, + dry_run=dry_run, + ) + elif dry_run: + query = f"SELECT * FROM `{query_or_table}`" + return self._loader.read_gbq_query( + query, + index_col=index_col, + columns=columns, + configuration=configuration, + max_results=max_results, + api_name="read_gbq", + use_cache=use_cache, + filters=filters, + dry_run=True, ) else: if configuration is not None: @@ -446,6 +494,38 @@ def _register_object( ): self._objects.append(weakref.ref(object)) + @overload + def read_gbq_query( + self, + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + configuration: Optional[Dict] = ..., + max_results: Optional[int] = ..., + use_cache: Optional[bool] = ..., + col_order: Iterable[str] = ..., + filters: third_party_pandas_gbq.FiltersType = ..., + dry_run: Literal[False] = ..., + ) -> dataframe.DataFrame: + ... 
+ + @overload + def read_gbq_query( + self, + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + configuration: Optional[Dict] = ..., + max_results: Optional[int] = ..., + use_cache: Optional[bool] = ..., + col_order: Iterable[str] = ..., + filters: third_party_pandas_gbq.FiltersType = ..., + dry_run: Literal[True] = ..., + ) -> pandas.Series: + ... + def read_gbq_query( self, query: str, @@ -457,7 +537,8 @@ def read_gbq_query( use_cache: Optional[bool] = None, col_order: Iterable[str] = (), filters: third_party_pandas_gbq.FiltersType = (), - ) -> dataframe.DataFrame: + dry_run: bool = False, + ) -> dataframe.DataFrame | pandas.Series: """Turn a SQL query into a DataFrame. Note: Because the results are written to a temporary table, ordering by @@ -523,7 +604,7 @@ def read_gbq_query( elif col_order: columns = col_order - return self._loader.read_gbq_query( + return self._loader.read_gbq_query( # type: ignore # for dry_run overload query=query, index_col=index_col, columns=columns, @@ -532,6 +613,7 @@ def read_gbq_query( api_name="read_gbq_query", use_cache=use_cache, filters=filters, + dry_run=dry_run, ) def read_gbq_table( diff --git a/bigframes/session/dry_run_jobs.py b/bigframes/session/dry_run_jobs.py new file mode 100644 index 0000000000..fc7866c3d2 --- /dev/null +++ b/bigframes/session/dry_run_jobs.py @@ -0,0 +1,104 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import copy +from typing import Dict, Sequence + +from google.cloud import bigquery +import pandas + +from bigframes import dtypes + + +def get_stats_with_inferred_dtypes( + query_job: bigquery.QueryJob, + value_cols: Sequence[str], + index_col: Sequence[str], +) -> pandas.Series: + if query_job.schema is None: + # If the schema is not available, don't bother inferring dtypes. 
+ return get_stats(query_job) + + col_dtypes = dtypes.bf_type_from_type_kind(query_job.schema) + + if value_cols: + value_col_dtypes = { + col: col_dtypes[col] for col in value_cols if col in col_dtypes + } + else: + # Use every column that is not mentioned as an index column + value_col_dtypes = { + col: dtype for col, dtype in col_dtypes.items() if col not in set(index_col) + } + + index_dtypes = [col_dtypes[col] for col in index_col] + + return get_stats_with_dtypes(query_job, value_col_dtypes, index_dtypes) + + +def get_stats_with_dtypes( + query_job: bigquery.QueryJob, + column_dtypes: Dict[str, dtypes.Dtype], + index_dtypes: Sequence[dtypes.Dtype], +) -> pandas.Series: + index = ["columnCount", "columnDtypes", "indexLevel", "indexDtypes"] + values = [len(column_dtypes), column_dtypes, len(index_dtypes), index_dtypes] + + s = pandas.Series(values, index=index) + + return pandas.concat([s, get_stats(query_job)]) + + +def get_stats( + query_job: bigquery.QueryJob, +) -> pandas.Series: + """Returns important stats from the query job as a Pandas Series.""" + + index = [] + values = [] + + job_api_repr = copy.deepcopy(query_job._properties) + + job_ref = job_api_repr["jobReference"] + for key, val in job_ref.items(): + index.append(key) + values.append(val) + + index.append("jobType") + values.append(job_api_repr["configuration"]["jobType"]) + + query_config = job_api_repr["configuration"]["query"] + for key in ("destinationTable", "useLegacySql"): + index.append(key) + values.append(query_config.get(key)) + + query_stats = job_api_repr["statistics"]["query"] + for key in ( + "referencedTables", + "totalBytesProcessed", + "cacheHit", + "statementType", + ): + index.append(key) + values.append(query_stats.get(key)) + + index.append("creationTime") + values.append( + pandas.Timestamp( + job_api_repr["statistics"]["creationTime"], unit="ms", tz="UTC" + ) + ) + + return pandas.Series(values, index=index) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 76f12ae438..0cb1d964b5 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -30,6 +30,7 @@ List, Literal, Optional, + overload, Sequence, Tuple, ) @@ -49,6 +50,7 @@ import bigframes.core.schema as schemata import bigframes.dtypes import bigframes.formatting_helpers as formatting_helpers +from bigframes.session import dry_run_jobs import bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table import bigframes.session.metrics @@ -468,6 +470,7 @@ def read_gbq_table( columns=columns, api_name=api_name, use_cache=use_cache, + dry_run=False, ) # ----------------------------------------- @@ -626,6 +629,38 @@ def read_bigquery_load_job( api_name="read_gbq_table", ) + @overload + def read_gbq_query( + self, + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + configuration: Optional[Dict] = ..., + max_results: Optional[int] = ..., + api_name: str = ..., + use_cache: Optional[bool] = ..., + filters: third_party_pandas_gbq.FiltersType = ..., + dry_run: Literal[False] = ..., + ) -> dataframe.DataFrame: + ... 
+ + @overload + def read_gbq_query( + self, + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + configuration: Optional[Dict] = ..., + max_results: Optional[int] = ..., + api_name: str = ..., + use_cache: Optional[bool] = ..., + filters: third_party_pandas_gbq.FiltersType = ..., + dry_run: Literal[True] = ..., + ) -> pandas.Series: + ... + def read_gbq_query( self, query: str, @@ -637,7 +672,8 @@ def read_gbq_query( api_name: str = "read_gbq_query", use_cache: Optional[bool] = None, filters: third_party_pandas_gbq.FiltersType = (), - ) -> dataframe.DataFrame: + dry_run: bool = False, + ) -> dataframe.DataFrame | pandas.Series: import bigframes.dataframe as dataframe configuration = _transform_read_gbq_configuration(configuration) @@ -683,6 +719,17 @@ def read_gbq_query( time_travel_timestamp=None, ) + if dry_run: + job_config = typing.cast( + bigquery.QueryJobConfig, + bigquery.QueryJobConfig.from_api_repr(configuration), + ) + job_config.dry_run = True + query_job = self._bqclient.query(query, job_config=job_config) + return dry_run_jobs.get_stats_with_inferred_dtypes( + query_job, list(columns), index_cols + ) + # No cluster candidates as user query might not be clusterable (eg because of ORDER BY clause) destination, query_job = self._query_to_destination( query, diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index c7bf5b3f5e..4cd3cf31ee 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1727,3 +1727,67 @@ def test_read_gbq_duplicate_columns_xfail( index_col=index_col, columns=columns, ) + + +def test_read_gbq_with_table_ref_dry_run(scalars_table_id, session): + result = session.read_gbq(scalars_table_id, dry_run=True) + + assert isinstance(result, pd.Series) + _assert_dry_run_stats_are_valid(result) + + +def test_read_gbq_with_query_dry_run(scalars_table_id, session): + query = f"SELECT * FROM {scalars_table_id} LIMIT 10;" + result = session.read_gbq(query, dry_run=True) + + assert isinstance(result, pd.Series) + _assert_dry_run_stats_are_valid(result) + + +def test_read_gbq_dry_run_with_column_and_index(scalars_table_id, session): + query = f"SELECT * FROM {scalars_table_id} LIMIT 10;" + result = session.read_gbq( + query, dry_run=True, columns=["int64_col", "float64_col"], index_col="int64_too" + ) + + assert isinstance(result, pd.Series) + _assert_dry_run_stats_are_valid(result) + assert result["columnCount"] == 2 + assert result["columnDtypes"] == { + "int64_col": pd.Int64Dtype(), + "float64_col": pd.Float64Dtype(), + } + assert result["indexLevel"] == 1 + assert result["indexDtypes"] == [pd.Int64Dtype()] + + +def test_read_gbq_query_dry_run(scalars_table_id, session): + query = f"SELECT * FROM {scalars_table_id} LIMIT 10;" + result = session.read_gbq_query(query, dry_run=True) + + assert isinstance(result, pd.Series) + _assert_dry_run_stats_are_valid(result) + + +def _assert_dry_run_stats_are_valid(result: pd.Series): + expected_index = pd.Index( + [ + "columnCount", + "columnDtypes", + "indexLevel", + "indexDtypes", + "projectId", + "location", + "jobType", + "destinationTable", + "useLegacySql", + "referencedTables", + "totalBytesProcessed", + "cacheHit", + "statementType", + "creationTime", + ] + ) + + pd.testing.assert_index_equal(result.index, expected_index) + assert result["columnCount"] + result["indexLevel"] > 0 From 073c34158a76f1afc68efe33e8ec4847d10b9945 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 30 Apr 
2025 17:53:34 +0000 Subject: [PATCH 2/9] fix lint --- bigframes/core/blocks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index d4e65c1495..0c378e7054 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -819,7 +819,9 @@ def _compute_dry_run( for col in self.column_labels } return ( - dry_run_jobs.get_stats_with_dtypes(query_job, column_dtypes, self.index.dtypes), + dry_run_jobs.get_stats_with_dtypes( + query_job, column_dtypes, self.index.dtypes + ), query_job, ) From 2bceff77c1dacc6ed7d313c9f2a32afabed815b3 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Wed, 30 Apr 2025 16:59:59 -0700 Subject: [PATCH 3/9] chore(main): release 2.2.0 (#1643) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 43 +++++++++++++++++++++++ bigframes/version.py | 4 +-- third_party/bigframes_vendored/version.py | 4 +-- 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b1e331d1d..b6c08af05e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,49 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.2.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.1.0...v2.2.0) (2025-04-30) + + +### Features + +* Add gemini-2.0-flash-001 and gemini-2.0-flash-lite-001 to fine tune score endponts and multimodal endpoints ([#1650](https://github.com/googleapis/python-bigquery-dataframes/issues/1650)) ([4fb54df](https://github.com/googleapis/python-bigquery-dataframes/commit/4fb54dfe448604a90fc1818cf18b1e77e1e7227b)) +* Add GeminiTextGenerator.predict structured output ([#1653](https://github.com/googleapis/python-bigquery-dataframes/issues/1653)) ([6199023](https://github.com/googleapis/python-bigquery-dataframes/commit/6199023a6a71e72e926f5879e74a15215bc6e4a0)) +* DataFrames.__getitem__ support for slice input ([#1668](https://github.com/googleapis/python-bigquery-dataframes/issues/1668)) ([563f0cb](https://github.com/googleapis/python-bigquery-dataframes/commit/563f0cbdf4a18c3cd1bd2a4b52de823165638911)) +* Print right origin of `PreviewWarning` for the `bpd.udf` ([#1629](https://github.com/googleapis/python-bigquery-dataframes/issues/1629)) ([48d10d1](https://github.com/googleapis/python-bigquery-dataframes/commit/48d10d1f0150a29dd3b91f505f8d3874e0b88c42)) +* Session.bytes_processed_sum will be updated when allow_large_re… ([#1669](https://github.com/googleapis/python-bigquery-dataframes/issues/1669)) ([ae312db](https://github.com/googleapis/python-bigquery-dataframes/commit/ae312dbed25da6da5e2817d5c9838654c2a1ad1c)) +* Short circuit query for local scan ([#1618](https://github.com/googleapis/python-bigquery-dataframes/issues/1618)) ([e84f232](https://github.com/googleapis/python-bigquery-dataframes/commit/e84f232b0fc5e2167a7cddb355cf0c8837ae5422)) +* Support names parameter in read_csv for bigquery engine ([#1659](https://github.com/googleapis/python-bigquery-dataframes/issues/1659)) ([3388191](https://github.com/googleapis/python-bigquery-dataframes/commit/33881914ab5b8d0e701eabd9c731aed1deab3d49)) +* Support passing list of values to bigframes.core.sql.simple_literal ([#1641](https://github.com/googleapis/python-bigquery-dataframes/issues/1641)) ([102d363](https://github.com/googleapis/python-bigquery-dataframes/commit/102d363aa7e3245ff262c817bc756ea0eaee57e7)) +* Support write api as loading option 
([#1617](https://github.com/googleapis/python-bigquery-dataframes/issues/1617)) ([c46ad06](https://github.com/googleapis/python-bigquery-dataframes/commit/c46ad0647785a9207359eba0fb5b6f7a16610f2a)) + + +### Bug Fixes + +* DataFrame accessors is not pupulated ([#1639](https://github.com/googleapis/python-bigquery-dataframes/issues/1639)) ([28afa2c](https://github.com/googleapis/python-bigquery-dataframes/commit/28afa2c73c0517f9365fab05193706631b656551)) +* Prefer remote schema instead of throwing on materialize conflicts ([#1644](https://github.com/googleapis/python-bigquery-dataframes/issues/1644)) ([53fc25b](https://github.com/googleapis/python-bigquery-dataframes/commit/53fc25bfc86e166b91e5001506051b1cac34c996)) +* Remove itertools.pairwise usage ([#1638](https://github.com/googleapis/python-bigquery-dataframes/issues/1638)) ([9662745](https://github.com/googleapis/python-bigquery-dataframes/commit/9662745265c8c6e42f372629bd2c7806542cee1a)) +* Resolve issue where pre-release versions of google-auth are installed ([#1491](https://github.com/googleapis/python-bigquery-dataframes/issues/1491)) ([ebb7a5e](https://github.com/googleapis/python-bigquery-dataframes/commit/ebb7a5e2b24fa57d6fe6a76d9b857ad44c67d194)) +* Resolve some of the typo errors ([#1655](https://github.com/googleapis/python-bigquery-dataframes/issues/1655)) ([cd7fbde](https://github.com/googleapis/python-bigquery-dataframes/commit/cd7fbde026522f53a23a4bb6585ad8629769fad1)) + + +### Performance Improvements + +* Fold row count ops when known ([#1656](https://github.com/googleapis/python-bigquery-dataframes/issues/1656)) ([c958dbe](https://github.com/googleapis/python-bigquery-dataframes/commit/c958dbea32b77cec9fddfc09e3b40d1da220a42c)) +* Use flyweight for node fields ([#1654](https://github.com/googleapis/python-bigquery-dataframes/issues/1654)) ([8482bfc](https://github.com/googleapis/python-bigquery-dataframes/commit/8482bfc1d4caa91a35c4fbf0be420301d05ad544)) + + +### Dependencies + +* Support shapely 1.8.5+ again ([#1651](https://github.com/googleapis/python-bigquery-dataframes/issues/1651)) ([ae83e61](https://github.com/googleapis/python-bigquery-dataframes/commit/ae83e61c49ade64d6f727e9f364bd2f1aeec6e19)) + + +### Documentation + +* Add JSON data types notebook ([#1647](https://github.com/googleapis/python-bigquery-dataframes/issues/1647)) ([9128c4a](https://github.com/googleapis/python-bigquery-dataframes/commit/9128c4a31dab487bc23f67c43380abd0beda5b1c)) +* Add sample code snippets for `udf` ([#1649](https://github.com/googleapis/python-bigquery-dataframes/issues/1649)) ([53caa8d](https://github.com/googleapis/python-bigquery-dataframes/commit/53caa8d689e64436f5313095ee27479a06d8e8a8)) +* Fix `bq_dataframes_template` notebook to work if partial ordering mode is enabled ([#1665](https://github.com/googleapis/python-bigquery-dataframes/issues/1665)) ([f442e7a](https://github.com/googleapis/python-bigquery-dataframes/commit/f442e7a07ff273ba3af74eeabafb62110b78f692)) +* Note that `udf` is in preview and must be python 3.11 compatible ([#1629](https://github.com/googleapis/python-bigquery-dataframes/issues/1629)) ([48d10d1](https://github.com/googleapis/python-bigquery-dataframes/commit/48d10d1f0150a29dd3b91f505f8d3874e0b88c42)) + ## [2.1.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.0.0...v2.1.0) (2025-04-22) diff --git a/bigframes/version.py b/bigframes/version.py index b671169b24..c6ca0ee57c 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the 
specific language governing permissions and # limitations under the License. -__version__ = "2.1.0" +__version__ = "2.2.0" # {x-release-please-start-date} -__release_date__ = "2025-04-22" +__release_date__ = "2025-04-30" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index b671169b24..c6ca0ee57c 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.1.0" +__version__ = "2.2.0" # {x-release-please-start-date} -__release_date__ = "2025-04-22" +__release_date__ = "2025-04-30" # {x-release-please-end} From 42edce4659a1c3a582f530cca07f13d429ce7fe9 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 1 May 2025 19:59:34 +0000 Subject: [PATCH 4/9] create a different stats report for reading gbq tables --- bigframes/core/blocks.py | 4 +- bigframes/pandas/io/api.py | 34 ++++++++++- bigframes/session/__init__.py | 56 ++++++++++++------ .../session/{dry_run_jobs.py => dry_runs.py} | 50 ++++++++++++---- bigframes/session/loader.py | 58 +++++++++++++++++-- tests/system/small/test_session.py | 43 ++++++++++++-- 6 files changed, 203 insertions(+), 42 deletions(-) rename bigframes/session/{dry_run_jobs.py => dry_runs.py} (67%) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0c378e7054..304ae4e9bd 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -58,7 +58,7 @@ import bigframes.exceptions as bfe import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops -from bigframes.session import dry_run_jobs +from bigframes.session import dry_runs # Type constraint for wherever column labels are used Label = typing.Hashable @@ -819,7 +819,7 @@ def _compute_dry_run( for col in self.column_labels } return ( - dry_run_jobs.get_stats_with_dtypes( + dry_runs.get_query_stats_with_dtypes( query_job, column_dtypes, self.index.dtypes ), query_job, diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 91229f1528..ecf8a59bb7 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -305,6 +305,36 @@ def read_gbq_query( read_gbq_query.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_query) +@overload +def read_gbq_table( # type: ignore[overload-overlap] + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + max_results: Optional[int] = ..., + filters: vendored_pandas_gbq.FiltersType = ..., + use_cache: bool = ..., + col_order: Iterable[str] = ..., + dry_run: Literal[False] = ..., +) -> bigframes.dataframe.DataFrame: + ... + + +@overload +def read_gbq_table( + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + max_results: Optional[int] = ..., + filters: vendored_pandas_gbq.FiltersType = ..., + use_cache: bool = ..., + col_order: Iterable[str] = ..., + dry_run: Literal[True] = ..., +) -> pandas.Series: + ... 
+ + def read_gbq_table( query: str, *, @@ -314,7 +344,8 @@ def read_gbq_table( filters: vendored_pandas_gbq.FiltersType = (), use_cache: bool = True, col_order: Iterable[str] = (), -) -> bigframes.dataframe.DataFrame: + dry_run: bool = False, +) -> bigframes.dataframe.DataFrame | pandas.Series: _set_default_session_location_if_possible(query) return global_session.with_default_session( bigframes.session.Session.read_gbq_table, @@ -325,6 +356,7 @@ def read_gbq_table( filters=filters, use_cache=use_cache, col_order=col_order, + dry_run=dry_run, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index bb667dde42..f853df4190 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -384,7 +384,7 @@ def close(self): ) @overload - def read_gbq( + def read_gbq( # type: ignore[overload-overlap] self, query_or_table: str, *, @@ -449,19 +449,6 @@ def read_gbq( filters=filters, dry_run=dry_run, ) - elif dry_run: - query = f"SELECT * FROM `{query_or_table}`" - return self._loader.read_gbq_query( - query, - index_col=index_col, - columns=columns, - configuration=configuration, - max_results=max_results, - api_name="read_gbq", - use_cache=use_cache, - filters=filters, - dry_run=True, - ) else: if configuration is not None: raise ValueError( @@ -470,7 +457,7 @@ def read_gbq( "'configuration' or use a query." ) - return self._loader.read_gbq_table( + return self._loader.read_gbq_table( # type: ignore # for dry_run overload query_or_table, index_col=index_col, columns=columns, @@ -478,6 +465,7 @@ def read_gbq( api_name="read_gbq", use_cache=use_cache if use_cache is not None else True, filters=filters, + dry_run=dry_run, ) def _register_object( @@ -489,7 +477,7 @@ def _register_object( self._objects.append(weakref.ref(object)) @overload - def read_gbq_query( + def read_gbq_query( # type: ignore[overload-overlap] self, query: str, *, @@ -610,6 +598,36 @@ def read_gbq_query( dry_run=dry_run, ) + @overload + def read_gbq_table( # type: ignore[overload-overlap] + self, + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + max_results: Optional[int] = ..., + filters: third_party_pandas_gbq.FiltersType = ..., + use_cache: bool = ..., + col_order: Iterable[str] = ..., + dry_run: Literal[False] = ..., + ) -> dataframe.DataFrame: + ... + + @overload + def read_gbq_table( + self, + query: str, + *, + index_col: Iterable[str] | str | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + max_results: Optional[int] = ..., + filters: third_party_pandas_gbq.FiltersType = ..., + use_cache: bool = ..., + col_order: Iterable[str] = ..., + dry_run: Literal[True] = ..., + ) -> pandas.Series: + ... + def read_gbq_table( self, query: str, @@ -620,7 +638,8 @@ def read_gbq_table( filters: third_party_pandas_gbq.FiltersType = (), use_cache: bool = True, col_order: Iterable[str] = (), - ) -> dataframe.DataFrame: + dry_run: bool = False, + ) -> dataframe.DataFrame | pandas.Series: """Turn a BigQuery table into a DataFrame. 
**Examples:** @@ -651,7 +670,7 @@ def read_gbq_table( elif col_order: columns = col_order - return self._loader.read_gbq_table( + return self._loader.read_gbq_table( # type: ignore # for dry_run overload table_id=query, index_col=index_col, columns=columns, @@ -659,6 +678,7 @@ def read_gbq_table( api_name="read_gbq_table", use_cache=use_cache, filters=filters, + dry_run=dry_run, ) def read_gbq_table_streaming( diff --git a/bigframes/session/dry_run_jobs.py b/bigframes/session/dry_runs.py similarity index 67% rename from bigframes/session/dry_run_jobs.py rename to bigframes/session/dry_runs.py index fc7866c3d2..03a588dc35 100644 --- a/bigframes/session/dry_run_jobs.py +++ b/bigframes/session/dry_runs.py @@ -14,7 +14,7 @@ from __future__ import annotations import copy -from typing import Dict, Sequence +from typing import Any, Dict, List, Sequence from google.cloud import bigquery import pandas @@ -22,14 +22,42 @@ from bigframes import dtypes -def get_stats_with_inferred_dtypes( +def get_table_stats(table: bigquery.Table) -> pandas.Series: + values: List[Any] = [] + index: List[Any] = [] + + # Indicate that no query is executed. + index.append("isQuery") + values.append(False) + + # Populate column and index types + col_dtypes = dtypes.bf_type_from_type_kind(table.schema) + index.append("tableColumnCount") + values.append(len(col_dtypes)) + index.append("tableColumnTypes") + values.append(col_dtypes) + + for key in ("numBytes", "numRows", "location", "type"): + index.append(key) + values.append(table._properties[key]) + + index.append("creationTime") + values.append(table.created) + + index.append("lastModifidTime") + values.append(table.modified) + + return pandas.Series(values, index=index) + + +def get_query_stats_with_inferred_dtypes( query_job: bigquery.QueryJob, value_cols: Sequence[str], - index_col: Sequence[str], + index_cols: Sequence[str], ) -> pandas.Series: if query_job.schema is None: # If the schema is not available, don't bother inferring dtypes. 
- return get_stats(query_job) + return get_query_stats(query_job) col_dtypes = dtypes.bf_type_from_type_kind(query_job.schema) @@ -40,15 +68,17 @@ def get_stats_with_inferred_dtypes( else: # Use every column that is not mentioned as an index column value_col_dtypes = { - col: dtype for col, dtype in col_dtypes.items() if col not in set(index_col) + col: dtype + for col, dtype in col_dtypes.items() + if col not in set(index_cols) } - index_dtypes = [col_dtypes[col] for col in index_col] + index_dtypes = [col_dtypes[col] for col in index_cols] - return get_stats_with_dtypes(query_job, value_col_dtypes, index_dtypes) + return get_query_stats_with_dtypes(query_job, value_col_dtypes, index_dtypes) -def get_stats_with_dtypes( +def get_query_stats_with_dtypes( query_job: bigquery.QueryJob, column_dtypes: Dict[str, dtypes.Dtype], index_dtypes: Sequence[dtypes.Dtype], @@ -58,10 +88,10 @@ def get_stats_with_dtypes( s = pandas.Series(values, index=index) - return pandas.concat([s, get_stats(query_job)]) + return pandas.concat([s, get_query_stats(query_job)]) -def get_stats( +def get_query_stats( query_job: bigquery.QueryJob, ) -> pandas.Series: """Returns important stats from the query job as a Pandas Series.""" diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 85f66efd54..34f7521835 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -50,7 +50,7 @@ import bigframes.core.schema as schemata import bigframes.dtypes import bigframes.formatting_helpers as formatting_helpers -from bigframes.session import dry_run_jobs +from bigframes.session import dry_runs import bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table import bigframes.session.metrics @@ -348,6 +348,48 @@ def _start_generic_job(self, job: formatting_helpers.GenericJob): else: job.result() + @overload + def read_gbq_table( # type: ignore[overload-overlap] + self, + table_id: str, + *, + index_col: Iterable[str] + | str + | Iterable[int] + | int + | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + names: Optional[Iterable[str]] = ..., + max_results: Optional[int] = ..., + api_name: str = ..., + use_cache: bool = ..., + filters: third_party_pandas_gbq.FiltersType = ..., + enable_snapshot: bool = ..., + dry_run: Literal[False] = ..., + ) -> dataframe.DataFrame: + ... + + @overload + def read_gbq_table( + self, + table_id: str, + *, + index_col: Iterable[str] + | str + | Iterable[int] + | int + | bigframes.enums.DefaultIndexKind = ..., + columns: Iterable[str] = ..., + names: Optional[Iterable[str]] = ..., + max_results: Optional[int] = ..., + api_name: str = ..., + use_cache: bool = ..., + filters: third_party_pandas_gbq.FiltersType = ..., + enable_snapshot: bool = ..., + dry_run: Literal[True] = ..., + ) -> pandas.Series: + ... 
+ def read_gbq_table( self, table_id: str, @@ -364,7 +406,8 @@ def read_gbq_table( use_cache: bool = True, filters: third_party_pandas_gbq.FiltersType = (), enable_snapshot: bool = True, - ) -> dataframe.DataFrame: + dry_run: bool = False, + ) -> dataframe.DataFrame | pandas.Series: import bigframes._tools.strings import bigframes.dataframe as dataframe @@ -490,15 +533,18 @@ def read_gbq_table( time_travel_timestamp=None, ) - return self.read_gbq_query( + return self.read_gbq_query( # type: ignore # for dry_run overload query, index_col=index_cols, columns=columns, api_name=api_name, use_cache=use_cache, - dry_run=False, + dry_run=dry_run, ) + if dry_run: + return dry_runs.get_table_stats(table) + # ----------------------------------------- # Validate table access and features # ----------------------------------------- @@ -650,7 +696,7 @@ def load_file( return table_id @overload - def read_gbq_query( + def read_gbq_query( # type: ignore[overload-overlap] self, query: str, *, @@ -746,7 +792,7 @@ def read_gbq_query( ) job_config.dry_run = True query_job = self._bqclient.query(query, job_config=job_config) - return dry_run_jobs.get_stats_with_inferred_dtypes( + return dry_runs.get_query_stats_with_inferred_dtypes( query_job, list(columns), index_cols ) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 2922d29790..0a4a461734 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1837,7 +1837,7 @@ def test_read_gbq_with_table_ref_dry_run(scalars_table_id, session): result = session.read_gbq(scalars_table_id, dry_run=True) assert isinstance(result, pd.Series) - _assert_dry_run_stats_are_valid(result) + _assert_table_dry_run_stats_are_valid(result) def test_read_gbq_with_query_dry_run(scalars_table_id, session): @@ -1845,7 +1845,7 @@ def test_read_gbq_with_query_dry_run(scalars_table_id, session): result = session.read_gbq(query, dry_run=True) assert isinstance(result, pd.Series) - _assert_dry_run_stats_are_valid(result) + _assert_query_dry_run_stats_are_valid(result) def test_read_gbq_dry_run_with_column_and_index(scalars_table_id, session): @@ -1855,7 +1855,7 @@ def test_read_gbq_dry_run_with_column_and_index(scalars_table_id, session): ) assert isinstance(result, pd.Series) - _assert_dry_run_stats_are_valid(result) + _assert_query_dry_run_stats_are_valid(result) assert result["columnCount"] == 2 assert result["columnDtypes"] == { "int64_col": pd.Int64Dtype(), @@ -1865,15 +1865,29 @@ def test_read_gbq_dry_run_with_column_and_index(scalars_table_id, session): assert result["indexDtypes"] == [pd.Int64Dtype()] +def test_read_gbq_table_dry_run(scalars_table_id, session): + result = session.read_gbq_table(scalars_table_id, dry_run=True) + + assert isinstance(result, pd.Series) + _assert_table_dry_run_stats_are_valid(result) + + +def test_read_gbq_table_dry_run_with_max_results(scalars_table_id, session): + result = session.read_gbq_table(scalars_table_id, dry_run=True, max_results=100) + + assert isinstance(result, pd.Series) + _assert_query_dry_run_stats_are_valid(result) + + def test_read_gbq_query_dry_run(scalars_table_id, session): query = f"SELECT * FROM {scalars_table_id} LIMIT 10;" result = session.read_gbq_query(query, dry_run=True) assert isinstance(result, pd.Series) - _assert_dry_run_stats_are_valid(result) + _assert_query_dry_run_stats_are_valid(result) -def _assert_dry_run_stats_are_valid(result: pd.Series): +def _assert_query_dry_run_stats_are_valid(result: pd.Series): expected_index = pd.Index( [ 
"columnCount", @@ -1895,3 +1909,22 @@ def _assert_dry_run_stats_are_valid(result: pd.Series): pd.testing.assert_index_equal(result.index, expected_index) assert result["columnCount"] + result["indexLevel"] > 0 + + +def _assert_table_dry_run_stats_are_valid(result: pd.Series): + expected_index = pd.Index( + [ + "isQuery", + "tableColumnCount", + "tableColumnTypes", + "numBytes", + "numRows", + "location", + "type", + "creationTime", + "lastModifidTime", + ] + ) + + pd.testing.assert_index_equal(result.index, expected_index) + assert result["tableColumnCount"] == len(result["tableColumnTypes"]) From 4d6a59ab5a42ae7b810d16135b9d2b2a14bd610c Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 1 May 2025 20:01:24 +0000 Subject: [PATCH 5/9] fix lint --- bigframes/session/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 34f7521835..9a16f5d22e 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -696,7 +696,7 @@ def load_file( return table_id @overload - def read_gbq_query( # type: ignore[overload-overlap] + def read_gbq_query( # type: ignore[overload-overlap] self, query: str, *, From 36247957c01480a19170e3c0c88fd6aef5847dd5 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Thu, 1 May 2025 20:02:24 +0000 Subject: [PATCH 6/9] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/session/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 34f7521835..9a16f5d22e 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -696,7 +696,7 @@ def load_file( return table_id @overload - def read_gbq_query( # type: ignore[overload-overlap] + def read_gbq_query( # type: ignore[overload-overlap] self, query: str, *, From 669c041a7a2be636de6b7caa54a1424bcbde82ef Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 1 May 2025 22:36:42 +0000 Subject: [PATCH 7/9] rename column count and column dtypes --- bigframes/session/dry_runs.py | 4 ++-- tests/system/small/test_session.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bigframes/session/dry_runs.py b/bigframes/session/dry_runs.py index 03a588dc35..fafe62bc5e 100644 --- a/bigframes/session/dry_runs.py +++ b/bigframes/session/dry_runs.py @@ -32,9 +32,9 @@ def get_table_stats(table: bigquery.Table) -> pandas.Series: # Populate column and index types col_dtypes = dtypes.bf_type_from_type_kind(table.schema) - index.append("tableColumnCount") + index.append("columnCount") values.append(len(col_dtypes)) - index.append("tableColumnTypes") + index.append("columnDtypes") values.append(col_dtypes) for key in ("numBytes", "numRows", "location", "type"): diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 0a4a461734..bc70a34023 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1915,8 +1915,8 @@ def _assert_table_dry_run_stats_are_valid(result: pd.Series): expected_index = pd.Index( [ "isQuery", - "tableColumnCount", - "tableColumnTypes", + "columnCount", + "columnDtypes", "numBytes", "numRows", "location", @@ -1927,4 +1927,4 @@ def _assert_table_dry_run_stats_are_valid(result: pd.Series): ) 
pd.testing.assert_index_equal(result.index, expected_index) - assert result["tableColumnCount"] == len(result["tableColumnTypes"]) + assert result["columnCount"] == len(result["columnDtypes"]) From 7dc90d966bb01c47b57bdbb20a6299b91e890394 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 1 May 2025 22:41:53 +0000 Subject: [PATCH 8/9] fix typo --- bigframes/session/dry_runs.py | 2 +- tests/system/small/test_session.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/session/dry_runs.py b/bigframes/session/dry_runs.py index fafe62bc5e..4d5b41345e 100644 --- a/bigframes/session/dry_runs.py +++ b/bigframes/session/dry_runs.py @@ -44,7 +44,7 @@ def get_table_stats(table: bigquery.Table) -> pandas.Series: index.append("creationTime") values.append(table.created) - index.append("lastModifidTime") + index.append("lastModifiedTime") values.append(table.modified) return pandas.Series(values, index=index) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index bc70a34023..ad01a95509 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1922,7 +1922,7 @@ def _assert_table_dry_run_stats_are_valid(result: pd.Series): "location", "type", "creationTime", - "lastModifidTime", + "lastModifiedTime", ] ) From 80e08bfcb579b5b4f94cb5a521bd0644ba083b88 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 1 May 2025 22:45:26 +0000 Subject: [PATCH 9/9] format code --- bigframes/core/blocks.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index ef7d62e543..6426b7b22b 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -819,12 +819,11 @@ def _compute_dry_run( col: self.expr.get_column_type(self.resolve_label_exact_or_error(col)) for col in self.column_labels } - return ( - dry_runs.get_query_stats_with_dtypes( - query_job, column_dtypes, self.index.dtypes - ), - query_job, + + dry_run_stats = dry_runs.get_query_stats_with_dtypes( + query_job, column_dtypes, self.index.dtypes ) + return dry_run_stats, query_job def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None): expr = self._expr
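
A minimal usage sketch of the dry_run parameter introduced by this series, assuming the feature as merged through PATCH 9/9; the public table below is an illustrative placeholder, not something referenced in the patches:

    import bigframes.pandas as bpd

    # Dry-running a table read returns a pandas.Series of table metadata
    # (isQuery=False, columnCount, columnDtypes, numBytes, numRows, ...)
    # instead of a bigframes DataFrame. No BigQuery job is executed.
    table_stats = bpd.read_gbq_table(
        "bigquery-public-data.usa_names.usa_1910_2013", dry_run=True
    )
    print(table_stats["numRows"], table_stats["columnDtypes"])

    # Dry-running a query submits a BigQuery dry-run job (no bytes are
    # billed) and returns planner stats such as totalBytesProcessed,
    # plus the inferred column and index dtypes.
    query_stats = bpd.read_gbq_query(
        "SELECT name, number FROM `bigquery-public-data.usa_names.usa_1910_2013`",
        dry_run=True,
    )
    print(query_stats["totalBytesProcessed"], query_stats["columnDtypes"])

Returning a plain pandas.Series keeps the result local and cheap: the table path reads only table metadata (hence isQuery=False in the stats), while the query path sets job_config.dry_run = True so BigQuery plans the query without executing it, as the Literal[True]/Literal[False] overloads added in these patches encode in the return types.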