From 8fc051f21fa3f575e0a1bd8c44968077e54fb799 Mon Sep 17 00:00:00 2001 From: jialuoo Date: Tue, 7 Oct 2025 11:42:55 -0700 Subject: [PATCH 01/22] fix!: Address the series input type issue in bigframes functions (#2123) * fix!: Address the series input type issue in bigframes functions * fix the tests * fix doctest * fix error message --- bigframes/functions/_function_session.py | 12 ++++++- bigframes/functions/function_template.py | 18 ++++++++--- bigframes/session/__init__.py | 3 +- .../large/functions/test_managed_function.py | 31 ++++++++++++------- .../large/functions/test_remote_function.py | 28 +++++++++++------ .../small/functions/test_remote_function.py | 13 ++++---- tests/unit/functions/test_remote_function.py | 19 ++---------- 7 files changed, 72 insertions(+), 52 deletions(-) diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 9a38ef1957..a456f05417 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -983,7 +983,17 @@ def _convert_row_processor_sig( if len(signature.parameters) >= 1: first_param = next(iter(signature.parameters.values())) param_type = first_param.annotation - if (param_type == bf_series.Series) or (param_type == pandas.Series): + # Type hints for Series inputs should use pandas.Series because the + # underlying serialization process converts the input to a string + # representation of a pandas Series (not bigframes Series). Using + # bigframes Series will lead to TypeError when creating the function + # remotely. See more from b/445182819. + if param_type == bf_series.Series: + raise bf_formatting.create_exception_with_feedback_link( + TypeError, + "Argument type hint must be Pandas Series, not BigFrames Series.", + ) + if param_type == pandas.Series: msg = bfe.format_message("input_types=Series is in preview.") warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning) return signature.replace( diff --git a/bigframes/functions/function_template.py b/bigframes/functions/function_template.py index dd31de7243..a3680a7a88 100644 --- a/bigframes/functions/function_template.py +++ b/bigframes/functions/function_template.py @@ -363,8 +363,16 @@ def generate_managed_function_code( return {udf_name}(*args)""" ) - udf_code_block = textwrap.dedent( - f"{udf_code}\n{func_code}\n{bigframes_handler_code}" - ) - - return udf_code_block + udf_code_block = [] + if not capture_references and is_row_processor: + # Enable postponed evaluation of type annotations. This converts all + # type hints to strings at runtime, which is necessary for correctly + # handling the type annotation of pandas.Series after the UDF code is + # serialized for remote execution. See more from b/445182819. + udf_code_block.append("from __future__ import annotations") + + udf_code_block.append(udf_code) + udf_code_block.append(func_code) + udf_code_block.append(bigframes_handler_code) + + return textwrap.dedent("\n".join(udf_code_block)) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index f0cec864b4..df0afb4c8d 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -2064,8 +2064,9 @@ def read_gbq_function( note, row processor implies that the function has only one input parameter. + >>> import pandas as pd >>> @bpd.remote_function(cloud_function_service_account="default") - ... def row_sum(s: bpd.Series) -> float: + ... def row_sum(s: pd.Series) -> float: ... 
return s['a'] + s['b'] + s['c'] >>> row_sum_ref = bpd.read_gbq_function( diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index e74bc8579f..732123ec84 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -701,8 +701,19 @@ def serialize_row(row): } ) + with pytest.raises( + TypeError, + match="Argument type hint must be Pandas Series, not BigFrames Series.", + ): + serialize_row_mf = session.udf( + input_types=bigframes.series.Series, + output_type=str, + dataset=dataset_id, + name=prefixer.create_prefix(), + )(serialize_row) + serialize_row_mf = session.udf( - input_types=bigframes.series.Series, + input_types=pandas.Series, output_type=str, dataset=dataset_id, name=prefixer.create_prefix(), @@ -762,7 +773,7 @@ def analyze(row): ): analyze_mf = session.udf( - input_types=bigframes.series.Series, + input_types=pandas.Series, output_type=str, dataset=dataset_id, name=prefixer.create_prefix(), @@ -876,7 +887,7 @@ def serialize_row(row): ) serialize_row_mf = session.udf( - input_types=bigframes.series.Series, + input_types=pandas.Series, output_type=str, dataset=dataset_id, name=prefixer.create_prefix(), @@ -926,7 +937,7 @@ def test_managed_function_df_apply_axis_1_na_nan_inf(dataset_id, session): try: - def float_parser(row): + def float_parser(row: pandas.Series): import numpy as mynp import pandas as mypd @@ -937,7 +948,7 @@ def float_parser(row): return float(row["text"]) float_parser_mf = session.udf( - input_types=bigframes.series.Series, + input_types=pandas.Series, output_type=float, dataset=dataset_id, name=prefixer.create_prefix(), @@ -1027,7 +1038,7 @@ def test_managed_function_df_apply_axis_1_series_args(session, dataset_id, scala try: - def analyze(s, x, y): + def analyze(s: pandas.Series, x: bool, y: float) -> str: value = f"value is {s['int64_col']} and {s['float64_col']}" if x: return f"{value}, x is True!" @@ -1036,8 +1047,6 @@ def analyze(s, x, y): return f"{value}, x is False, y is non-positive!" 
analyze_mf = session.udf( - input_types=[bigframes.series.Series, bool, float], - output_type=str, dataset=dataset_id, name=prefixer.create_prefix(), )(analyze) @@ -1151,7 +1160,7 @@ def is_sum_positive_series(s): return s["int64_col"] + s["int64_too"] > 0 is_sum_positive_series_mf = session.udf( - input_types=bigframes.series.Series, + input_types=pandas.Series, output_type=bool, dataset=dataset_id, name=prefixer.create_prefix(), @@ -1217,12 +1226,10 @@ def func_for_other(x): def test_managed_function_df_where_other_issue(session, dataset_id, scalars_df_index): try: - def the_sum(s): + def the_sum(s: pandas.Series) -> int: return s["int64_col"] + s["int64_too"] the_sum_mf = session.udf( - input_types=bigframes.series.Series, - output_type=int, dataset=dataset_id, name=prefixer.create_prefix(), )(the_sum) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 55643d9a60..00b1b5f1f0 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -1722,7 +1722,7 @@ def serialize_row(row): ) serialize_row_remote = session.remote_function( - input_types=bigframes.series.Series, + input_types=pandas.Series, output_type=str, reuse=False, cloud_function_service_account="default", @@ -1771,7 +1771,7 @@ def analyze(row): ) analyze_remote = session.remote_function( - input_types=bigframes.series.Series, + input_types=pandas.Series, output_type=str, reuse=False, cloud_function_service_account="default", @@ -1895,7 +1895,7 @@ def serialize_row(row): ) serialize_row_remote = session.remote_function( - input_types=bigframes.series.Series, + input_types=pandas.Series, output_type=str, reuse=False, cloud_function_service_account="default", @@ -1944,7 +1944,7 @@ def test_df_apply_axis_1_na_nan_inf(session): try: - def float_parser(row): + def float_parser(row: pandas.Series): import numpy as mynp import pandas as mypd @@ -1955,7 +1955,6 @@ def float_parser(row): return float(row["text"]) float_parser_remote = session.remote_function( - input_types=bigframes.series.Series, output_type=float, reuse=False, cloud_function_service_account="default", @@ -2055,12 +2054,12 @@ def test_df_apply_axis_1_series_args(session, scalars_dfs): try: @session.remote_function( - input_types=[bigframes.series.Series, float, str, bool], + input_types=[pandas.Series, float, str, bool], output_type=list[str], reuse=False, cloud_function_service_account="default", ) - def foo_list(x, y0: float, y1, y2) -> list[str]: + def foo_list(x: pandas.Series, y0: float, y1, y2) -> list[str]: return ( [str(x["int64_col"]), str(y0), str(y1), str(y2)] if y2 @@ -3087,12 +3086,21 @@ def test_remote_function_df_where_mask_series(session, dataset_id, scalars_dfs): try: # The return type has to be bool type for callable where condition. 
- def is_sum_positive_series(s): + def is_sum_positive_series(s: pandas.Series) -> bool: return s["int64_col"] + s["int64_too"] > 0 + with pytest.raises( + TypeError, + match="Argument type hint must be Pandas Series, not BigFrames Series.", + ): + session.remote_function( + input_types=bigframes.series.Series, + dataset=dataset_id, + reuse=False, + cloud_function_service_account="default", + )(is_sum_positive_series) + is_sum_positive_series_mf = session.remote_function( - input_types=bigframes.series.Series, - output_type=bool, dataset=dataset_id, reuse=False, cloud_function_service_account="default", diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 28fab19144..15070a3a29 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -20,6 +20,7 @@ import bigframes_vendored.constants as constants import google.api_core.exceptions from google.cloud import bigquery +import pandas import pandas as pd import pyarrow import pytest @@ -1166,7 +1167,7 @@ def test_df_apply_axis_1(session, scalars_dfs, dataset_id_permanent): ] scalars_df, scalars_pandas_df = scalars_dfs - def add_ints(row): + def add_ints(row: pandas.Series) -> int: return row["int64_col"] + row["int64_too"] with pytest.warns( @@ -1174,8 +1175,6 @@ def add_ints(row): match="input_types=Series is in preview.", ): add_ints_remote = session.remote_function( - input_types=bigframes.series.Series, - output_type=int, dataset=dataset_id_permanent, name=get_function_name(add_ints, is_row_processor=True), cloud_function_service_account="default", @@ -1223,11 +1222,11 @@ def test_df_apply_axis_1_ordering(session, scalars_dfs, dataset_id_permanent): ordering_columns = ["bool_col", "int64_col"] scalars_df, scalars_pandas_df = scalars_dfs - def add_ints(row): + def add_ints(row: pandas.Series) -> int: return row["int64_col"] + row["int64_too"] add_ints_remote = session.remote_function( - input_types=bigframes.series.Series, + input_types=pandas.Series, output_type=int, dataset=dataset_id_permanent, name=get_function_name(add_ints, is_row_processor=True), @@ -1267,7 +1266,7 @@ def add_numbers(row): return row["x"] + row["y"] add_numbers_remote = session.remote_function( - input_types=bigframes.series.Series, + input_types=pandas.Series, output_type=float, dataset=dataset_id_permanent, name=get_function_name(add_numbers, is_row_processor=True), @@ -1321,7 +1320,7 @@ def echo_len(row): return len(row) echo_len_remote = session.remote_function( - input_types=bigframes.series.Series, + input_types=pandas.Series, output_type=float, dataset=dataset_id_permanent, name=get_function_name(echo_len, is_row_processor=True), diff --git a/tests/unit/functions/test_remote_function.py b/tests/unit/functions/test_remote_function.py index ea09ac59d3..e9e0d0df67 100644 --- a/tests/unit/functions/test_remote_function.py +++ b/tests/unit/functions/test_remote_function.py @@ -17,25 +17,12 @@ import pandas import pytest +import bigframes.exceptions import bigframes.functions.function as bff -import bigframes.series from bigframes.testing import mocks -@pytest.mark.parametrize( - "series_type", - ( - pytest.param( - pandas.Series, - id="pandas.Series", - ), - pytest.param( - bigframes.series.Series, - id="bigframes.series.Series", - ), - ), -) -def test_series_input_types_to_str(series_type): +def test_series_input_types_to_str(): """Check that is_row_processor=True uses str as the input type to serialize a row.""" session = 
mocks.create_bigquery_session() remote_function_decorator = bff.remote_function( @@ -48,7 +35,7 @@ def test_series_input_types_to_str(series_type): ): @remote_function_decorator - def axis_1_function(myparam: series_type) -> str: # type: ignore + def axis_1_function(myparam: pandas.Series) -> str: # type: ignore return "Hello, " + myparam["str_col"] + "!" # type: ignore # Still works as a normal function. From fa4e46f4eed6b381f497f9f1043e5c8aa6491297 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Tue, 7 Oct 2025 12:19:41 -0700 Subject: [PATCH 02/22] refactor: make sqlglot.from_bf_dtype() a top-level function (#2144) --- .../sqlglot/expressions/generic_ops.py | 4 +- bigframes/core/compile/sqlglot/sqlglot_ir.py | 4 +- .../core/compile/sqlglot/sqlglot_types.py | 109 +++++++++--------- .../compile/sqlglot/test_sqlglot_types.py | 34 +++--- 4 files changed, 73 insertions(+), 78 deletions(-) diff --git a/bigframes/core/compile/sqlglot/expressions/generic_ops.py b/bigframes/core/compile/sqlglot/expressions/generic_ops.py index 6a3825309c..af3b57f77b 100644 --- a/bigframes/core/compile/sqlglot/expressions/generic_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/generic_ops.py @@ -18,9 +18,9 @@ from bigframes import dtypes from bigframes import operations as ops +from bigframes.core.compile.sqlglot import sqlglot_types from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler -from bigframes.core.compile.sqlglot.sqlglot_types import SQLGlotType register_unary_op = scalar_compiler.scalar_op_compiler.register_unary_op @@ -29,7 +29,7 @@ def _(expr: TypedExpr, op: ops.AsTypeOp) -> sge.Expression: from_type = expr.dtype to_type = op.to_type - sg_to_type = SQLGlotType.from_bigframes_dtype(to_type) + sg_to_type = sqlglot_types.from_bigframes_dtype(to_type) sg_expr = expr.expr if to_type == dtypes.JSON_DTYPE: diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 98dbed4cdd..c7ee13f4e8 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -79,7 +79,7 @@ def from_pyarrow( expressions=[ sge.ColumnDef( this=sge.to_identifier(field.column, quoted=True), - kind=sgt.SQLGlotType.from_bigframes_dtype(field.dtype), + kind=sgt.from_bigframes_dtype(field.dtype), ) for field in schema.items ], @@ -620,7 +620,7 @@ def _select_to_cte(expr: sge.Select, cte_name: sge.Identifier) -> sge.Select: def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression: - sqlglot_type = sgt.SQLGlotType.from_bigframes_dtype(dtype) + sqlglot_type = sgt.from_bigframes_dtype(dtype) if value is None: return _cast(sge.Null(), sqlglot_type) elif dtype == dtypes.BYTES_DTYPE: diff --git a/bigframes/core/compile/sqlglot/sqlglot_types.py b/bigframes/core/compile/sqlglot/sqlglot_types.py index 5b0f70077d..64e4363ddf 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_types.py +++ b/bigframes/core/compile/sqlglot/sqlglot_types.py @@ -25,62 +25,57 @@ import bigframes.dtypes -class SQLGlotType: - @classmethod - def from_bigframes_dtype( - cls, - bigframes_dtype: typing.Union[ - bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[typing.Any] - ], - ) -> str: - if bigframes_dtype == bigframes.dtypes.INT_DTYPE: - return "INT64" - elif bigframes_dtype == bigframes.dtypes.FLOAT_DTYPE: - return "FLOAT64" - elif bigframes_dtype == bigframes.dtypes.STRING_DTYPE: - return "STRING" - elif bigframes_dtype == 
bigframes.dtypes.BOOL_DTYPE: - return "BOOLEAN" - elif bigframes_dtype == bigframes.dtypes.DATE_DTYPE: - return "DATE" - elif bigframes_dtype == bigframes.dtypes.TIME_DTYPE: - return "TIME" - elif bigframes_dtype == bigframes.dtypes.DATETIME_DTYPE: - return "DATETIME" - elif bigframes_dtype == bigframes.dtypes.TIMESTAMP_DTYPE: - return "TIMESTAMP" - elif bigframes_dtype == bigframes.dtypes.BYTES_DTYPE: - return "BYTES" - elif bigframes_dtype == bigframes.dtypes.NUMERIC_DTYPE: - return "NUMERIC" - elif bigframes_dtype == bigframes.dtypes.BIGNUMERIC_DTYPE: - return "BIGNUMERIC" - elif bigframes_dtype == bigframes.dtypes.JSON_DTYPE: - return "JSON" - elif bigframes_dtype == bigframes.dtypes.GEO_DTYPE: - return "GEOGRAPHY" - elif bigframes_dtype == bigframes.dtypes.TIMEDELTA_DTYPE: - return "INT64" - elif isinstance(bigframes_dtype, pd.ArrowDtype): - if pa.types.is_list(bigframes_dtype.pyarrow_dtype): - inner_bigframes_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( - bigframes_dtype.pyarrow_dtype.value_type +def from_bigframes_dtype( + bigframes_dtype: typing.Union[ + bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[typing.Any] + ], +) -> str: + if bigframes_dtype == bigframes.dtypes.INT_DTYPE: + return "INT64" + elif bigframes_dtype == bigframes.dtypes.FLOAT_DTYPE: + return "FLOAT64" + elif bigframes_dtype == bigframes.dtypes.STRING_DTYPE: + return "STRING" + elif bigframes_dtype == bigframes.dtypes.BOOL_DTYPE: + return "BOOLEAN" + elif bigframes_dtype == bigframes.dtypes.DATE_DTYPE: + return "DATE" + elif bigframes_dtype == bigframes.dtypes.TIME_DTYPE: + return "TIME" + elif bigframes_dtype == bigframes.dtypes.DATETIME_DTYPE: + return "DATETIME" + elif bigframes_dtype == bigframes.dtypes.TIMESTAMP_DTYPE: + return "TIMESTAMP" + elif bigframes_dtype == bigframes.dtypes.BYTES_DTYPE: + return "BYTES" + elif bigframes_dtype == bigframes.dtypes.NUMERIC_DTYPE: + return "NUMERIC" + elif bigframes_dtype == bigframes.dtypes.BIGNUMERIC_DTYPE: + return "BIGNUMERIC" + elif bigframes_dtype == bigframes.dtypes.JSON_DTYPE: + return "JSON" + elif bigframes_dtype == bigframes.dtypes.GEO_DTYPE: + return "GEOGRAPHY" + elif bigframes_dtype == bigframes.dtypes.TIMEDELTA_DTYPE: + return "INT64" + elif isinstance(bigframes_dtype, pd.ArrowDtype): + if pa.types.is_list(bigframes_dtype.pyarrow_dtype): + inner_bigframes_dtype = bigframes.dtypes.arrow_dtype_to_bigframes_dtype( + bigframes_dtype.pyarrow_dtype.value_type + ) + return f"ARRAY<{from_bigframes_dtype(inner_bigframes_dtype)}>" + elif pa.types.is_struct(bigframes_dtype.pyarrow_dtype): + struct_type = typing.cast(pa.StructType, bigframes_dtype.pyarrow_dtype) + inner_fields: list[str] = [] + for i in range(struct_type.num_fields): + field = struct_type.field(i) + key = sg.to_identifier(field.name).sql("bigquery") + dtype = from_bigframes_dtype( + bigframes.dtypes.arrow_dtype_to_bigframes_dtype(field.type) ) - return ( - f"ARRAY<{SQLGlotType.from_bigframes_dtype(inner_bigframes_dtype)}>" - ) - elif pa.types.is_struct(bigframes_dtype.pyarrow_dtype): - struct_type = typing.cast(pa.StructType, bigframes_dtype.pyarrow_dtype) - inner_fields: list[str] = [] - for i in range(struct_type.num_fields): - field = struct_type.field(i) - key = sg.to_identifier(field.name).sql("bigquery") - dtype = SQLGlotType.from_bigframes_dtype( - bigframes.dtypes.arrow_dtype_to_bigframes_dtype(field.type) - ) - inner_fields.append(f"{key} {dtype}") - return "STRUCT<{}>".format(", ".join(inner_fields)) + inner_fields.append(f"{key} {dtype}") + return 
"STRUCT<{}>".format(", ".join(inner_fields)) - raise ValueError( - f"Unsupported type for {bigframes_dtype}. {constants.FEEDBACK_LINK}" - ) + raise ValueError( + f"Unsupported type for {bigframes_dtype}. {constants.FEEDBACK_LINK}" + ) diff --git a/tests/unit/core/compile/sqlglot/test_sqlglot_types.py b/tests/unit/core/compile/sqlglot/test_sqlglot_types.py index a9108e5daf..5c2d84383d 100644 --- a/tests/unit/core/compile/sqlglot/test_sqlglot_types.py +++ b/tests/unit/core/compile/sqlglot/test_sqlglot_types.py @@ -20,34 +20,34 @@ def test_from_bigframes_simple_dtypes(): - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.INT_DTYPE) == "INT64" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.FLOAT_DTYPE) == "FLOAT64" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.STRING_DTYPE) == "STRING" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.BOOL_DTYPE) == "BOOLEAN" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.DATE_DTYPE) == "DATE" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.TIME_DTYPE) == "TIME" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.DATETIME_DTYPE) == "DATETIME" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.TIMESTAMP_DTYPE) == "TIMESTAMP" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.BYTES_DTYPE) == "BYTES" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.NUMERIC_DTYPE) == "NUMERIC" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.BIGNUMERIC_DTYPE) == "BIGNUMERIC" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.JSON_DTYPE) == "JSON" - assert sgt.SQLGlotType.from_bigframes_dtype(dtypes.GEO_DTYPE) == "GEOGRAPHY" + assert sgt.from_bigframes_dtype(dtypes.INT_DTYPE) == "INT64" + assert sgt.from_bigframes_dtype(dtypes.FLOAT_DTYPE) == "FLOAT64" + assert sgt.from_bigframes_dtype(dtypes.STRING_DTYPE) == "STRING" + assert sgt.from_bigframes_dtype(dtypes.BOOL_DTYPE) == "BOOLEAN" + assert sgt.from_bigframes_dtype(dtypes.DATE_DTYPE) == "DATE" + assert sgt.from_bigframes_dtype(dtypes.TIME_DTYPE) == "TIME" + assert sgt.from_bigframes_dtype(dtypes.DATETIME_DTYPE) == "DATETIME" + assert sgt.from_bigframes_dtype(dtypes.TIMESTAMP_DTYPE) == "TIMESTAMP" + assert sgt.from_bigframes_dtype(dtypes.BYTES_DTYPE) == "BYTES" + assert sgt.from_bigframes_dtype(dtypes.NUMERIC_DTYPE) == "NUMERIC" + assert sgt.from_bigframes_dtype(dtypes.BIGNUMERIC_DTYPE) == "BIGNUMERIC" + assert sgt.from_bigframes_dtype(dtypes.JSON_DTYPE) == "JSON" + assert sgt.from_bigframes_dtype(dtypes.GEO_DTYPE) == "GEOGRAPHY" def test_from_bigframes_struct_dtypes(): fields = [pa.field("int_col", pa.int64()), pa.field("bool_col", pa.bool_())] struct_type = pd.ArrowDtype(pa.struct(fields)) expected = "STRUCT" - assert sgt.SQLGlotType.from_bigframes_dtype(struct_type) == expected + assert sgt.from_bigframes_dtype(struct_type) == expected def test_from_bigframes_array_dtypes(): int_array_type = pd.ArrowDtype(pa.list_(pa.int64())) - assert sgt.SQLGlotType.from_bigframes_dtype(int_array_type) == "ARRAY" + assert sgt.from_bigframes_dtype(int_array_type) == "ARRAY" string_array_type = pd.ArrowDtype(pa.list_(pa.string())) - assert sgt.SQLGlotType.from_bigframes_dtype(string_array_type) == "ARRAY" + assert sgt.from_bigframes_dtype(string_array_type) == "ARRAY" def test_from_bigframes_multi_nested_dtypes(): @@ -61,4 +61,4 @@ def test_from_bigframes_multi_nested_dtypes(): expected = ( "ARRAY>>" ) - assert sgt.SQLGlotType.from_bigframes_dtype(array_type) == expected + assert sgt.from_bigframes_dtype(array_type) == expected From ef0b0b73843da2a93baf08e4cd5457fbb590b89c Mon Sep 17 00:00:00 2001 
From: Shenyang Cai
Date: Tue, 7 Oct 2025 13:21:45 -0700
Subject: [PATCH 03/22] feat: add output_schema parameter to ai.generate()
 (#2139)

* feat: add output_schema to ai.generate()

* fix lint

* fix lint

* fix test

* fix mypy

* fix lint

* code optimization

* fix tests

* support case-insensitive type parsing

* fix test

* fix: Fix row count local execution bug (#2133)

* fix: join on, how args are now positional (#2140)

---------

Co-authored-by: TrevorBergeron
---
 bigframes/bigquery/_operations/ai.py          | 26 ++++-
 .../ibis_compiler/scalar_op_registry.py       |  1 +
 .../compile/sqlglot/expressions/ai_ops.py     | 18 +++-
 bigframes/operations/ai_ops.py                | 10 +-
 bigframes/operations/output_schemas.py        | 90 +++++++++++++++++
 tests/system/small/bigquery/test_ai.py        | 35 +++++++
 .../out.sql                                   | 19 ++++
 .../sqlglot/expressions/test_ai_ops.py        | 21 ++++
 tests/unit/operations/test_output_schemas.py  | 99 +++++++++++++++++++
 .../ibis/expr/operations/ai_ops.py            | 21 +++-
 10 files changed, 328 insertions(+), 12 deletions(-)
 create mode 100644 bigframes/operations/output_schemas.py
 create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_with_output_schema/out.sql
 create mode 100644 tests/unit/operations/test_output_schemas.py

diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py
index a789310683..0c5eba9496 100644
--- a/bigframes/bigquery/_operations/ai.py
+++ b/bigframes/bigquery/_operations/ai.py
@@ -25,7 +25,7 @@
 
 from bigframes import clients, dtypes, series, session
 from bigframes.core import convert, log_adapter
-from bigframes.operations import ai_ops
+from bigframes.operations import ai_ops, output_schemas
 
 PROMPT_TYPE = Union[
     series.Series,
@@ -43,7 +43,7 @@ def generate(
     endpoint: str | None = None,
     request_type: Literal["dedicated", "shared", "unspecified"] = "unspecified",
     model_params: Mapping[Any, Any] | None = None,
-    # TODO(b/446974666) Add output_schema parameter
+    output_schema: Mapping[str, str] | None = None,
 ) -> series.Series:
     """
     Returns the AI analysis based on the prompt, which can be any combination of text and unstructured data.
@@ -64,6 +64,14 @@
     1                                          Ottawa\n
     Name: result, dtype: string
 
+    You get structured output when the `output_schema` parameter is set:
+
+    >>> animals = bpd.Series(["Rabbit", "Spider"])
+    >>> bbq.ai.generate(animals, output_schema={"number_of_legs": "INT64", "is_herbivore": "BOOL"})
+    0    {'is_herbivore': True, 'number_of_legs': 4, 'f...
+    1    {'is_herbivore': False, 'number_of_legs': 8, '...
+    dtype: struct<is_herbivore: bool, number_of_legs: int64, full_response: extension<dbjson<JSONArrowType>>, status: string>[pyarrow]
+
     Args:
         prompt (Series | List[str|Series] | Tuple[str|Series, ...]):
             A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series
@@ -86,10 +94,14 @@
             If requests exceed the Provisioned Throughput quota, the overflow traffic uses DSQ quota.
         model_params (Mapping[Any, Any]):
             Provides additional parameters to the model. The MODEL_PARAMS value must conform to the generateContent request body format.
+        output_schema (Mapping[str, str]):
+            A mapping value that specifies the schema of the output, in the form {field_name: data_type}. Supported data types include
+            `STRING`, `INT64`, `FLOAT64`, `BOOL`, `ARRAY`, and `STRUCT`.
 
     Returns:
         bigframes.series.Series: A new struct Series with the result data. The struct contains these fields:
             * "result": a STRING value containing the model's response to the prompt. The result is None if the request fails or is filtered by responsible AI.
+ If you specify an output schema then result is replaced by your custom schema. * "full_response": a JSON value containing the response from the projects.locations.endpoints.generateContent call to the model. The generated text is in the text element. * "status": a STRING value that contains the API response status for the corresponding row. This value is empty if the operation was successful. @@ -98,12 +110,22 @@ def generate( prompt_context, series_list = _separate_context_and_series(prompt) assert len(series_list) > 0 + if output_schema is None: + output_schema_str = None + else: + output_schema_str = ", ".join( + [f"{name} {sql_type}" for name, sql_type in output_schema.items()] + ) + # Validate user input + output_schemas.parse_sql_fields(output_schema_str) + operator = ai_ops.AIGenerate( prompt_context=tuple(prompt_context), connection_id=_resolve_connection_id(series_list[0], connection_id), endpoint=endpoint, request_type=request_type, model_params=json.dumps(model_params) if model_params else None, + output_schema=output_schema_str, ) return series_list[0]._apply_nary_op(operator, series_list[1:]) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 4c02e17d6f..e983fc7e21 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1985,6 +1985,7 @@ def ai_generate( op.endpoint, # type: ignore op.request_type.upper(), # type: ignore op.model_params, # type: ignore + op.output_schema, # type: ignore ).to_expr() diff --git a/bigframes/core/compile/sqlglot/expressions/ai_ops.py b/bigframes/core/compile/sqlglot/expressions/ai_ops.py index 4129c91906..e40173d2fd 100644 --- a/bigframes/core/compile/sqlglot/expressions/ai_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/ai_ops.py @@ -15,7 +15,6 @@ from __future__ import annotations from dataclasses import asdict -import typing import sqlglot.expressions as sge @@ -105,16 +104,16 @@ def _construct_named_args(op: ops.NaryOp) -> list[sge.Kwarg]: op_args = asdict(op) - connection_id = typing.cast(str, op_args["connection_id"]) + connection_id = op_args["connection_id"] args.append( sge.Kwarg(this="connection_id", expression=sge.Literal.string(connection_id)) ) - endpoit = typing.cast(str, op_args.get("endpoint", None)) + endpoit = op_args.get("endpoint", None) if endpoit is not None: args.append(sge.Kwarg(this="endpoint", expression=sge.Literal.string(endpoit))) - request_type = typing.cast(str, op_args.get("request_type", None)) + request_type = op_args.get("request_type", None) if request_type is not None: args.append( sge.Kwarg( @@ -122,7 +121,7 @@ def _construct_named_args(op: ops.NaryOp) -> list[sge.Kwarg]: ) ) - model_params = typing.cast(str, op_args.get("model_params", None)) + model_params = op_args.get("model_params", None) if model_params is not None: args.append( sge.Kwarg( @@ -133,4 +132,13 @@ def _construct_named_args(op: ops.NaryOp) -> list[sge.Kwarg]: ) ) + output_schema = op_args.get("output_schema", None) + if output_schema is not None: + args.append( + sge.Kwarg( + this="output_schema", + expression=sge.Literal.string(output_schema), + ) + ) + return args diff --git a/bigframes/operations/ai_ops.py b/bigframes/operations/ai_ops.py index 7ba3737ba0..ea65b705e5 100644 --- a/bigframes/operations/ai_ops.py +++ b/bigframes/operations/ai_ops.py @@ -21,7 +21,7 @@ import pyarrow as pa from bigframes import dtypes -from bigframes.operations import base_ops 
+from bigframes.operations import base_ops, output_schemas
 
 
 @dataclasses.dataclass(frozen=True)
@@ -33,12 +33,18 @@ class AIGenerate(base_ops.NaryOp):
     endpoint: str | None
     request_type: Literal["dedicated", "shared", "unspecified"]
     model_params: str | None
+    output_schema: str | None
 
     def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
+        if self.output_schema is None:
+            output_fields = (pa.field("result", pa.string()),)
+        else:
+            output_fields = output_schemas.parse_sql_fields(self.output_schema)
+
         return pd.ArrowDtype(
             pa.struct(
                 (
-                    pa.field("result", pa.string()),
+                    *output_fields,
                     pa.field("full_response", dtypes.JSON_ARROW_TYPE),
                     pa.field("status", pa.string()),
                 )
diff --git a/bigframes/operations/output_schemas.py b/bigframes/operations/output_schemas.py
new file mode 100644
index 0000000000..ff9c9883dc
--- /dev/null
+++ b/bigframes/operations/output_schemas.py
@@ -0,0 +1,90 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pyarrow as pa
+
+
+def parse_sql_type(sql: str) -> pa.DataType:
+    """
+    Parses a SQL type string to its PyArrow equivalence:
+
+    For example:
+        "STRING" -> pa.string()
+        "ARRAY<INT64>" -> pa.list_(pa.int64())
+        "STRUCT<x ARRAY<FLOAT64>, y BOOL>" -> pa.struct(
+            (
+                pa.field("x", pa.list_(pa.float64())),
+                pa.field("y", pa.bool_()),
+            )
+        )
+    """
+    sql = sql.strip()
+
+    if sql.upper() == "STRING":
+        return pa.string()
+
+    if sql.upper() == "INT64":
+        return pa.int64()
+
+    if sql.upper() == "FLOAT64":
+        return pa.float64()
+
+    if sql.upper() == "BOOL":
+        return pa.bool_()
+
+    if sql.upper().startswith("ARRAY<") and sql.endswith(">"):
+        inner_type = sql[len("ARRAY<") : -1]
+        return pa.list_(parse_sql_type(inner_type))
+
+    if sql.upper().startswith("STRUCT<") and sql.endswith(">"):
+        inner_fields = parse_sql_fields(sql[len("STRUCT<") : -1])
+        return pa.struct(inner_fields)
+
+    raise ValueError(f"Unsupported SQL type: {sql}")
+
+
+def parse_sql_fields(sql: str) -> tuple[pa.Field]:
+    sql = sql.strip()
+
+    start_idx = 0
+    nested_depth = 0
+    fields: list[pa.field] = []
+
+    for end_idx in range(len(sql)):
+        c = sql[end_idx]
+
+        if c == "<":
+            nested_depth += 1
+        elif c == ">":
+            nested_depth -= 1
+        elif c == "," and nested_depth == 0:
+            field = sql[start_idx:end_idx]
+            fields.append(parse_sql_field(field))
+            start_idx = end_idx + 1
+
+    # Append the last field
+    fields.append(parse_sql_field(sql[start_idx:]))
+
+    return tuple(sorted(fields, key=lambda f: f.name))
+
+
+def parse_sql_field(sql: str) -> pa.Field:
+    sql = sql.strip()
+
+    space_idx = sql.find(" ")
+
+    if space_idx == -1:
+        raise ValueError(f"Invalid struct field: {sql}")
+
+    return pa.field(sql[:space_idx].strip(), parse_sql_type(sql[space_idx:]))
diff --git a/tests/system/small/bigquery/test_ai.py b/tests/system/small/bigquery/test_ai.py
index 7a6e5aea4f..2ccdb01944 100644
--- a/tests/system/small/bigquery/test_ai.py
+++ b/tests/system/small/bigquery/test_ai.py
@@ -87,6 +87,41 @@ def test_ai_generate(session):
     )
 
 
+def test_ai_generate_with_output_schema(session):
+    
country = bpd.Series(["Japan", "Canada"], session=session) + prompt = ("Describe ", country) + + result = bbq.ai.generate( + prompt, + endpoint="gemini-2.5-flash", + output_schema={"population": "INT64", "is_in_north_america": "bool"}, + ) + + assert _contains_no_nulls(result) + assert result.dtype == pd.ArrowDtype( + pa.struct( + ( + pa.field("is_in_north_america", pa.bool_()), + pa.field("population", pa.int64()), + pa.field("full_response", dtypes.JSON_ARROW_TYPE), + pa.field("status", pa.string()), + ) + ) + ) + + +def test_ai_generate_with_invalid_output_schema_raise_error(session): + country = bpd.Series(["Japan", "Canada"], session=session) + prompt = ("Describe ", country) + + with pytest.raises(ValueError): + bbq.ai.generate( + prompt, + endpoint="gemini-2.5-flash", + output_schema={"population": "INT64", "is_in_north_america": "JSON"}, + ) + + def test_ai_generate_bool(session): s1 = bpd.Series(["apple", "bear"], session=session) s2 = bpd.Series(["fruit", "tree"], session=session) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_with_output_schema/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_with_output_schema/out.sql new file mode 100644 index 0000000000..62fc2f9db0 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_ai_ops/test_ai_generate_with_output_schema/out.sql @@ -0,0 +1,19 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + AI.GENERATE( + prompt => (`bfcol_0`, ' is the same as ', `bfcol_0`), + connection_id => 'bigframes-dev.us.bigframes-default-connection', + endpoint => 'gemini-2.5-flash', + request_type => 'SHARED', + output_schema => 'x INT64, y FLOAT64' + ) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `result` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_ai_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_ai_ops.py index c809e90a90..13481d88c6 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_ai_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_ai_ops.py @@ -36,6 +36,26 @@ def test_ai_generate(scalar_types_df: dataframe.DataFrame, snapshot): endpoint="gemini-2.5-flash", request_type="shared", model_params=None, + output_schema=None, + ) + + sql = utils._apply_unary_ops( + scalar_types_df, [op.as_expr(col_name, col_name)], ["result"] + ) + + snapshot.assert_match(sql, "out.sql") + + +def test_ai_generate_with_output_schema(scalar_types_df: dataframe.DataFrame, snapshot): + col_name = "string_col" + + op = ops.AIGenerate( + prompt_context=(None, " is the same as ", None), + connection_id=CONNECTION_ID, + endpoint="gemini-2.5-flash", + request_type="shared", + model_params=None, + output_schema="x INT64, y FLOAT64", ) sql = utils._apply_unary_ops( @@ -59,6 +79,7 @@ def test_ai_generate_with_model_param(scalar_types_df: dataframe.DataFrame, snap endpoint=None, request_type="shared", model_params=json.dumps(dict()), + output_schema=None, ) sql = utils._apply_unary_ops( diff --git a/tests/unit/operations/test_output_schemas.py b/tests/unit/operations/test_output_schemas.py new file mode 100644 index 0000000000..c609098c98 --- /dev/null +++ b/tests/unit/operations/test_output_schemas.py @@ -0,0 +1,99 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with 
the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pyarrow as pa
+import pytest
+
+from bigframes.operations import output_schemas
+
+
+@pytest.mark.parametrize(
+    ("sql", "expected"),
+    [
+        ("INT64", pa.int64()),
+        (" INT64 ", pa.int64()),
+        ("int64", pa.int64()),
+        ("FLOAT64", pa.float64()),
+        ("STRING", pa.string()),
+        ("BOOL", pa.bool_()),
+        ("ARRAY<INT64>", pa.list_(pa.int64())),
+        (
+            "STRUCT<x INT64, y FLOAT64>",
+            pa.struct((pa.field("x", pa.int64()), pa.field("y", pa.float64()))),
+        ),
+        (
+            "STRUCT< x INT64, y FLOAT64>",
+            pa.struct((pa.field("x", pa.int64()), pa.field("y", pa.float64()))),
+        ),
+        (
+            "STRUCT<y INT64, x FLOAT64>",
+            pa.struct((pa.field("x", pa.float64()), pa.field("y", pa.int64()))),
+        ),
+        (
+            "ARRAY<STRUCT<x INT64, y INT64>>",
+            pa.list_(pa.struct((pa.field("x", pa.int64()), pa.field("y", pa.int64())))),
+        ),
+        (
+            "STRUCT<y STRUCT<a BOOL, b STRING>, x ARRAY<FLOAT64>>",
+            pa.struct(
+                (
+                    pa.field("x", pa.list_(pa.float64())),
+                    pa.field(
+                        "y",
+                        pa.struct(
+                            (pa.field("a", pa.bool_()), pa.field("b", pa.string()))
+                        ),
+                    ),
+                )
+            ),
+        ),
+    ],
+)
+def test_parse_sql_to_pyarrow_dtype(sql, expected):
+    assert output_schemas.parse_sql_type(sql) == expected
+
+
+@pytest.mark.parametrize(
+    "sql",
+    [
+        "a INT64",
+        "ARRAY<>",
+        "ARRAY" "ARRAY" "STRUCT<>",
+        "DATE",
+        "STRUCT",
+        "ARRAY>",
+    ],
+)
+def test_parse_sql_to_pyarrow_dtype_invalid_input_raises_error(sql):
+    with pytest.raises(ValueError):
+        output_schemas.parse_sql_type(sql)
+
+
+@pytest.mark.parametrize(
+    ("sql", "expected"),
+    [
+        ("x INT64", (pa.field("x", pa.int64()),)),
+        (
+            "x INT64, y FLOAT64",
+            (pa.field("x", pa.int64()), pa.field("y", pa.float64())),
+        ),
+        (
+            "y FLOAT64, x INT64",
+            (pa.field("x", pa.int64()), pa.field("y", pa.float64())),
+        ),
+    ],
+)
+def test_parse_sql_fields(sql, expected):
+    assert output_schemas.parse_sql_fields(sql) == expected
diff --git a/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py b/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py
index e9d704fa8e..da7f132de3 100644
--- a/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py
+++ b/third_party/bigframes_vendored/ibis/expr/operations/ai_ops.py
@@ -1,6 +1,6 @@
 # Contains code from https://github.com/ibis-project/ibis/blob/9.2.0/ibis/expr/operations/maps.py
 
-"""Operations for working with maps."""
+"""Operations for working with AI operators."""
 
 from __future__ import annotations
 
@@ -11,6 +11,9 @@
 from bigframes_vendored.ibis.expr.operations.core import Value
 import bigframes_vendored.ibis.expr.rules as rlz
 from public import public
+import pyarrow as pa
+
+from bigframes.operations import output_schemas
 
 
 @public
@@ -22,15 +25,27 @@ class AIGenerate(Value):
     endpoint: Optional[Value[dt.String]]
     request_type: Value[dt.String]
     model_params: Optional[Value[dt.String]]
+    output_schema: Optional[Value[dt.String]]
 
     shape = rlz.shape_like("prompt")
 
     @attribute
     def dtype(self) -> dt.Struct:
-        return dt.Struct.from_tuples(
-            (("result", dt.string), ("full_resposne", dt.string), ("status", dt.string))
+        if self.output_schema is None:
+            output_pa_fields = (pa.field("result", pa.string()),)
+        else:
+            output_pa_fields = output_schemas.parse_sql_fields(self.output_schema.value)
+
+        pyarrow_output_type = pa.struct(
+            (
+                
*output_pa_fields, + pa.field("full_resposne", pa.string()), + pa.field("status", pa.string()), + ) ) + return dt.Struct.from_pyarrow(pyarrow_output_type) + @public class AIGenerateBool(Value): From 85142008ec895fa078d192bbab942d0257f70df3 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 7 Oct 2025 13:40:04 -0700 Subject: [PATCH 04/22] feat: Add Index.__eq__ for consts, aligned objects (#2141) --- bigframes/core/indexes/base.py | 52 +++++++++++++++++++++++++++ bigframes/core/indexes/multi.py | 25 +++++++++++++ tests/system/small/test_index.py | 17 +++++++++ tests/system/small/test_multiindex.py | 10 ++++++ 4 files changed, 104 insertions(+) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index a6b18fcb43..83dd11dacb 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -754,6 +754,58 @@ def item(self): # Docstring is in third_party/bigframes_vendored/pandas/core/indexes/base.py return self.to_series().peek(2).item() + def __eq__(self, other) -> Index: # type: ignore + return self._apply_binop(other, ops.eq_op) + + def _apply_binop(self, other, op: ops.BinaryOp) -> Index: + # TODO: Handle local objects, or objects not implicitly alignable? Gets ambiguous with partial ordering though + if isinstance(other, (bigframes.series.Series, Index)): + other = Index(other) + if other.nlevels != self.nlevels: + raise ValueError("Dimensions do not match") + + lexpr = self._block.expr + rexpr = other._block.expr + join_result = lexpr.try_row_join(rexpr) + if join_result is None: + raise ValueError("Cannot align objects") + + expr, (lmap, rmap) = join_result + + expr, res_ids = expr.compute_values( + [ + op.as_expr(lmap[lid], rmap[rid]) + for lid, rid in zip(lexpr.column_ids, rexpr.column_ids) + ] + ) + return Index( + blocks.Block( + expr.select_columns(res_ids), + index_columns=res_ids, + column_labels=[], + index_labels=[None] * len(res_ids), + ) + ) + elif ( + isinstance(other, bigframes.dtypes.LOCAL_SCALAR_TYPES) and self.nlevels == 1 + ): + block, id = self._block.project_expr( + op.as_expr(self._block.index_columns[0], ex.const(other)) + ) + return Index(block.select_column(id)) + elif isinstance(other, tuple) and len(other) == self.nlevels: + block = self._block.project_exprs( + [ + op.as_expr(self._block.index_columns[i], ex.const(other[i])) + for i in range(self.nlevels) + ], + labels=[None] * self.nlevels, + drop=True, + ) + return Index(block.set_index(block.value_columns)) + else: + return NotImplemented + def _should_create_datetime_index(block: blocks.Block) -> bool: if len(block.index.dtypes) != 1: diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py index 182d1f101c..a8b4b7dffe 100644 --- a/bigframes/core/indexes/multi.py +++ b/bigframes/core/indexes/multi.py @@ -19,6 +19,8 @@ import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex import pandas +from bigframes.core import blocks +from bigframes.core import expression as ex from bigframes.core.indexes.base import Index @@ -46,3 +48,26 @@ def from_arrays( pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex return cast(MultiIndex, Index(pd_index)) + + def __eq__(self, other) -> Index: # type: ignore + import bigframes.operations as ops + import bigframes.operations.aggregations as agg_ops + + eq_result = self._apply_binop(other, ops.eq_op)._block.expr + + as_array = ops.ToArrayOp().as_expr( + *( + ops.fillna_op.as_expr(col, 
ex.const(False)) + for col in eq_result.column_ids + ) + ) + reduced = ops.ArrayReduceOp(agg_ops.all_op).as_expr(as_array) + result_expr, result_ids = eq_result.compute_values([reduced]) + return Index( + blocks.Block( + result_expr.select_columns(result_ids), + index_columns=result_ids, + column_labels=(), + index_labels=[None], + ) + ) diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 90986c989a..3fe479af6e 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -668,3 +668,20 @@ def test_custom_index_setitem_error(): with pytest.raises(TypeError, match="Index does not support mutable operations"): custom_index[2] = 999 + + +def test_index_eq_const(scalars_df_index, scalars_pandas_df_index): + bf_result = (scalars_df_index.index == 3).to_pandas() + pd_result = scalars_pandas_df_index.index == 3 + assert bf_result == pd.Index(pd_result) + + +def test_index_eq_aligned_index(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + bpd.Index(scalars_df_index.int64_col) + == bpd.Index(scalars_df_index.int64_col.abs()) + ).to_pandas() + pd_result = pd.Index(scalars_pandas_df_index.int64_col) == pd.Index( + scalars_pandas_df_index.int64_col.abs() + ) + assert bf_result == pd.Index(pd_result) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index f15b8d8b21..3a86d5f6c5 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -1474,3 +1474,13 @@ def test_multi_index_contains(scalars_df_index, scalars_pandas_df_index, key): pd_result = key in scalars_pandas_df_index.set_index(col_name).index assert bf_result == pd_result + + +def test_multiindex_eq_const(scalars_df_index, scalars_pandas_df_index): + col_name = ["int64_col", "bool_col"] + bf_result = scalars_df_index.set_index(col_name).index == (2, False) + pd_result = scalars_pandas_df_index.set_index(col_name).index == (2, False) + + pandas.testing.assert_index_equal( + pandas.Index(pd_result, dtype="boolean"), bf_result.to_pandas() + ) From 8997d4d7d9965e473195f98c550c80657035b7e1 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 7 Oct 2025 22:52:56 -0700 Subject: [PATCH 05/22] fix: Yield row count from read session if otherwise unknown (#2148) --- bigframes/session/read_api_execution.py | 2 +- tests/system/small/test_dataframe.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bigframes/session/read_api_execution.py b/bigframes/session/read_api_execution.py index 037fde011f..2530a1dc8d 100644 --- a/bigframes/session/read_api_execution.py +++ b/bigframes/session/read_api_execution.py @@ -102,7 +102,7 @@ def process_page(page): if peek: batches = pyarrow_utils.truncate_pyarrow_iterable(batches, max_results=peek) - rows = node.source.n_rows + rows = node.source.n_rows or session.estimated_row_count if peek and rows: rows = min(peek, rows) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 1a942a023e..851c934838 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -993,6 +993,12 @@ def test_filter_df(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result) +def test_read_gbq_direct_to_batches_row_count(unordered_session): + df = unordered_session.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") + iter = df.to_pandas_batches() + assert iter.total_rows == 5552452 + + def test_df_to_pandas_batches(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs From 
d13abadbcd68d03997e8dc11bb7a2b14bbd57fcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 8 Oct 2025 11:00:09 -0500 Subject: [PATCH 06/22] docs: remove progress bar from getting started template (#2143) Fixes b/449999815 --- .../getting_started_bq_dataframes.ipynb | 443 +++++++++--------- 1 file changed, 214 insertions(+), 229 deletions(-) diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index 384f3b9c10..fa88cf65bb 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -137,11 +137,112 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "mfPoOwPLGpSr" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: bigframes in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (2.17.0)\n", + "Requirement already satisfied: cloudpickle>=2.0.0 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from bigframes) (3.1.1)\n", + "Requirement already satisfied: fsspec>=2023.3.0 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from bigframes) (2025.9.0)\n", + "Requirement already satisfied: gcsfs!=2025.5.0,>=2023.3.0 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from bigframes) (2025.9.0)\n", + "Requirement already satisfied: geopandas>=0.12.2 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from bigframes) (1.1.1)\n", + "Requirement already satisfied: google-auth<3.0,>=2.15.0 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from bigframes) (2.40.3)\n", + "Requirement already satisfied: google-cloud-bigquery>=3.36.0 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from google-cloud-bigquery[bqstorage,pandas]>=3.36.0->bigframes) (3.36.0)\n", + "Requirement already satisfied: google-cloud-bigquery-storage<3.0.0,>=2.30.0 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from bigframes) (2.33.0)\n", + "Requirement already satisfied: google-cloud-functions>=1.12.0 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from bigframes) (1.20.4)\n", + "Requirement already satisfied: google-cloud-bigquery-connection>=1.12.0 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from bigframes) (1.18.3)\n", + "Requirement already satisfied: google-cloud-resource-manager>=1.10.3 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from bigframes) (1.14.2)\n", + "Requirement already satisfied: google-cloud-storage>=2.0.0 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from bigframes) (3.3.1)\n", + "Requirement already satisfied: 
grpc-google-iam-v1>=0.14.2 in /usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes-2/venv/lib/python3.10/site-packages (from bigframes) (0.14.2)\n", + "... (remaining \"Requirement already satisfied\" lines for the transitive dependencies trimmed) ...\n"
+ ] + } + ], "source": [ "!pip install bigframes" ] }, @@ -230,20 +331,9 @@ "metadata": { "id": "oM1iC_MfAts1" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Updated property [core/project].\n" - ] - } - ], + "outputs": [], "source": [ - "PROJECT_ID = \"\" # @param {type:\"string\"}\n", - "\n", - "# Set the project id\n", - "! gcloud config set project {PROJECT_ID}" + "PROJECT_ID = \"\" # @param {type:\"string\"}" ] }, { @@ -381,7 +471,13 @@ "# It defaults to the location of the first table or query\n", "# passed to read_gbq(). For APIs where a location can't be\n", "# auto-detected, the location defaults to the \"US\" location.\n", - "bpd.options.bigquery.location = REGION" + "bpd.options.bigquery.location = REGION\n", + "\n", + "# Note: By default BigQuery DataFrames emits BigQuery job metadata via a\n", + "# progress bar. In this notebook we disable the progress bar to keep the\n", + "# output less verbose. If you would like the default behavior, please\n", + "# comment out the following expression.
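\n", + "#\n", + "# (Setting this option to \"notebook\" restores the HTML progress bar in\n", + "# Jupyter; \"terminal\" prints plain-text updates instead.)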
\n", + "bpd.options.display.progress_bar = None" ] }, { @@ -432,20 +528,7 @@ "metadata": { "id": "Vyex9BQI-BNa" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job badadf0b-27c8-4dac-a468-be3c40745538 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# bq_df_sample = bpd.read_gbq(\"bigquery-samples.wikipedia_pageviews.200809h\")" ] @@ -477,18 +560,6 @@ "id": "XfGq5apK-D_e" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job c8669c7f-bca3-4f54-b354-8e57b3321f5a is DONE. 34.9 GB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -716,20 +787,7 @@ "metadata": { "id": "EDAaIwHpQCDZ" }, - "outputs": [ - { - "data": { - "text/html": [ - "Load job 93903930-10b8-48b8-b41b-3da54917b281 is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# If order is not important, use the \"bigquery\" engine to\n", "# allow BigQuery DataFrames to read directly from GCS.\n", @@ -752,18 +810,6 @@ "id": "_gPD0Zn1Stdb" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job 17f58b5c-88b2-4b26-8d0d-cc3d9a979a06 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -796,53 +842,53 @@ " \n", " \n", " \n", - " 78\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 47.0\n", - " 17.3\n", - " 185\n", - " 3700\n", - " FEMALE\n", + " 41\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 49.8\n", + " 16.8\n", + " 230\n", + " 5700\n", + " MALE\n", " \n", " \n", - " 130\n", - " Adelie Penguin (Pygoscelis adeliae)\n", + " 73\n", + " Gentoo penguin (Pygoscelis papua)\n", " Biscoe\n", - " 40.5\n", - " 17.9\n", - " 187\n", - " 3200\n", - " FEMALE\n", + " 46.8\n", + " 16.1\n", + " 215\n", + " 5500\n", + " MALE\n", " \n", " \n", - " 84\n", + " 75\n", " Gentoo penguin (Pygoscelis papua)\n", " Biscoe\n", - " 49.1\n", - " 14.5\n", - " 212\n", - " 4625\n", - " FEMALE\n", + " 49.6\n", + " 16.0\n", + " 225\n", + " 5700\n", + " MALE\n", " \n", " \n", - " 334\n", + " 93\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 38.2\n", - " 20.0\n", - " 190\n", - " 3900\n", - " MALE\n", + " 35.5\n", + " 16.2\n", + " 195\n", + " 3350\n", + " FEMALE\n", " \n", " \n", - " 67\n", + " 299\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 55.8\n", - " 19.8\n", - " 207\n", - " 4000\n", + " 52.0\n", + " 18.1\n", + " 201\n", + " 4050\n", " MALE\n", " \n", " \n", @@ -851,18 +897,18 @@ ], "text/plain": [ " species island culmen_length_mm \\\n", - "78 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 \n", - "130 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.5 \n", - "84 Gentoo penguin (Pygoscelis papua) Biscoe 49.1 \n", - "334 Adelie Penguin (Pygoscelis adeliae) Biscoe 38.2 \n", - "67 Chinstrap penguin (Pygoscelis antarctica) Dream 55.8 \n", + "41 Gentoo penguin (Pygoscelis papua) Biscoe 49.8 \n", + "73 Gentoo penguin (Pygoscelis papua) Biscoe 46.8 \n", + "75 Gentoo penguin (Pygoscelis papua) Biscoe 49.6 \n", + "93 Adelie Penguin (Pygoscelis adeliae) Biscoe 35.5 \n", + "299 Chinstrap penguin (Pygoscelis antarctica) Dream 52.0 \n", "\n", " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "78 17.3 185 3700 FEMALE \n", - "130 17.9 187 3200 FEMALE \n", - "84 14.5 212 
4625 FEMALE \n", - "334 20.0 190 3900 MALE \n", - "67 19.8 207 4000 MALE " + "41 16.8 230 5700 MALE \n", + "73 16.1 215 5500 MALE \n", + "75 16.0 225 5700 MALE \n", + "93 16.2 195 3350 FEMALE \n", + "299 18.1 201 4050 MALE " ] }, "execution_count": 15, @@ -936,18 +982,6 @@ "id": "oP1NIAmUBjop" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job 55aa9cc4-29b6-4052-aae4-5499dc5f1168 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/plain": [ @@ -992,18 +1026,6 @@ "id": "IBuo-d6dWfsA" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job 7b2ff811-1563-4ac4-9d21-69f87e8e85bc is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1036,73 +1058,73 @@ " \n", " \n", " \n", - " 12\n", + " 79\n", " Gentoo penguin (Pygoscelis papua)\n", " Biscoe\n", - " 42.7\n", - " 13.7\n", + " 43.3\n", + " 14.0\n", " 208\n", - " 3950\n", + " 4575\n", " FEMALE\n", " \n", " \n", - " 24\n", - " Gentoo penguin (Pygoscelis papua)\n", + " 118\n", + " Adelie Penguin (Pygoscelis adeliae)\n", " Biscoe\n", - " 45.0\n", - " 15.4\n", - " 220\n", - " 5050\n", + " 40.6\n", + " 18.6\n", + " 183\n", + " 3550\n", " MALE\n", " \n", " \n", - " 62\n", + " 213\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 38.8\n", - " 20.0\n", - " 190\n", - " 3950\n", + " Torgersen\n", + " 42.1\n", + " 19.1\n", + " 195\n", + " 4000\n", " MALE\n", " \n", " \n", - " 123\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 42.5\n", - " 17.3\n", - " 187\n", - " 3350\n", + " 315\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 38.7\n", + " 19.0\n", + " 195\n", + " 3450\n", " FEMALE\n", " \n", " \n", - " 27\n", - " Adelie Penguin (Pygoscelis adeliae)\n", + " 338\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 44.1\n", - " 19.7\n", - " 196\n", - " 4400\n", - " MALE\n", + " 40.9\n", + " 16.6\n", + " 187\n", + " 3200\n", + " FEMALE\n", " \n", " \n", "\n", "" ], "text/plain": [ - " species island culmen_length_mm \\\n", - "12 Gentoo penguin (Pygoscelis papua) Biscoe 42.7 \n", - "24 Gentoo penguin (Pygoscelis papua) Biscoe 45.0 \n", - "62 Adelie Penguin (Pygoscelis adeliae) Dream 38.8 \n", - "123 Chinstrap penguin (Pygoscelis antarctica) Dream 42.5 \n", - "27 Adelie Penguin (Pygoscelis adeliae) Dream 44.1 \n", + " species island culmen_length_mm \\\n", + "79 Gentoo penguin (Pygoscelis papua) Biscoe 43.3 \n", + "118 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.6 \n", + "213 Adelie Penguin (Pygoscelis adeliae) Torgersen 42.1 \n", + "315 Adelie Penguin (Pygoscelis adeliae) Torgersen 38.7 \n", + "338 Chinstrap penguin (Pygoscelis antarctica) Dream 40.9 \n", "\n", " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "12 13.7 208 3950 FEMALE \n", - "24 15.4 220 5050 MALE \n", - "62 20.0 190 3950 MALE \n", - "123 17.3 187 3350 FEMALE \n", - "27 19.7 196 4400 MALE " + "79 14.0 208 4575 FEMALE \n", + "118 18.6 183 3550 MALE \n", + "213 19.1 195 4000 MALE \n", + "315 19.0 195 3450 FEMALE \n", + "338 16.6 187 3200 FEMALE " ] }, "execution_count": 18, @@ -1152,18 +1174,6 @@ "id": "6i6HkFJZa8na" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job b396baed-6242-4478-9092-f5e86811b045 is DONE. 31.7 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/plain": [ @@ -1171,10 +1181,10 @@ "279 3150\n", "34 3400\n", "96 3600\n", - "18 3800\n", "208 3950\n", - "310 3175\n", + "18 3800\n", "64 2850\n", + "310 3175\n", "118 3550\n", "2 3075\n", "Name: body_mass_g, dtype: Int64" @@ -1209,7 +1219,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "average_body_mass: 4201.754385964913\n" + "average_body_mass: 4201.754385964914\n" ] } ], @@ -1234,18 +1244,6 @@ "id": "4PyKMR61-Mjy" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job fef05ee2-9690-41a4-bd35-7cded77310f2 is DONE. 15.6 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1366,20 +1364,7 @@ "metadata": { "id": "rSWTOG-vb2Fc" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job c7b6c009-d2c4-4739-a6f8-5ef51e6b1851 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "@bpd.remote_function(cloud_function_service_account=\"default\")\n", "def get_bucket(num: float) -> str:\n", @@ -1410,8 +1395,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Cloud Function Name projects/bigframes-dev/locations/us-central1/functions/bigframes-sessiondf1983-1d02aa9bc80939ba72e7ff69e37e27c8\n", - "Remote Function Name bigframes-dev._f36a8f778c434a1ec421979eaa3bf562a8561e38.bigframes_sessiondf1983_1d02aa9bc80939ba72e7ff69e37e27c8\n" + "Cloud Function Name projects/bigframes-dev/locations/us-central1/functions/bigframes-sessioncf7a5d-aa59468b9d6c757c1256e46c9f71ebe3\n", + "Remote Function Name bigframes-dev._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bigframes_sessioncf7a5d_aa59468b9d6c757c1256e46c9f71ebe3\n" ] } ], @@ -1485,19 +1470,14 @@ " at_or_above_3500\n", " \n", " \n", - " 18\n", - " 3800\n", - " at_or_above_3500\n", - " \n", - " \n", " 208\n", " 3950\n", " at_or_above_3500\n", " \n", " \n", - " 310\n", - " 3175\n", - " below_3500\n", + " 18\n", + " 3800\n", + " at_or_above_3500\n", " \n", " \n", " 64\n", @@ -1505,6 +1485,11 @@ " below_3500\n", " \n", " \n", + " 310\n", + " 3175\n", + " below_3500\n", + " \n", + " \n", " 118\n", " 3550\n", " at_or_above_3500\n", @@ -1524,10 +1509,10 @@ "279 3150 below_3500\n", "34 3400 below_3500\n", "96 3600 at_or_above_3500\n", - "18 3800 at_or_above_3500\n", "208 3950 at_or_above_3500\n", - "310 3175 below_3500\n", + "18 3800 at_or_above_3500\n", "64 2850 below_3500\n", + "310 3175 below_3500\n", "118 3550 at_or_above_3500\n", "2 3075 below_3500" ] @@ -1658,7 +1643,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.10.16" } }, "nbformat": 4, From 095c0b85a25a2e51087880909597cc62a0341c93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 8 Oct 2025 11:01:07 -0500 Subject: [PATCH 07/22] fix: avoid possible circular imports in global session (#2115) --- bigframes/core/global_session.py | 16 ++++++++++--- .../pandas/_config/config.py | 24 ++++++++++++------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/bigframes/core/global_session.py b/bigframes/core/global_session.py index 4698e4c4c5..b055bdb854 100644 --- a/bigframes/core/global_session.py +++ b/bigframes/core/global_session.py @@ -14,16 +14,19 @@ """Utilities for managing a default, globally available Session object.""" +from __future__ 
import annotations + import threading import traceback -from typing import Callable, Optional, TypeVar +from typing import Callable, Optional, TYPE_CHECKING, TypeVar import warnings import google.auth.exceptions -import bigframes._config import bigframes.exceptions as bfe -import bigframes.session + +if TYPE_CHECKING: + import bigframes.session _global_session: Optional[bigframes.session.Session] = None _global_session_lock = threading.Lock() @@ -56,6 +59,9 @@ def close_session() -> None: Returns: None """ + # Avoid troubles with circular imports. + import bigframes._config + global _global_session, _global_session_lock, _global_session_state if bigframes._config.options.is_bigquery_thread_local: @@ -88,6 +94,10 @@ def get_global_session(): Creates the global session if it does not exist. """ + # Avoid troubles with circular imports. + import bigframes._config + import bigframes.session + global _global_session, _global_session_lock, _global_session_state if bigframes._config.options.is_bigquery_thread_local: diff --git a/third_party/bigframes_vendored/pandas/_config/config.py b/third_party/bigframes_vendored/pandas/_config/config.py index 13ccfdac89..418f5868e5 100644 --- a/third_party/bigframes_vendored/pandas/_config/config.py +++ b/third_party/bigframes_vendored/pandas/_config/config.py @@ -2,8 +2,6 @@ import contextlib import operator -import bigframes - class option_context(contextlib.ContextDecorator): """ @@ -35,8 +33,11 @@ def __init__(self, *args) -> None: self.ops = list(zip(args[::2], args[1::2])) def __enter__(self) -> None: + # Avoid problems with circular imports. + import bigframes._config + self.undo = [ - (pat, operator.attrgetter(pat)(bigframes.options)) + (pat, operator.attrgetter(pat)(bigframes._config.options)) for pat, _ in self.ops # Don't try to undo changes to bigquery options. We're starting and # closing a new thread-local session if those are set. @@ -47,6 +48,10 @@ def __enter__(self) -> None: self._set_option(pat, val) def __exit__(self, *args) -> None: + # Avoid problems with circular imports. + import bigframes._config + import bigframes.core.global_session + if self.undo: for pat, val in self.undo: self._set_option(pat, val) @@ -54,18 +59,21 @@ def __exit__(self, *args) -> None: # TODO(tswast): What to do if someone nests several context managers # with separate "bigquery" options? We might need a "stack" of # sessions if we allow that. - if bigframes.options.is_bigquery_thread_local: - bigframes.close_session() + if bigframes._config.options.is_bigquery_thread_local: + bigframes.core.global_session.close_session() # Reset bigquery_options so that we're no longer thread-local. - bigframes.options._local.bigquery_options = None + bigframes._config.options._local.bigquery_options = None def _set_option(self, pat, val): + # Avoid problems with circular imports. + import bigframes._config + root, attr = pat.rsplit(".", 1) # We are now using a thread-specific session. 
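# Options under the "bigquery" prefix live on a thread-local options object, so that object must be initialized before the new value is applied below.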
if root == "bigquery": - bigframes.options._init_bigquery_thread_local() + bigframes._config.options._init_bigquery_thread_local() - parent = operator.attrgetter(root)(bigframes.options) + parent = operator.attrgetter(root)(bigframes._config.options) setattr(parent, attr, val) From 1f48d3a62e7e6dac4acb39e911daf766b8e2fe62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 8 Oct 2025 11:04:17 -0500 Subject: [PATCH 08/22] fix: show progress even in job optional queries (#2119) * fix: show progress even in job optional queries * first attempt at publisher * report execution started/stopped in read_gbq_query * render bigquery sent events * Feat render more events (#2121) * feat: Render more BigQuery events in progress bar This change updates bigframes/formatting_helpers.py to render more event types from bigframes/core/events.py. Specifically, it adds rendering support for: - BigQueryRetryEvent - BigQueryReceivedEvent - BigQueryFinishedEvent - BigQueryUnknownEvent This provides users with more detailed feedback during query execution in both notebook (HTML) and terminal (plaintext) environments. * feat: Render more BigQuery events in progress bar This change updates bigframes/formatting_helpers.py to render more event types from bigframes/core/events.py. Specifically, it adds rendering support for: - BigQueryRetryEvent - BigQueryReceivedEvent - BigQueryFinishedEvent - BigQueryUnknownEvent This provides users with more detailed feedback during query execution in both notebook (HTML) and terminal (plaintext) environments. Unit tests have been added to verify the rendering of each new event type. --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> * fix job links * fix system tests * fix mypy * fix unit tests * support more event types * move publisher to session * fix remaining mypy errors * update text * add explicit unsubscribe * fix presubmits * add lock for publisher and publish temp table creations --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/blob/_functions.py | 1 + bigframes/core/events.py | 237 +++++++++ bigframes/dataframe.py | 82 ++-- bigframes/formatting_helpers.py | 453 +++++++++++++----- bigframes/functions/_function_client.py | 1 + bigframes/functions/function.py | 16 +- bigframes/pandas/__init__.py | 1 + bigframes/session/__init__.py | 27 +- bigframes/session/_io/bigquery/__init__.py | 84 +++- .../session/_io/bigquery/read_gbq_table.py | 48 +- bigframes/session/anonymous_dataset.py | 6 +- bigframes/session/bigquery_session.py | 61 ++- bigframes/session/bq_caching_executor.py | 28 +- bigframes/session/direct_gbq_execution.py | 9 +- bigframes/session/loader.py | 69 ++- bigframes/testing/mocks.py | 1 + setup.py | 2 +- testing/constraints-3.9.txt | 2 +- tests/system/small/engines/conftest.py | 9 +- .../system/small/engines/test_aggregation.py | 16 +- tests/system/small/engines/test_windowing.py | 8 +- .../small/functions/test_remote_function.py | 4 + tests/system/small/test_bq_sessions.py | 10 +- tests/system/small/test_progress_bar.py | 36 +- tests/unit/session/test_io_bigquery.py | 2 + tests/unit/session/test_read_gbq_table.py | 68 ++- tests/unit/session/test_session.py | 15 +- tests/unit/test_formatting_helpers.py | 129 ++++- 28 files changed, 1155 insertions(+), 270 deletions(-) create mode 100644 bigframes/core/events.py diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 
8d1ca38e62..8dd9328fb8 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -99,6 +99,7 @@ def _create_udf(self): project=None, timeout=None, query_with_job=True, + publisher=self._session._publisher, ) return udf_name diff --git a/bigframes/core/events.py b/bigframes/core/events.py new file mode 100644 index 0000000000..d0e5f7ad69 --- /dev/null +++ b/bigframes/core/events.py @@ -0,0 +1,237 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import dataclasses +import datetime +import threading +from typing import Any, Callable, Optional, Set +import uuid + +import google.cloud.bigquery._job_helpers +import google.cloud.bigquery.job.query +import google.cloud.bigquery.table + +import bigframes.session.executor + + +class Subscriber: + def __init__(self, callback: Callable[[Event], None], *, publisher: Publisher): + self._publisher = publisher + self._callback = callback + self._subscriber_id = uuid.uuid4() + + def __call__(self, *args, **kwargs): + return self._callback(*args, **kwargs) + + def __hash__(self) -> int: + return hash(self._subscriber_id) + + def __eq__(self, value: object): + if not isinstance(value, Subscriber): + return NotImplemented + return value._subscriber_id == self._subscriber_id + + def close(self): + self._publisher.unsubscribe(self) + del self._publisher + del self._callback + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if exc_value is not None: + self( + UnknownErrorEvent( + exc_type=exc_type, + exc_value=exc_value, + traceback=traceback, + ) + ) + self.close() + + +class Publisher: + def __init__(self): + self._subscribers_lock = threading.Lock() + self._subscribers: Set[Subscriber] = set() + + def subscribe(self, callback: Callable[[Event], None]) -> Subscriber: + # TODO(b/448176657): figure out how to handle subscribers/publishers in + # a background thread. Maybe subscribers should be thread-local? 
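+ # Each Subscriber hashes on its own UUID (see __hash__ above), so the + # same callback can be registered more than once and removed individually.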
+ subscriber = Subscriber(callback, publisher=self) + with self._subscribers_lock: + self._subscribers.add(subscriber) + return subscriber + + def unsubscribe(self, subscriber: Subscriber): + with self._subscribers_lock: + self._subscribers.remove(subscriber) + + def publish(self, event: Event): + with self._subscribers_lock: + for subscriber in self._subscribers: + subscriber(event) + + +class Event: + pass + + +@dataclasses.dataclass(frozen=True) +class SessionClosed(Event): + session_id: str + + +class ExecutionStarted(Event): + pass + + +class ExecutionRunning(Event): + pass + + +@dataclasses.dataclass(frozen=True) +class ExecutionFinished(Event): + result: Optional[bigframes.session.executor.ExecuteResult] = None + + +@dataclasses.dataclass(frozen=True) +class UnknownErrorEvent(Event): + exc_type: Any + exc_value: Any + traceback: Any + + +@dataclasses.dataclass(frozen=True) +class BigQuerySentEvent(ExecutionRunning): + """Query sent to BigQuery.""" + + query: str + billing_project: Optional[str] = None + location: Optional[str] = None + job_id: Optional[str] = None + request_id: Optional[str] = None + + @classmethod + def from_bqclient(cls, event: google.cloud.bigquery._job_helpers.QuerySentEvent): + return cls( + query=event.query, + billing_project=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, + ) + + +@dataclasses.dataclass(frozen=True) +class BigQueryRetryEvent(ExecutionRunning): + """Query sent another time because the previous attempt failed.""" + + query: str + billing_project: Optional[str] = None + location: Optional[str] = None + job_id: Optional[str] = None + request_id: Optional[str] = None + + @classmethod + def from_bqclient(cls, event: google.cloud.bigquery._job_helpers.QueryRetryEvent): + return cls( + query=event.query, + billing_project=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, + ) + + +@dataclasses.dataclass(frozen=True) +class BigQueryReceivedEvent(ExecutionRunning): + """Query received and acknowledged by the BigQuery API.""" + + billing_project: Optional[str] = None + location: Optional[str] = None + job_id: Optional[str] = None + statement_type: Optional[str] = None + state: Optional[str] = None + query_plan: Optional[list[google.cloud.bigquery.job.query.QueryPlanEntry]] = None + created: Optional[datetime.datetime] = None + started: Optional[datetime.datetime] = None + ended: Optional[datetime.datetime] = None + + @classmethod + def from_bqclient( + cls, event: google.cloud.bigquery._job_helpers.QueryReceivedEvent + ): + return cls( + billing_project=event.billing_project, + location=event.location, + job_id=event.job_id, + statement_type=event.statement_type, + state=event.state, + query_plan=event.query_plan, + created=event.created, + started=event.started, + ended=event.ended, + ) + + +@dataclasses.dataclass(frozen=True) +class BigQueryFinishedEvent(ExecutionRunning): + """Query finished successfully.""" + + billing_project: Optional[str] = None + location: Optional[str] = None + query_id: Optional[str] = None + job_id: Optional[str] = None + destination: Optional[google.cloud.bigquery.table.TableReference] = None + total_rows: Optional[int] = None + total_bytes_processed: Optional[int] = None + slot_millis: Optional[int] = None + created: Optional[datetime.datetime] = None + started: Optional[datetime.datetime] = None + ended: Optional[datetime.datetime] = None + + @classmethod + def from_bqclient( + cls, event: 
google.cloud.bigquery._job_helpers.QueryFinishedEvent + ): + return cls( + billing_project=event.billing_project, + location=event.location, + query_id=event.query_id, + job_id=event.job_id, + destination=event.destination, + total_rows=event.total_rows, + total_bytes_processed=event.total_bytes_processed, + slot_millis=event.slot_millis, + created=event.created, + started=event.started, + ended=event.ended, + ) + + +@dataclasses.dataclass(frozen=True) +class BigQueryUnknownEvent(ExecutionRunning): + """Got unknown event from the BigQuery client library.""" + + # TODO: should we just skip sending unknown events? + + event: object + + @classmethod + def from_bqclient(cls, event): + return cls(event) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1bde29506d..bc2bbb963b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4670,24 +4670,24 @@ def to_string( ) -> str | None: return self.to_pandas(allow_large_results=allow_large_results).to_string( buf, - columns, # type: ignore - col_space, - header, # type: ignore - index, - na_rep, - formatters, - float_format, - sparsify, - index_names, - justify, - max_rows, - max_cols, - show_dimensions, - decimal, - line_width, - min_rows, - max_colwidth, - encoding, + columns=columns, # type: ignore + col_space=col_space, + header=header, # type: ignore + index=index, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + index_names=index_names, + justify=justify, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + line_width=line_width, + min_rows=min_rows, + max_colwidth=max_colwidth, + encoding=encoding, ) def to_html( @@ -4720,28 +4720,28 @@ def to_html( ) -> str: return self.to_pandas(allow_large_results=allow_large_results).to_html( buf, - columns, # type: ignore - col_space, - header, - index, - na_rep, - formatters, - float_format, - sparsify, - index_names, - justify, # type: ignore - max_rows, - max_cols, - show_dimensions, - decimal, - bold_rows, - classes, - escape, - notebook, - border, - table_id, - render_links, - encoding, + columns=columns, # type: ignore + col_space=col_space, + header=header, + index=index, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + index_names=index_names, + justify=justify, # type: ignore + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + bold_rows=bold_rows, + classes=classes, + escape=escape, + notebook=notebook, + border=border, + table_id=table_id, + render_links=render_links, + encoding=encoding, ) def to_markdown( @@ -4753,7 +4753,7 @@ def to_markdown( allow_large_results: Optional[bool] = None, **kwargs, ) -> str | None: - return self.to_pandas(allow_large_results=allow_large_results).to_markdown(buf, mode, index, **kwargs) # type: ignore + return self.to_pandas(allow_large_results=allow_large_results).to_markdown(buf, mode=mode, index=index, **kwargs) # type: ignore def to_pickle(self, path, *, allow_large_results=None, **kwargs) -> None: return self.to_pandas(allow_large_results=allow_large_results).to_pickle( diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 48afb4fdbd..f75394c47d 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -13,11 +13,13 @@ # limitations under the License. 
"""Shared helper functions for formatting jobs related info.""" -# TODO(orrbradford): cleanup up typings and documenttion in this file + +from __future__ import annotations import datetime +import html import random -from typing import Any, Optional, Type, Union +from typing import Any, Optional, Type, TYPE_CHECKING, Union import bigframes_vendored.constants as constants import google.api_core.exceptions as api_core_exceptions @@ -25,7 +27,9 @@ import humanize import IPython import IPython.display as display -import ipywidgets as widgets + +if TYPE_CHECKING: + import bigframes.core.events GenericJob = Union[ bigquery.LoadJob, bigquery.ExtractJob, bigquery.QueryJob, bigquery.CopyJob @@ -58,39 +62,6 @@ def create_exception_with_feedback_link( return exception(constants.FEEDBACK_LINK) -def repr_query_job_html(query_job: Optional[bigquery.QueryJob]): - """Return query job in html format. - Args: - query_job (bigquery.QueryJob, Optional): - The job representing the execution of the query on the server. - Returns: - Pywidget html table. - """ - if query_job is None: - return display.HTML("No job information available") - if query_job.dry_run: - return display.HTML( - f"Computation deferred. Computation will process {get_formatted_bytes(query_job.total_bytes_processed)}" - ) - table_html = "" - table_html += "" - for key, value in query_job_prop_pairs.items(): - job_val = getattr(query_job, value) - if job_val is not None: - if key == "Job Id": # add link to job - table_html += f"""""" - elif key == "Slot Time": - table_html += ( - f"""""" - ) - elif key == "Bytes Processed": - table_html += f"""""" - else: - table_html += f"""""" - table_html += "
{key}{job_val}
{key}{get_formatted_time(job_val)}
{key}{get_formatted_bytes(job_val)}
{key}{job_val}
" - return widgets.HTML(table_html) - - def repr_query_job(query_job: Optional[bigquery.QueryJob]): """Return query job as a formatted string. Args: @@ -109,7 +80,11 @@ def repr_query_job(query_job: Optional[bigquery.QueryJob]): if job_val is not None: res += "\n" if key == "Job Id": # add link to job - res += f"""Job url: {get_job_url(query_job)}""" + res += f"""Job url: {get_job_url( + project_id=query_job.project, + location=query_job.location, + job_id=query_job.job_id, + )}""" elif key == "Slot Time": res += f"""{key}: {get_formatted_time(job_val)}""" elif key == "Bytes Processed": @@ -119,71 +94,90 @@ def repr_query_job(query_job: Optional[bigquery.QueryJob]): return res -def wait_for_query_job( - query_job: bigquery.QueryJob, - max_results: Optional[int] = None, - page_size: Optional[int] = None, - progress_bar: Optional[str] = None, -) -> bigquery.table.RowIterator: - """Return query results. Displays a progress bar while the query is running - Args: - query_job (bigquery.QueryJob, Optional): - The job representing the execution of the query on the server. - max_results (int, Optional): - The maximum number of rows the row iterator should return. - page_size (int, Optional): - The number of results to return on each results page. - progress_bar (str, Optional): - Which progress bar to show. - Returns: - A row iterator over the query results. - """ +current_display: Optional[display.HTML] = None +current_display_id: Optional[str] = None +previous_display_html: str = "" + + +def progress_callback( + event: bigframes.core.events.Event, +): + """Displays a progress bar while the query is running""" + global current_display, current_display_id, previous_display_html + + import bigframes._config + import bigframes.core.events + + progress_bar = bigframes._config.options.display.progress_bar + if progress_bar == "auto": progress_bar = "notebook" if in_ipython() else "terminal" - try: - if progress_bar == "notebook": - display_id = str(random.random()) - loading_bar = display.HTML(get_query_job_loading_html(query_job)) - display.display(loading_bar, display_id=display_id) - query_result = query_job.result( - max_results=max_results, page_size=page_size + if progress_bar == "notebook": + if ( + isinstance(event, bigframes.core.events.ExecutionStarted) + or current_display is None + or current_display_id is None + ): + previous_display_html = "" + current_display_id = str(random.random()) + current_display = display.HTML("Starting.") + display.display( + current_display, + display_id=current_display_id, ) - query_job.reload() + + if isinstance(event, bigframes.core.events.BigQuerySentEvent): + previous_display_html = render_bqquery_sent_event_html(event) display.update_display( - display.HTML(get_query_job_loading_html(query_job)), - display_id=display_id, + display.HTML(previous_display_html), + display_id=current_display_id, ) - elif progress_bar == "terminal": - initial_loading_bar = get_query_job_loading_string(query_job) - print(initial_loading_bar) - query_result = query_job.result( - max_results=max_results, page_size=page_size + elif isinstance(event, bigframes.core.events.BigQueryRetryEvent): + previous_display_html = render_bqquery_retry_event_html(event) + display.update_display( + display.HTML(previous_display_html), + display_id=current_display_id, ) - query_job.reload() - if initial_loading_bar != get_query_job_loading_string(query_job): - print(get_query_job_loading_string(query_job)) - else: - # No progress bar. 
- query_result = query_job.result( - max_results=max_results, page_size=page_size + elif isinstance(event, bigframes.core.events.BigQueryReceivedEvent): + previous_display_html = render_bqquery_received_event_html(event) + display.update_display( + display.HTML(previous_display_html), + display_id=current_display_id, ) - query_job.reload() - return query_result - except api_core_exceptions.RetryError as exc: - add_feedback_link(exc) - raise - except api_core_exceptions.GoogleAPICallError as exc: - add_feedback_link(exc) - raise - except KeyboardInterrupt: - query_job.cancel() - print( - f"Requested cancellation for {query_job.job_type.capitalize()}" - f" job {query_job.job_id} in location {query_job.location}..." - ) - # begin the cancel request before immediately rethrowing - raise + elif isinstance(event, bigframes.core.events.BigQueryFinishedEvent): + previous_display_html = render_bqquery_finished_event_html(event) + display.update_display( + display.HTML(previous_display_html), + display_id=current_display_id, + ) + elif isinstance(event, bigframes.core.events.ExecutionFinished): + display.update_display( + display.HTML(f"✅ Completed. {previous_display_html}"), + display_id=current_display_id, + ) + elif isinstance(event, bigframes.core.events.SessionClosed): + display.update_display( + display.HTML(f"Session {event.session_id} closed."), + display_id=current_display_id, + ) + elif progress_bar == "terminal": + if isinstance(event, bigframes.core.events.ExecutionStarted): + print("Starting execution.") + elif isinstance(event, bigframes.core.events.BigQuerySentEvent): + message = render_bqquery_sent_event_plaintext(event) + print(message) + elif isinstance(event, bigframes.core.events.BigQueryRetryEvent): + message = render_bqquery_retry_event_plaintext(event) + print(message) + elif isinstance(event, bigframes.core.events.BigQueryReceivedEvent): + message = render_bqquery_received_event_plaintext(event) + print(message) + elif isinstance(event, bigframes.core.events.BigQueryFinishedEvent): + message = render_bqquery_finished_event_plaintext(event) + print(message) + elif isinstance(event, bigframes.core.events.ExecutionFinished): + print("Execution done.") def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): @@ -234,24 +228,74 @@ def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): raise -def get_job_url(query_job: GenericJob): +def render_query_references( + *, + project_id: Optional[str], + location: Optional[str], + job_id: Optional[str], + request_id: Optional[str], +) -> str: + query_id = "" + if request_id and not job_id: + query_id = f" with request ID {project_id}:{location}.{request_id}" + return query_id + + +def render_job_link_html( + *, + project_id: Optional[str], + location: Optional[str], + job_id: Optional[str], +) -> str: + job_url = get_job_url( + project_id=project_id, + location=location, + job_id=job_id, + ) + if job_url: + job_link = f' [Job {project_id}:{location}.{job_id} details]' + else: + job_link = "" + return job_link + + +def render_job_link_plaintext( + *, + project_id: Optional[str], + location: Optional[str], + job_id: Optional[str], +) -> str: + job_url = get_job_url( + project_id=project_id, + location=location, + job_id=job_id, + ) + if job_url: + job_link = f" Job {project_id}:{location}.{job_id} details: {job_url}" + else: + job_link = "" + return job_link + + +def get_job_url( + *, + project_id: Optional[str], + location: Optional[str], + job_id: Optional[str], +): """Return url to the query job in cloud 
console. - Args: - query_job (GenericJob): - The job representing the execution of the query on the server. + Returns: String url. """ - if ( - query_job.project is None - or query_job.location is None - or query_job.job_id is None - ): + if project_id is None or location is None or job_id is None: return None - return f"""https://console.cloud.google.com/bigquery?project={query_job.project}&j=bq:{query_job.location}:{query_job.job_id}&page=queryresults""" + return f"""https://console.cloud.google.com/bigquery?project={project_id}&j=bq:{location}:{job_id}&page=queryresults""" -def get_query_job_loading_html(query_job: bigquery.QueryJob): +def render_bqquery_sent_event_html( + event: bigframes.core.events.BigQuerySentEvent, +) -> str: """Return progress bar html string Args: query_job (bigquery.QueryJob): @@ -259,18 +303,195 @@ def get_query_job_loading_html(query_job: bigquery.QueryJob): Returns: Html string. """ - return f"""Query job {query_job.job_id} is {query_job.state}. {get_bytes_processed_string(query_job.total_bytes_processed)}Open Job""" + job_link = render_job_link_html( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, + ) + query_text_details = f"
SQL
{html.escape(event.query)}
" + + return f""" + Query started{query_id}.{job_link}{query_text_details} + """ -def get_query_job_loading_string(query_job: bigquery.QueryJob): - """Return progress bar string + +def render_bqquery_sent_event_plaintext( + event: bigframes.core.events.BigQuerySentEvent, +) -> str: + """Return progress bar html string Args: query_job (bigquery.QueryJob): The job representing the execution of the query on the server. Returns: - String + Html string. + """ + + job_link = render_job_link_plaintext( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, + ) + + return f"Query started{query_id}.{job_link}" + + +def render_bqquery_retry_event_html( + event: bigframes.core.events.BigQueryRetryEvent, +) -> str: + """Return progress bar html string for retry event.""" + + job_link = render_job_link_html( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, + ) + query_text_details = f"
SQL
{html.escape(event.query)}
" + + return f""" + Retrying query{query_id}.{job_link}{query_text_details} + """ + + +def render_bqquery_retry_event_plaintext( + event: bigframes.core.events.BigQueryRetryEvent, +) -> str: + """Return progress bar plaintext string for retry event.""" + + job_link = render_job_link_plaintext( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, + ) + return f"Retrying query{query_id}.{job_link}" + + +def render_bqquery_received_event_html( + event: bigframes.core.events.BigQueryReceivedEvent, +) -> str: + """Return progress bar html string for received event.""" + + job_link = render_job_link_html( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=None, + ) + + query_plan_details = "" + if event.query_plan: + plan_str = "\n".join([str(entry) for entry in event.query_plan]) + query_plan_details = f"
Query Plan
{html.escape(plan_str)}
" + + return f""" + Query{query_id} is {event.state}.{job_link}{query_plan_details} + """ + + +def render_bqquery_received_event_plaintext( + event: bigframes.core.events.BigQueryReceivedEvent, +) -> str: + """Return progress bar plaintext string for received event.""" + + job_link = render_job_link_plaintext( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=None, + ) + return f"Query{query_id} is {event.state}.{job_link}" + + +def render_bqquery_finished_event_html( + event: bigframes.core.events.BigQueryFinishedEvent, +) -> str: + """Return progress bar html string for finished event.""" + + bytes_str = "" + if event.total_bytes_processed is not None: + bytes_str = f" {humanize.naturalsize(event.total_bytes_processed)}" + + slot_time_str = "" + if event.slot_millis is not None: + slot_time = datetime.timedelta(milliseconds=event.slot_millis) + slot_time_str = f" in {humanize.naturaldelta(slot_time)} of slot time" + + job_link = render_job_link_html( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=None, + ) + return f""" + Query processed{bytes_str}{slot_time_str}{query_id}.{job_link} """ - return f"""Query job {query_job.job_id} is {query_job.state}.{get_bytes_processed_string(query_job.total_bytes_processed)} \n{get_job_url(query_job)}""" + + +def render_bqquery_finished_event_plaintext( + event: bigframes.core.events.BigQueryFinishedEvent, +) -> str: + """Return progress bar plaintext string for finished event.""" + + bytes_str = "" + if event.total_bytes_processed is not None: + bytes_str = f" {humanize.naturalsize(event.total_bytes_processed)} processed." + + slot_time_str = "" + if event.slot_millis is not None: + slot_time = datetime.timedelta(milliseconds=event.slot_millis) + slot_time_str = f" Slot time: {humanize.naturaldelta(slot_time)}." + + job_link = render_job_link_plaintext( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=None, + ) + return f"Query{query_id} finished.{bytes_str}{slot_time_str}{job_link}" def get_base_job_loading_html(job: GenericJob): @@ -281,7 +502,11 @@ def get_base_job_loading_html(job: GenericJob): Returns: Html string. """ - return f"""{job.job_type.capitalize()} job {job.job_id} is {job.state}. Open Job""" + return f"""{job.job_type.capitalize()} job {job.job_id} is {job.state}. Open Job""" def get_base_job_loading_string(job: GenericJob): @@ -292,7 +517,11 @@ def get_base_job_loading_string(job: GenericJob): Returns: String """ - return f"""{job.job_type.capitalize()} job {job.job_id} is {job.state}. \n{get_job_url(job)}""" + return f"""{job.job_type.capitalize()} job {job.job_id} is {job.state}. 
\n{get_job_url( + project_id=job.project, + location=job.location, + job_id=job.job_id, + )}""" def get_formatted_time(val): diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index 641bf52dc9..8a88a14040 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -145,6 +145,7 @@ def _create_bq_function(self, create_function_ddl: str) -> None: timeout=None, metrics=None, query_with_job=True, + publisher=self._session._publisher, ) logger.info(f"Created bigframes function {query_job.ddl_target_routine}") diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index 99b89131e7..242daf7525 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -219,7 +219,13 @@ def __call__(self, *args, **kwargs): args_string = ", ".join(map(bf_sql.simple_literal, args)) sql = f"SELECT `{str(self._udf_def.routine_ref)}`({args_string})" - iter, job = bf_io_bigquery.start_query_with_client(self._session.bqclient, sql=sql, query_with_job=True, job_config=bigquery.QueryJobConfig()) # type: ignore + iter, job = bf_io_bigquery.start_query_with_client( + self._session.bqclient, + sql=sql, + query_with_job=True, + job_config=bigquery.QueryJobConfig(), + publisher=self._session._publisher, + ) # type: ignore return list(iter.to_arrow().to_pydict().values())[0][0] @property @@ -297,7 +303,13 @@ def __call__(self, *args, **kwargs): args_string = ", ".join(map(bf_sql.simple_literal, args)) sql = f"SELECT `{str(self._udf_def.routine_ref)}`({args_string})" - iter, job = bf_io_bigquery.start_query_with_client(self._session.bqclient, sql=sql, query_with_job=True, job_config=bigquery.QueryJobConfig()) # type: ignore + iter, job = bf_io_bigquery.start_query_with_client( + self._session.bqclient, + sql=sql, + query_with_job=True, + job_config=bigquery.QueryJobConfig(), + publisher=self._session._publisher, + ) # type: ignore return list(iter.to_arrow().to_pydict().values())[0][0] @property diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 2ea10132bc..2455637b0a 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -291,6 +291,7 @@ def clean_up_by_session_id( session.bqclient, location=location, project=project, + publisher=session._publisher, ) bigframes.session._io.bigquery.delete_tables_matching_session_id( diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index df0afb4c8d..46fb56b88e 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -67,18 +67,14 @@ import bigframes.constants import bigframes.core from bigframes.core import blocks, log_adapter, utils +import bigframes.core.events import bigframes.core.pyformat - -# Even though the ibis.backends.bigquery import is unused, it's needed -# to register new and replacement ops with the Ibis BigQuery backend. +import bigframes.formatting_helpers import bigframes.functions._function_session as bff_session import bigframes.functions.function as bff from bigframes.session import bigquery_session, bq_caching_executor, executor import bigframes.session._io.bigquery as bf_io_bigquery -import bigframes.session.anonymous_dataset import bigframes.session.clients -import bigframes.session.loader -import bigframes.session.metrics import bigframes.session.validation # Avoid circular imports.
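A minimal sketch (not part of this patch) of how the pieces above fit together: once a Session owns a Publisher, any callable can observe query progress alongside the built-in progress bar. Here `session`, `df`, and the private `_publisher` attribute are assumptions for illustration; `Publisher.subscribe`, `Subscriber.close`, and the event classes come from the new bigframes/core/events.py.

import bigframes.core.events as events

def log_progress(event: events.Event) -> None:
    # React only to the events of interest; ignore everything else.
    if isinstance(event, events.BigQuerySentEvent):
        print(f"query sent: {event.job_id or event.request_id}")
    elif isinstance(event, events.ExecutionFinished):
        print("execution finished")

subscriber = session._publisher.subscribe(log_progress)  # hypothetical wiring
try:
    df.to_pandas()  # any operation that issues BigQuery queries
finally:
    subscriber.close()  # explicit unsubscribe, as this PR adds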
@@ -140,6 +136,11 @@ def __init__( _warn_if_bf_version_is_obsolete() + # Publisher needs to be created before the other objects, especially + # the executors, because they access it. + self._publisher = bigframes.core.events.Publisher() + self._publisher.subscribe(bigframes.formatting_helpers.progress_callback) + if context is None: context = bigquery_options.BigQueryOptions() @@ -232,12 +233,14 @@ def __init__( location=self._location, session_id=self._session_id, kms_key=self._bq_kms_key_name, + publisher=self._publisher, ) # Session temp tables don't support specifying kms key, so use anon dataset if kms key specified self._session_resource_manager = ( bigquery_session.SessionResourceManager( self.bqclient, self._location, + publisher=self._publisher, ) if (self._bq_kms_key_name is None) else None @@ -254,6 +257,7 @@ def __init__( scan_index_uniqueness=self._strictly_ordered, force_total_order=self._strictly_ordered, metrics=self._metrics, + publisher=self._publisher, ) self._executor: executor.Executor = bq_caching_executor.BigQueryCachingExecutor( bqclient=self._clients_provider.bqclient, @@ -263,6 +267,7 @@ def __init__( strictly_ordered=self._strictly_ordered, metrics=self._metrics, enable_polars_execution=context.enable_polars_execution, + publisher=self._publisher, ) def __del__(self): @@ -373,10 +378,16 @@ def close(self): remote_function_session = getattr(self, "_function_session", None) if remote_function_session: - self._function_session.clean_up( + remote_function_session.clean_up( self.bqclient, self.cloudfunctionsclient, self.session_id ) + publisher_session = getattr(self, "_publisher", None) + if publisher_session: + publisher_session.publish( + bigframes.core.events.SessionClosed(self.session_id) + ) + @overload def read_gbq( # type: ignore[overload-overlap] self, @@ -2154,6 +2165,7 @@ def _start_query_ml_ddl( timeout=None, query_with_job=True, job_retry=third_party_gcb_retry.DEFAULT_ML_JOB_RETRY, + publisher=self._publisher, ) return iterator, query_job @@ -2181,6 +2193,7 @@ def _create_object_table(self, path: str, connection: str) -> str: project=None, timeout=None, query_with_job=True, + publisher=self._publisher, ) return table diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 83f63e8b9a..aa56dc0040 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -29,11 +29,13 @@ import google.api_core.exceptions import google.api_core.retry import google.cloud.bigquery as bigquery +import google.cloud.bigquery._job_helpers +import google.cloud.bigquery.table from bigframes.core import log_adapter import bigframes.core.compile.googlesql as googlesql +import bigframes.core.events import bigframes.core.sql -import bigframes.formatting_helpers as formatting_helpers import bigframes.session.metrics CHECK_DRIVE_PERMISSIONS = "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions." 
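The session constructs the event hub before any executor so that everything downstream can publish to it, and the progress bar is just the first subscriber. A minimal sketch of the publish/subscribe contract assumed throughout this series (the real `bigframes.core.events.Publisher` may differ in detail):

    from typing import Any, Callable, List

    class Publisher:
        """Fan each published event out to every subscribed callback."""

        def __init__(self) -> None:
            self._subscribers: List[Callable[[Any], None]] = []

        def subscribe(self, callback: Callable[[Any], None]) -> None:
            self._subscribers.append(callback)

        def publish(self, event: Any) -> None:
            for callback in self._subscribers:
                callback(event)

Usage mirroring `Session.__init__` above: create one `Publisher`, `subscribe` the formatting helpers' progress callback, and later `publish` events such as `SessionClosed(session_id)`.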
@@ -238,6 +240,24 @@ def add_and_trim_labels(job_config): ) +def create_bq_event_callback(publisher): + def publish_bq_event(event): + if isinstance(event, google.cloud.bigquery._job_helpers.QueryFinishedEvent): + bf_event = bigframes.core.events.BigQueryFinishedEvent.from_bqclient(event) + elif isinstance(event, google.cloud.bigquery._job_helpers.QueryReceivedEvent): + bf_event = bigframes.core.events.BigQueryReceivedEvent.from_bqclient(event) + elif isinstance(event, google.cloud.bigquery._job_helpers.QueryRetryEvent): + bf_event = bigframes.core.events.BigQueryRetryEvent.from_bqclient(event) + elif isinstance(event, google.cloud.bigquery._job_helpers.QuerySentEvent): + bf_event = bigframes.core.events.BigQuerySentEvent.from_bqclient(event) + else: + bf_event = bigframes.core.events.BigQueryUnknownEvent(event) + + publisher.publish(bf_event) + + return publish_bq_event + + @overload def start_query_with_client( bq_client: bigquery.Client, @@ -249,7 +269,8 @@ def start_query_with_client( timeout: Optional[float], metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[True], -) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + publisher: bigframes.core.events.Publisher, +) -> Tuple[google.cloud.bigquery.table.RowIterator, bigquery.QueryJob]: ... @@ -264,7 +285,8 @@ def start_query_with_client( timeout: Optional[float], metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[False], -) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: + publisher: bigframes.core.events.Publisher, +) -> Tuple[google.cloud.bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: ... @@ -280,7 +302,8 @@ def start_query_with_client( metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[True], job_retry: google.api_core.retry.Retry, -) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + publisher: bigframes.core.events.Publisher, +) -> Tuple[google.cloud.bigquery.table.RowIterator, bigquery.QueryJob]: ... @@ -296,7 +319,8 @@ def start_query_with_client( metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[False], job_retry: google.api_core.retry.Retry, -) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: + publisher: bigframes.core.events.Publisher, +) -> Tuple[google.cloud.bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: ... @@ -315,23 +339,26 @@ def start_query_with_client( # https://github.com/googleapis/python-bigquery/pull/2256 merged, likely # version 3.36.0 or later. job_retry: google.api_core.retry.Retry = third_party_gcb_retry.DEFAULT_JOB_RETRY, -) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: + publisher: bigframes.core.events.Publisher, +) -> Tuple[google.cloud.bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: """ Starts query job and waits for results. """ + # Note: Ensure no additional labels are added to job_config after this + # point, as `add_and_trim_labels` ensures the label count does not + # exceed MAX_LABELS_COUNT. + add_and_trim_labels(job_config) + try: - # Note: Ensure no additional labels are added to job_config after this - # point, as `add_and_trim_labels` ensures the label count does not - # exceed MAX_LABELS_COUNT. 
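The adapter returned by `create_bq_event_callback` above translates client-library query events into bigframes events before forwarding them to the publisher. Any callable can then observe the translated stream; a hedged sketch of such a subscriber, using only attributes that appear in the event constructors elsewhere in this patch:

    import bigframes.core.events

    def log_query_lifecycle(event) -> None:
        # React only to event types produced by the adapter above.
        if isinstance(event, bigframes.core.events.BigQuerySentEvent):
            print(f"query sent: job {event.job_id} in {event.location}")
        elif isinstance(event, bigframes.core.events.BigQueryFinishedEvent):
            print(f"query finished: {event.total_bytes_processed} bytes processed")

    # publisher.subscribe(log_query_lifecycle)  # `publisher` as wired in Session.__init__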
- add_and_trim_labels(job_config) if not query_with_job: - results_iterator = bq_client.query_and_wait( + results_iterator = bq_client._query_and_wait_bigframes( sql, job_config=job_config, location=location, project=project, api_timeout=timeout, job_retry=job_retry, + callback=create_bq_event_callback(publisher), ) if metrics is not None: metrics.count_job_stats(row_iterator=results_iterator) @@ -350,14 +377,32 @@ def start_query_with_client( ex.message += CHECK_DRIVE_PERMISSIONS raise - opts = bigframes.options.display - if opts.progress_bar is not None and not query_job.configuration.dry_run: - results_iterator = formatting_helpers.wait_for_query_job( - query_job, - progress_bar=opts.progress_bar, + if not query_job.configuration.dry_run: + publisher.publish( + bigframes.core.events.BigQuerySentEvent( + sql, + billing_project=query_job.project, + location=query_job.location, + job_id=query_job.job_id, + request_id=None, + ) + ) + results_iterator = query_job.result() + if not query_job.configuration.dry_run: + publisher.publish( + bigframes.core.events.BigQueryFinishedEvent( + billing_project=query_job.project, + location=query_job.location, + job_id=query_job.job_id, + destination=query_job.destination, + total_rows=results_iterator.total_rows, + total_bytes_processed=query_job.total_bytes_processed, + slot_millis=query_job.slot_millis, + created=query_job.created, + started=query_job.started, + ended=query_job.ended, + ) ) - else: - results_iterator = query_job.result() if metrics is not None: metrics.count_job_stats(query_job=query_job) @@ -399,6 +444,8 @@ def create_bq_dataset_reference( bq_client: bigquery.Client, location: Optional[str] = None, project: Optional[str] = None, + *, + publisher: bigframes.core.events.Publisher, ) -> bigquery.DatasetReference: """Create and identify dataset(s) for temporary BQ resources. @@ -430,6 +477,7 @@ def create_bq_dataset_reference( timeout=None, metrics=None, query_with_job=True, + publisher=publisher, ) # The anonymous dataset is used by BigQuery to write query results and diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 00531ce25d..f8a379aee9 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -26,8 +26,9 @@ import bigframes_vendored.constants as constants import google.api_core.exceptions import google.cloud.bigquery as bigquery +import google.cloud.bigquery.table -import bigframes.core.sql +import bigframes.core.events import bigframes.exceptions as bfe import bigframes.session._io.bigquery @@ -43,6 +44,7 @@ def get_table_metadata( *, cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]], use_cache: bool = True, + publisher: bigframes.core.events.Publisher, ) -> Tuple[datetime.datetime, google.cloud.bigquery.table.Table]: """Get the table metadata, either from cache or via REST API.""" @@ -59,6 +61,7 @@ def get_table_metadata( # Don't warn, because that will already have been taken care of. should_warn=False, should_dry_run=False, + publisher=publisher, ): # This warning should only happen if the cached snapshot_time will # have any effect on bigframes (b/437090788). 
For example, with @@ -101,13 +104,14 @@ def get_table_metadata( def is_time_travel_eligible( bqclient: bigquery.Client, - table: bigquery.table.Table, + table: google.cloud.bigquery.table.Table, columns: Optional[Sequence[str]], snapshot_time: datetime.datetime, filter_str: Optional[str] = None, *, should_warn: bool, should_dry_run: bool, + publisher: bigframes.core.events.Publisher, ): """Check if a table is eligible to use time-travel. @@ -184,6 +188,7 @@ def is_time_travel_eligible( timeout=None, metrics=None, query_with_job=False, + publisher=publisher, ) return True @@ -210,10 +215,8 @@ def is_time_travel_eligible( def infer_unique_columns( - bqclient: bigquery.Client, - table: bigquery.table.Table, + table: google.cloud.bigquery.table.Table, index_cols: List[str], - metadata_only: bool = False, ) -> Tuple[str, ...]: """Return a set of columns that can provide a unique row key or empty if none can be inferred. @@ -227,14 +230,37 @@ def infer_unique_columns( # Essentially, just reordering the primary key to match the index col order return tuple(index_col for index_col in index_cols if index_col in primary_keys) - if primary_keys or metadata_only or (not index_cols): - # Sometimes not worth scanning data to check uniqueness + if primary_keys: return primary_keys + + return () + + +def check_if_index_columns_are_unique( + bqclient: bigquery.Client, + table: google.cloud.bigquery.table.Table, + index_cols: List[str], + *, + publisher: bigframes.core.events.Publisher, +) -> Tuple[str, ...]: + import bigframes.core.sql + import bigframes.session._io.bigquery + # TODO(b/337925142): Avoid a "SELECT *" subquery here by ensuring # table_expression only selects just index_cols. is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference) job_config = bigquery.QueryJobConfig() - results = bqclient.query_and_wait(is_unique_sql, job_config=job_config) + results, _ = bigframes.session._io.bigquery.start_query_with_client( + bq_client=bqclient, + sql=is_unique_sql, + job_config=job_config, + timeout=None, + location=None, + project=None, + metrics=None, + query_with_job=False, + publisher=publisher, + ) row = next(iter(results)) if row["total_count"] == row["distinct_count"]: @@ -243,7 +269,7 @@ def infer_unique_columns( def _get_primary_keys( - table: bigquery.table.Table, + table: google.cloud.bigquery.table.Table, ) -> List[str]: """Get primary keys from table if they are set.""" @@ -261,7 +287,7 @@ def _get_primary_keys( def _is_table_clustered_or_partitioned( - table: bigquery.table.Table, + table: google.cloud.bigquery.table.Table, ) -> bool: """Returns True if the table is clustered or partitioned.""" @@ -284,7 +310,7 @@ def _is_table_clustered_or_partitioned( def get_index_cols( - table: bigquery.table.Table, + table: google.cloud.bigquery.table.Table, index_col: Iterable[str] | str | Iterable[int] diff --git a/bigframes/session/anonymous_dataset.py b/bigframes/session/anonymous_dataset.py index ec624d4eb4..3c1757806b 100644 --- a/bigframes/session/anonymous_dataset.py +++ b/bigframes/session/anonymous_dataset.py @@ -20,6 +20,7 @@ import google.cloud.bigquery as bigquery from bigframes import constants +import bigframes.core.events from bigframes.session import temporary_storage import bigframes.session._io.bigquery as bf_io_bigquery @@ -37,10 +38,12 @@ def __init__( location: str, session_id: str, *, - kms_key: Optional[str] = None + kms_key: Optional[str] = None, + publisher: bigframes.core.events.Publisher, ): self.bqclient = bqclient self._location = location + 
self._publisher = publisher self.session_id = session_id self._table_ids: List[bigquery.TableReference] = [] @@ -62,6 +65,7 @@ def dataset(self) -> bigquery.DatasetReference: self._datset_ref = bf_io_bigquery.create_bq_dataset_reference( self.bqclient, location=self._location, + publisher=self._publisher, ) return self._datset_ref diff --git a/bigframes/session/bigquery_session.py b/bigframes/session/bigquery_session.py index 883087df07..99c13007d8 100644 --- a/bigframes/session/bigquery_session.py +++ b/bigframes/session/bigquery_session.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import datetime import logging import threading @@ -23,7 +25,9 @@ import google.cloud.bigquery as bigquery from bigframes.core.compile import googlesql +import bigframes.core.events from bigframes.session import temporary_storage +import bigframes.session._io.bigquery as bfbqio KEEPALIVE_QUERY_TIMEOUT_SECONDS = 5.0 @@ -38,12 +42,19 @@ class SessionResourceManager(temporary_storage.TemporaryStorageManager): Responsible for allocating and cleaning up temporary gbq tables used by a BigFrames session. """ - def __init__(self, bqclient: bigquery.Client, location: str): + def __init__( + self, + bqclient: bigquery.Client, + location: str, + *, + publisher: bigframes.core.events.Publisher, + ): self.bqclient = bqclient self._location = location self._session_id: Optional[str] = None self._sessiondaemon: Optional[RecurringTaskDaemon] = None self._session_lock = threading.RLock() + self._publisher = publisher @property def location(self): @@ -84,21 +95,38 @@ def create_temp_table( ddl = f"CREATE TEMP TABLE `_SESSION`.{googlesql.identifier(table_ref.table_id)} ({fields_string}){cluster_string}" - job = self.bqclient.query( - ddl, job_config=job_config, location=self.location + _, job = bfbqio.start_query_with_client( + self.bqclient, + ddl, + job_config=job_config, + location=self.location, + project=None, + timeout=None, + metrics=None, + query_with_job=True, + publisher=self._publisher, ) job.result() # return the fully qualified table, so it can be used outside of the session - return job.destination + destination = job.destination + assert destination is not None, "Failure to create temp table." + return destination def close(self): if self._sessiondaemon is not None: self._sessiondaemon.stop() if self._session_id is not None and self.bqclient is not None: - self.bqclient.query_and_wait( + bfbqio.start_query_with_client( + self.bqclient, f"CALL BQ.ABORT_SESSION('{self._session_id}')", + job_config=bigquery.QueryJobConfig(), location=self.location, + project=None, + timeout=None, + metrics=None, + query_with_job=False, + publisher=self._publisher, ) def _get_session_id(self) -> str: @@ -109,8 +137,16 @@ def _get_session_id(self) -> str: job_config = bigquery.QueryJobConfig(create_session=True) # Make sure the session is a new one, not one associated with another query. 
job_config.use_query_cache = False - query_job = self.bqclient.query( - "SELECT 1", job_config=job_config, location=self.location + _, query_job = bfbqio.start_query_with_client( + self.bqclient, + "SELECT 1", + job_config=job_config, + location=self.location, + project=None, + timeout=None, + metrics=None, + query_with_job=True, + publisher=self._publisher, ) query_job.result() # blocks until finished assert query_job.session_info is not None @@ -133,11 +169,16 @@ def _keep_session_alive(self): ] ) try: - self.bqclient.query_and_wait( + bfbqio.start_query_with_client( + self.bqclient, "SELECT 1", - location=self.location, job_config=job_config, - wait_timeout=KEEPALIVE_QUERY_TIMEOUT_SECONDS, + location=self.location, + project=None, + timeout=KEEPALIVE_QUERY_TIMEOUT_SECONDS, + metrics=None, + query_with_job=False, + publisher=self._publisher, ) except Exception as e: logging.warning("BigQuery session keep-alive query errored : %s", e) diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index cbda9bc640..d4cfa13aa4 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -33,6 +33,7 @@ import bigframes.core from bigframes.core import compile, local_data, rewrite import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir +import bigframes.core.events import bigframes.core.guid import bigframes.core.identifiers import bigframes.core.nodes as nodes @@ -140,6 +141,7 @@ def __init__( strictly_ordered: bool = True, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, enable_polars_execution: bool = False, + publisher: bigframes.core.events.Publisher, ): self.bqclient = bqclient self.storage_manager = storage_manager @@ -149,6 +151,9 @@ def __init__( self.loader = loader self.bqstoragereadclient = bqstoragereadclient self._enable_polars_execution = enable_polars_execution + self._publisher = publisher + + # TODO(tswast): Send events from semi-executors, too. 
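The executor keeps the session publisher so that `execute()`, shown next, can bracket work in `ExecutionStarted`/`ExecutionFinished` events; the semi-executor wiring continues immediately below. One way to factor that bracketing, sketched here as a hypothetical helper rather than anything in the patch:

    import contextlib

    import bigframes.core.events

    @contextlib.contextmanager
    def publish_execution(publisher):
        publisher.publish(bigframes.core.events.ExecutionStarted())
        yield
        # Reached only on success, matching execute() below: failures
        # propagate without an ExecutionFinished event.
        publisher.publish(bigframes.core.events.ExecutionFinished())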
self._semi_executors: Sequence[semi_executor.SemiExecutor] = ( read_api_execution.ReadApiSemiExecutor( bqstoragereadclient=bqstoragereadclient, @@ -188,6 +193,8 @@ def execute( array_value: bigframes.core.ArrayValue, execution_spec: ex_spec.ExecutionSpec, ) -> executor.ExecuteResult: + self._publisher.publish(bigframes.core.events.ExecutionStarted()) + # TODO: Support export jobs in combination with semi executors if execution_spec.destination_spec is None: plan = self.prepare_plan(array_value.node, target="simplify") @@ -196,6 +203,11 @@ def execute( plan, ordered=execution_spec.ordered, peek=execution_spec.peek ) if maybe_result: + self._publisher.publish( + bigframes.core.events.ExecutionFinished( + result=maybe_result, + ) + ) return maybe_result if isinstance(execution_spec.destination_spec, ex_spec.TableOutputSpec): @@ -204,7 +216,13 @@ def execute( "Ordering and peeking not supported for gbq export" ) # separate path for export_gbq, as it has all sorts of annoying logic, such as possibly running as dml - return self._export_gbq(array_value, execution_spec.destination_spec) + result = self._export_gbq(array_value, execution_spec.destination_spec) + self._publisher.publish( + bigframes.core.events.ExecutionFinished( + result=result, + ) + ) + return result result = self._execute_plan_gbq( array_value.node, @@ -219,6 +237,11 @@ def execute( if isinstance(execution_spec.destination_spec, ex_spec.GcsOutputSpec): self._export_result_gcs(result, execution_spec.destination_spec) + self._publisher.publish( + bigframes.core.events.ExecutionFinished( + result=result, + ) + ) return result def _export_result_gcs( @@ -243,6 +266,7 @@ def _export_result_gcs( location=None, timeout=None, query_with_job=True, + publisher=self._publisher, ) def _maybe_find_existing_table( @@ -404,6 +428,7 @@ def _run_execute_query( location=None, timeout=None, query_with_job=True, + publisher=self._publisher, ) else: return bq_io.start_query_with_client( @@ -415,6 +440,7 @@ def _run_execute_query( location=None, timeout=None, query_with_job=False, + publisher=self._publisher, ) except google.api_core.exceptions.BadRequest as e: diff --git a/bigframes/session/direct_gbq_execution.py b/bigframes/session/direct_gbq_execution.py index 7538c9300f..9e7db87301 100644 --- a/bigframes/session/direct_gbq_execution.py +++ b/bigframes/session/direct_gbq_execution.py @@ -21,6 +21,7 @@ from bigframes.core import compile, nodes from bigframes.core.compile import sqlglot +import bigframes.core.events from bigframes.session import executor, semi_executor import bigframes.session._io.bigquery as bq_io @@ -31,7 +32,11 @@ # reference for validating more complex executors. 
class DirectGbqExecutor(semi_executor.SemiExecutor): def __init__( - self, bqclient: bigquery.Client, compiler: Literal["ibis", "sqlglot"] = "ibis" + self, + bqclient: bigquery.Client, + compiler: Literal["ibis", "sqlglot"] = "ibis", + *, + publisher: bigframes.core.events.Publisher, ): self.bqclient = bqclient self._compile_fn = ( @@ -39,6 +44,7 @@ def __init__( if compiler == "ibis" else sqlglot.SQLGlotCompiler()._compile_sql ) + self._publisher = publisher def execute( self, @@ -83,4 +89,5 @@ def _run_execute_query( timeout=None, metrics=None, query_with_job=False, + publisher=self._publisher, ) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 94d8db6f36..940fdc1352 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -50,6 +50,7 @@ from bigframes.core import guid, identifiers, local_data, nodes, ordering, utils import bigframes.core as core import bigframes.core.blocks as blocks +import bigframes.core.events import bigframes.core.schema as schemata import bigframes.dtypes import bigframes.formatting_helpers as formatting_helpers @@ -262,6 +263,8 @@ def __init__( scan_index_uniqueness: bool, force_total_order: bool, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, + *, + publisher: bigframes.core.events.Publisher, ): self._bqclient = bqclient self._write_client = write_client @@ -273,6 +276,7 @@ def __init__( bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table] ] = {} self._metrics = metrics + self._publisher = publisher # Unfortunate circular reference, but need to pass reference when constructing objects self._session = session self._clock = session_time.BigQuerySyncedClock(bqclient) @@ -499,6 +503,7 @@ def read_gbq_table( # type: ignore[overload-overlap] force_total_order: Optional[bool] = ..., n_rows: Optional[int] = None, index_col_in_columns: bool = False, + publish_execution: bool = True, ) -> dataframe.DataFrame: ... @@ -522,6 +527,7 @@ def read_gbq_table( force_total_order: Optional[bool] = ..., n_rows: Optional[int] = None, index_col_in_columns: bool = False, + publish_execution: bool = True, ) -> pandas.Series: ... @@ -544,6 +550,7 @@ def read_gbq_table( force_total_order: Optional[bool] = None, n_rows: Optional[int] = None, index_col_in_columns: bool = False, + publish_execution: bool = True, ) -> dataframe.DataFrame | pandas.Series: """Read a BigQuery table into a BigQuery DataFrames DataFrame. @@ -603,8 +610,12 @@ def read_gbq_table( when the index is selected from the data columns (e.g., in a ``read_csv`` scenario). The column will be used as the DataFrame's index and removed from the list of value columns. + publish_execution (bool, optional): + If True, sends an execution started and stopped event if this + causes a query. Set to False if using read_gbq_table from + another function that is reporting execution. """ - import bigframes._tools.strings + import bigframes.core.events import bigframes.dataframe as dataframe # --------------------------------- @@ -636,6 +647,7 @@ def read_gbq_table( bq_time=self._clock.get_time(), cache=self._df_snapshot, use_cache=use_cache, + publisher=self._publisher, ) if table.location.casefold() != self._storage_manager.location.casefold(): @@ -756,6 +768,7 @@ def read_gbq_table( filter_str, should_warn=True, should_dry_run=True, + publisher=self._publisher, ) # ---------------------------- @@ -768,12 +781,27 @@ def read_gbq_table( # TODO(b/338065601): Provide a way to assume uniqueness and avoid this # check. 
primary_key = bf_read_gbq_table.infer_unique_columns(
-            bqclient=self._bqclient,
            table=table,
            index_cols=index_cols,
-            # If non in strict ordering mode, don't go through overhead of scanning index column(s) to determine if unique
-            metadata_only=not self._scan_index_uniqueness,
        )
+
+        # If not in strict ordering mode, don't go through the overhead of scanning index column(s) to determine if unique
+        if not primary_key and self._scan_index_uniqueness and index_cols:
+            if publish_execution:
+                self._publisher.publish(
+                    bigframes.core.events.ExecutionStarted(),
+                )
+            primary_key = bf_read_gbq_table.check_if_index_columns_are_unique(
+                self._bqclient,
+                table=table,
+                index_cols=index_cols,
+                publisher=self._publisher,
+            )
+            if publish_execution:
+                self._publisher.publish(
+                    bigframes.core.events.ExecutionFinished(),
+                )
+
        schema = schemata.ArraySchema.from_bq_table(table)
        if not include_all_columns:
            schema = schema.select(index_cols + columns)
@@ -991,6 +1019,12 @@ def read_gbq_query(
                query_job, list(columns), index_cols
            )

+        # We want to make sure we show progress when we actually do execute a
+        # query. Since we have gotten this far, we know it's not a dry run.
+        self._publisher.publish(
+            bigframes.core.events.ExecutionStarted(),
+        )
+
        query_job_for_metrics: Optional[bigquery.QueryJob] = None
        destination: Optional[bigquery.TableReference] = None

@@ -1046,20 +1080,28 @@
        # makes sense to download the results beyond the first page, even if
        # there is a job and destination table available.
        if query_job_for_metrics is None and rows is not None:
-            return bf_read_gbq_query.create_dataframe_from_row_iterator(
+            df = bf_read_gbq_query.create_dataframe_from_row_iterator(
                rows,
                session=self._session,
                index_col=index_col,
                columns=columns,
            )
+            self._publisher.publish(
+                bigframes.core.events.ExecutionFinished(),
+            )
+            return df

        # We already checked rows, so if there's no destination table, then
        # there are no results to return.
        if destination is None:
-            return bf_read_gbq_query.create_dataframe_from_query_job_stats(
+            df = bf_read_gbq_query.create_dataframe_from_query_job_stats(
                query_job_for_metrics,
                session=self._session,
            )
+            self._publisher.publish(
+                bigframes.core.events.ExecutionFinished(),
+            )
+            return df

        # If the query was DDL or DML, return some job metadata. See
        # https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.statement_type
@@ -1070,10 +1112,14 @@
            query_job_for_metrics is not None
            and not bf_read_gbq_query.should_return_query_results(query_job_for_metrics)
        ):
-            return bf_read_gbq_query.create_dataframe_from_query_job_stats(
+            df = bf_read_gbq_query.create_dataframe_from_query_job_stats(
                query_job_for_metrics,
                session=self._session,
            )
+            self._publisher.publish(
+                bigframes.core.events.ExecutionFinished(),
+            )
+            return df

        # Speed up counts by getting counts from result metadata.
        if rows is not None:
@@ -1083,16 +1129,21 @@
        else:
            n_rows = None

-        return self.read_gbq_table(
+        df = self.read_gbq_table(
            f"{destination.project}.{destination.dataset_id}.{destination.table_id}",
            index_col=index_col,
            columns=columns,
            use_cache=configuration["query"]["useQueryCache"],
            force_total_order=force_total_order,
            n_rows=n_rows,
+            publish_execution=False,
            # max_results and filters are omitted because they are already
            # handled by to_query(), above.
) + self._publisher.publish( + bigframes.core.events.ExecutionFinished(), + ) + return df def _query_to_destination( self, @@ -1194,6 +1245,7 @@ def _start_query_with_job_optional( project=None, metrics=None, query_with_job=False, + publisher=self._publisher, ) return rows @@ -1219,6 +1271,7 @@ def _start_query_with_job( project=None, metrics=None, query_with_job=True, + publisher=self._publisher, ) return query_job diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py index 8d9997b1df..ff210419fd 100644 --- a/bigframes/testing/mocks.py +++ b/bigframes/testing/mocks.py @@ -143,6 +143,7 @@ def query_and_wait_mock(query, *args, job_config=None, **kwargs): bqclient.query.side_effect = query_mock bqclient.query_and_wait.side_effect = query_and_wait_mock + bqclient._query_and_wait_bigframes.side_effect = query_and_wait_mock clients_provider = mock.create_autospec(bigframes.session.clients.ClientsProvider) type(clients_provider).bqclient = mock.PropertyMock(return_value=bqclient) diff --git a/setup.py b/setup.py index 2aef514749..abc760b691 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ "gcsfs >=2023.3.0, !=2025.5.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0", - "google-cloud-bigquery[bqstorage,pandas] >=3.31.0", + "google-cloud-bigquery[bqstorage,pandas] >=3.36.0", # 2.30 needed for arrow support. "google-cloud-bigquery-storage >= 2.30.0, < 3.0.0", "google-cloud-functions >=1.12.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 8df3a3a2c3..eceec07dc4 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -6,7 +6,7 @@ geopandas==0.12.2 google-auth==2.15.0 google-cloud-bigtable==2.24.0 google-cloud-pubsub==2.21.4 -google-cloud-bigquery==3.31.0 +google-cloud-bigquery==3.36.0 google-cloud-functions==1.12.0 google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 diff --git a/tests/system/small/engines/conftest.py b/tests/system/small/engines/conftest.py index 9699cc6a61..a775731cde 100644 --- a/tests/system/small/engines/conftest.py +++ b/tests/system/small/engines/conftest.py @@ -19,7 +19,7 @@ import pytest import bigframes -from bigframes.core import ArrayValue, local_data +from bigframes.core import ArrayValue, events, local_data from bigframes.session import ( direct_gbq_execution, local_scan_executor, @@ -50,11 +50,14 @@ def engine(request, bigquery_client: bigquery.Client) -> semi_executor.SemiExecu return local_scan_executor.LocalScanExecutor() if request.param == "polars": return polars_executor.PolarsExecutor() + publisher = events.Publisher() if request.param == "bq": - return direct_gbq_execution.DirectGbqExecutor(bigquery_client) + return direct_gbq_execution.DirectGbqExecutor( + bigquery_client, publisher=publisher + ) if request.param == "bq-sqlglot": return direct_gbq_execution.DirectGbqExecutor( - bigquery_client, compiler="sqlglot" + bigquery_client, compiler="sqlglot", publisher=publisher ) raise ValueError(f"Unrecognized param: {request.param}") diff --git a/tests/system/small/engines/test_aggregation.py b/tests/system/small/engines/test_aggregation.py index a25c167f71..d71013c648 100644 --- a/tests/system/small/engines/test_aggregation.py +++ b/tests/system/small/engines/test_aggregation.py @@ -15,7 +15,14 @@ from google.cloud import bigquery import pytest -from bigframes.core import agg_expressions, array_value, expression, identifiers, nodes +from bigframes.core import ( + agg_expressions, + array_value, + events, + expression, + identifiers, + nodes, +) import 
bigframes.operations.aggregations as agg_ops from bigframes.session import direct_gbq_execution, polars_executor from bigframes.testing.engine_utils import assert_equivalence_execution @@ -112,9 +119,12 @@ def test_sql_engines_median_op_aggregates( scalars_array_value, agg_ops.MedianOp(), ).node - left_engine = direct_gbq_execution.DirectGbqExecutor(bigquery_client) + publisher = events.Publisher() + left_engine = direct_gbq_execution.DirectGbqExecutor( + bigquery_client, publisher=publisher + ) right_engine = direct_gbq_execution.DirectGbqExecutor( - bigquery_client, compiler="sqlglot" + bigquery_client, compiler="sqlglot", publisher=publisher ) assert_equivalence_execution(node, left_engine, right_engine) diff --git a/tests/system/small/engines/test_windowing.py b/tests/system/small/engines/test_windowing.py index f344a3b60a..a34d7b8f38 100644 --- a/tests/system/small/engines/test_windowing.py +++ b/tests/system/small/engines/test_windowing.py @@ -18,6 +18,7 @@ from bigframes.core import ( agg_expressions, array_value, + events, expression, identifiers, nodes, @@ -64,8 +65,11 @@ def test_engines_with_rows_window( skip_reproject_unsafe=False, ) - bq_executor = direct_gbq_execution.DirectGbqExecutor(bigquery_client) + publisher = events.Publisher() + bq_executor = direct_gbq_execution.DirectGbqExecutor( + bigquery_client, publisher=publisher + ) bq_sqlgot_executor = direct_gbq_execution.DirectGbqExecutor( - bigquery_client, compiler="sqlglot" + bigquery_client, compiler="sqlglot", publisher=publisher ) assert_equivalence_execution(window_node, bq_executor, bq_sqlgot_executor) diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 15070a3a29..26c4b89b24 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -28,6 +28,7 @@ import bigframes import bigframes.clients +import bigframes.core.events import bigframes.dtypes import bigframes.exceptions from bigframes.functions import _utils as bff_utils @@ -770,6 +771,7 @@ def test_read_gbq_function_runs_existing_udf_array_output(session, routine_id_un timeout=None, metrics=None, query_with_job=True, + publisher=bigframes.core.events.Publisher(), ) func = session.read_gbq_function(routine_id_unique) @@ -808,6 +810,7 @@ def test_read_gbq_function_runs_existing_udf_2_params_array_output( timeout=None, metrics=None, query_with_job=True, + publisher=bigframes.core.events.Publisher(), ) func = session.read_gbq_function(routine_id_unique) @@ -848,6 +851,7 @@ def test_read_gbq_function_runs_existing_udf_4_params_array_output( timeout=None, metrics=None, query_with_job=True, + publisher=bigframes.core.events.Publisher(), ) func = session.read_gbq_function(routine_id_unique) diff --git a/tests/system/small/test_bq_sessions.py b/tests/system/small/test_bq_sessions.py index 7aad19bd8f..801346600d 100644 --- a/tests/system/small/test_bq_sessions.py +++ b/tests/system/small/test_bq_sessions.py @@ -17,10 +17,10 @@ import google import google.api_core.exceptions -import google.cloud from google.cloud import bigquery import pytest +import bigframes.core.events from bigframes.session import bigquery_session TEST_SCHEMA = [ @@ -39,12 +39,14 @@ def session_resource_manager( bigquery_client, ) -> bigquery_session.SessionResourceManager: - return bigquery_session.SessionResourceManager(bigquery_client, "US") + return bigquery_session.SessionResourceManager( + bigquery_client, "US", publisher=bigframes.core.events.Publisher() + 
) def test_bq_session_create_temp_table_clustered(bigquery_client: bigquery.Client): session_resource_manager = bigquery_session.SessionResourceManager( - bigquery_client, "US" + bigquery_client, "US", publisher=bigframes.core.events.Publisher() ) cluster_cols = ["string field", "bool field"] @@ -68,7 +70,7 @@ def test_bq_session_create_temp_table_clustered(bigquery_client: bigquery.Client def test_bq_session_create_multi_temp_tables(bigquery_client: bigquery.Client): session_resource_manager = bigquery_session.SessionResourceManager( - bigquery_client, "US" + bigquery_client, "US", publisher=bigframes.core.events.Publisher() ) def create_table(): diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 8a323831b5..0c9c4070f4 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -23,7 +23,7 @@ import bigframes.formatting_helpers as formatting_helpers from bigframes.session import MAX_INLINE_DF_BYTES -job_load_message_regex = r"\w+ job [\w-]+ is \w+\." +job_load_message_regex = r"Query" EXPECTED_DRY_RUN_MESSAGE = "Computation deferred. Computation will process" @@ -56,7 +56,7 @@ def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, with bf.option_context("display.progress_bar", "terminal"): penguins_df_default_index["body_mass_g"].head(10).mean() - assert capsys.readouterr().out == "" + assert_loading_msg_exist(capsys.readouterr().out) def test_progress_bar_scalar_allow_large_results( @@ -100,37 +100,19 @@ def test_progress_bar_load_jobs( capsys.readouterr() # clear output session.read_csv(path) - assert_loading_msg_exist(capsys.readouterr().out) + assert_loading_msg_exist(capsys.readouterr().out, pattern="Load") -def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex): - numLoadingMsg = 0 - lines = capystOut.split("\n") +def assert_loading_msg_exist(capstdout: str, pattern=job_load_message_regex): + num_loading_msg = 0 + lines = capstdout.split("\n") lines = [line for line in lines if len(line) > 0] assert len(lines) > 0 for line in lines: - if re.match(pattern, line) is not None: - numLoadingMsg += 1 - assert numLoadingMsg > 0 - - -def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): - with bf.option_context("display.progress_bar", "terminal"): - penguins_df_default_index.to_pandas(allow_large_results=True) - query_job_repr = formatting_helpers.repr_query_job_html( - penguins_df_default_index.query_job - ).value - - string_checks = [ - "Job Id", - "Destination Table", - "Slot Time", - "Bytes Processed", - "Cache hit", - ] - for string in string_checks: - assert string in query_job_repr + if re.search(pattern, line) is not None: + num_loading_msg += 1 + assert num_loading_msg > 0 def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index c451d74d0f..57ac3d88f7 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -22,6 +22,7 @@ import bigframes from bigframes.core import log_adapter +import bigframes.core.events import bigframes.pandas as bpd import bigframes.session._io.bigquery as io_bq from bigframes.testing import mocks @@ -236,6 +237,7 @@ def test_start_query_with_client_labels_length_limit_met( timeout=timeout, metrics=None, query_with_job=True, + publisher=bigframes.core.events.Publisher(), ) assert job_config.labels is not None diff --git 
a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py index 0c67e05813..d21f0000a9 100644 --- a/tests/unit/session/test_read_gbq_table.py +++ b/tests/unit/session/test_read_gbq_table.py @@ -24,13 +24,12 @@ @pytest.mark.parametrize( - ("index_cols", "primary_keys", "values_distinct", "expected"), + ("index_cols", "primary_keys", "expected"), ( - (["col1", "col2"], ["col1", "col2", "col3"], False, ("col1", "col2", "col3")), + (["col1", "col2"], ["col1", "col2", "col3"], ("col1", "col2", "col3")), ( ["col1", "col2", "col3"], ["col1", "col2", "col3"], - True, ("col1", "col2", "col3"), ), ( @@ -39,15 +38,14 @@ "col3", "col2", ], - True, ("col2", "col3"), ), - (["col1", "col2"], [], False, ()), - ([], ["col1", "col2", "col3"], False, ("col1", "col2", "col3")), - ([], [], False, ()), + (["col1", "col2"], [], ()), + ([], ["col1", "col2", "col3"], ("col1", "col2", "col3")), + ([], [], ()), ), ) -def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expected): +def test_infer_unique_columns(index_cols, primary_keys, expected): """If a primary key is set on the table, we use that as the index column by default, no error should be raised in this case. @@ -79,6 +77,49 @@ def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expecte "columns": primary_keys, }, } + + result = bf_read_gbq_table.infer_unique_columns(table, index_cols) + + assert result == expected + + +@pytest.mark.parametrize( + ("index_cols", "values_distinct", "expected"), + ( + ( + ["col1", "col2", "col3"], + True, + ("col1", "col2", "col3"), + ), + ( + ["col2", "col3", "col1"], + True, + ("col2", "col3", "col1"), + ), + (["col1", "col2"], False, ()), + ([], False, ()), + ), +) +def test_check_if_index_columns_are_unique(index_cols, values_distinct, expected): + table = google.cloud.bigquery.Table.from_api_repr( + { + "tableReference": { + "projectId": "my-project", + "datasetId": "my_dataset", + "tableId": "my_table", + }, + "clustering": { + "fields": ["col1", "col2"], + }, + }, + ) + table.schema = ( + google.cloud.bigquery.SchemaField("col1", "INT64"), + google.cloud.bigquery.SchemaField("col2", "INT64"), + google.cloud.bigquery.SchemaField("col3", "INT64"), + google.cloud.bigquery.SchemaField("col4", "INT64"), + ) + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" session = mocks.create_bigquery_session( @@ -87,13 +128,18 @@ def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expecte # Mock bqclient _after_ creating session to override its mocks. 
bqclient.get_table.return_value = table - bqclient.query_and_wait.side_effect = None - bqclient.query_and_wait.return_value = ( + bqclient._query_and_wait_bigframes.side_effect = None + bqclient._query_and_wait_bigframes.return_value = ( {"total_count": 3, "distinct_count": 3 if values_distinct else 2}, ) table._properties["location"] = session._location - result = bf_read_gbq_table.infer_unique_columns(bqclient, table, index_cols) + result = bf_read_gbq_table.check_if_index_columns_are_unique( + bqclient=bqclient, + table=table, + index_cols=index_cols, + publisher=session._publisher, + ) assert result == expected diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 63c82eb30f..d05957b941 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -247,7 +247,7 @@ def test_read_gbq_cached_table(): table, ) - session.bqclient.query_and_wait = mock.MagicMock( + session.bqclient._query_and_wait_bigframes = mock.MagicMock( return_value=({"total_count": 3, "distinct_count": 2},) ) session.bqclient.get_table.return_value = table @@ -278,7 +278,7 @@ def test_read_gbq_cached_table_doesnt_warn_for_anonymous_tables_and_doesnt_inclu table, ) - session.bqclient.query_and_wait = mock.MagicMock( + session.bqclient._query_and_wait_bigframes = mock.MagicMock( return_value=({"total_count": 3, "distinct_count": 2},) ) session.bqclient.get_table.return_value = table @@ -306,7 +306,9 @@ def test_default_index_warning_raised_by_read_gbq(table): bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" bqclient.get_table.return_value = table - bqclient.query_and_wait.return_value = ({"total_count": 3, "distinct_count": 2},) + bqclient._query_and_wait_bigframes.return_value = ( + {"total_count": 3, "distinct_count": 2}, + ) session = mocks.create_bigquery_session( bqclient=bqclient, # DefaultIndexWarning is only relevant for strict mode. @@ -333,7 +335,9 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_sequential_int64 bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" bqclient.get_table.return_value = table - bqclient.query_and_wait.return_value = ({"total_count": 4, "distinct_count": 3},) + bqclient._query_and_wait_bigframes.return_value = ( + {"total_count": 4, "distinct_count": 3}, + ) session = mocks.create_bigquery_session( bqclient=bqclient, # DefaultIndexWarning is only relevant for strict mode. 
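These unit tests stub the new private hook directly on an autospec'd client. The pattern, condensed (it assumes google-cloud-bigquery >= 3.36.0, where `Client._query_and_wait_bigframes` is available, per the setup.py change in this series):

    from unittest import mock

    import google.cloud.bigquery

    bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
    bqclient.project = "test-project"
    # A single row standing in for the uniqueness-check query result.
    bqclient._query_and_wait_bigframes.return_value = (
        {"total_count": 3, "distinct_count": 3},
    )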
@@ -382,7 +386,7 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_columns( bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" bqclient.get_table.return_value = table - bqclient.query_and_wait.return_value = ( + bqclient._query_and_wait_bigframes.return_value = ( {"total_count": total_count, "distinct_count": distinct_count}, ) session = mocks.create_bigquery_session( @@ -492,6 +496,7 @@ def query_mock(query, *args, **kwargs): return session_query_mock(query, *args, **kwargs) session.bqclient.query_and_wait = query_mock + session.bqclient._query_and_wait_bigframes = query_mock def get_table_mock(table_ref): table = google.cloud.bigquery.Table( diff --git a/tests/unit/test_formatting_helpers.py b/tests/unit/test_formatting_helpers.py index 588ef6e824..9dc1379496 100644 --- a/tests/unit/test_formatting_helpers.py +++ b/tests/unit/test_formatting_helpers.py @@ -19,6 +19,7 @@ import google.cloud.bigquery as bigquery import pytest +import bigframes.core.events as bfevents import bigframes.formatting_helpers as formatting_helpers import bigframes.version @@ -30,7 +31,7 @@ def test_wait_for_query_job_error_includes_feedback_link(): ) with pytest.raises(api_core_exceptions.BadRequest) as cap_exc: - formatting_helpers.wait_for_query_job(mock_query_job) + formatting_helpers.wait_for_job(mock_query_job) cap_exc.match("Test message 123.") cap_exc.match(constants.FEEDBACK_LINK) @@ -70,3 +71,129 @@ def test_get_formatted_bytes(test_input, expected): ) def test_get_formatted_time(test_input, expected): assert formatting_helpers.get_formatted_time(test_input) == expected + + +def test_render_bqquery_sent_event_html(): + event = bfevents.BigQuerySentEvent( + query="SELECT * FROM my_table", + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + ) + html = formatting_helpers.render_bqquery_sent_event_html(event) + assert "SELECT * FROM my_table" in html + assert "my-job-id" in html + assert "us-central1" in html + assert "my-project" in html + assert "
" in html + + +def test_render_bqquery_sent_event_plaintext(): + event = bfevents.BigQuerySentEvent( + query="SELECT * FROM my_table", + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + ) + text = formatting_helpers.render_bqquery_sent_event_plaintext(event) + assert "my-job-id" in text + assert "us-central1" in text + assert "my-project" in text + assert "SELECT * FROM my_table" not in text + + +def test_render_bqquery_retry_event_html(): + event = bfevents.BigQueryRetryEvent( + query="SELECT * FROM my_table", + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + ) + html = formatting_helpers.render_bqquery_retry_event_html(event) + assert "Retrying query" in html + assert "SELECT * FROM my_table" in html + assert "my-job-id" in html + assert "us-central1" in html + assert "my-project" in html + assert "
" in html + + +def test_render_bqquery_retry_event_plaintext(): + event = bfevents.BigQueryRetryEvent( + query="SELECT * FROM my_table", + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + ) + text = formatting_helpers.render_bqquery_retry_event_plaintext(event) + assert "Retrying query" in text + assert "my-job-id" in text + assert "us-central1" in text + assert "my-project" in text + assert "SELECT * FROM my_table" not in text + + +def test_render_bqquery_received_event_html(): + mock_plan_entry = mock.create_autospec( + bigquery.job.query.QueryPlanEntry, instance=True + ) + mock_plan_entry.__str__.return_value = "mocked plan" + event = bfevents.BigQueryReceivedEvent( + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + state="RUNNING", + query_plan=[mock_plan_entry], + ) + html = formatting_helpers.render_bqquery_received_event_html(event) + assert "Query" in html + assert "my-job-id" in html + assert "is RUNNING" in html + assert "
" in html + assert "mocked plan" in html + + +def test_render_bqquery_received_event_plaintext(): + event = bfevents.BigQueryReceivedEvent( + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + state="RUNNING", + query_plan=[], + ) + text = formatting_helpers.render_bqquery_received_event_plaintext(event) + assert "Query" in text + assert "my-job-id" in text + assert "is RUNNING" in text + assert "Query Plan" not in text + + +def test_render_bqquery_finished_event_html(): + event = bfevents.BigQueryFinishedEvent( + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + total_bytes_processed=1000, + slot_millis=2000, + ) + html = formatting_helpers.render_bqquery_finished_event_html(event) + assert "Query" in html + assert "my-job-id" in html + assert "processed 1.0 kB" in html + assert "2 seconds of slot time" in html + + +def test_render_bqquery_finished_event_plaintext(): + event = bfevents.BigQueryFinishedEvent( + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + total_bytes_processed=1000, + slot_millis=2000, + ) + text = formatting_helpers.render_bqquery_finished_event_plaintext(event) + assert "Query" in text + assert "my-job-id" in text + assert "finished" in text + assert "1.0 kB processed" in text + assert "Slot time: 2 seconds" in text From cdf2dd55a0c03da50ab92de09788cafac0abf6f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 8 Oct 2025 13:09:35 -0500 Subject: [PATCH 09/22] fix: address typo in error message (#2142) --- tests/system/small/test_dataframe.py | 2 +- third_party/bigframes_vendored/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 851c934838..d0847eee4e 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -6034,7 +6034,7 @@ def test_df_astype_python_types(scalars_dfs): def test_astype_invalid_type_fail(scalars_dfs): bf_df, _ = scalars_dfs - with pytest.raises(TypeError, match=r".*Share your usecase with.*"): + with pytest.raises(TypeError, match=r".*Share your use case with.*"): bf_df.astype(123) diff --git a/third_party/bigframes_vendored/constants.py b/third_party/bigframes_vendored/constants.py index 6d55817a27..9705b19c90 100644 --- a/third_party/bigframes_vendored/constants.py +++ b/third_party/bigframes_vendored/constants.py @@ -23,7 +23,7 @@ import bigframes_vendored.version FEEDBACK_LINK = ( - "Share your usecase with the BigQuery DataFrames team at the " + "Share your use case with the BigQuery DataFrames team at the " "https://bit.ly/bigframes-feedback survey. " f"You are currently running BigFrames version {bigframes_vendored.version.__version__}." 
) From 1f434fb5c7c00601654b3ab19c6ad7fceb258bd6 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 8 Oct 2025 13:16:08 -0700 Subject: [PATCH 10/22] docs: add a brief intro notebook for bbq AI functions (#2150) * docs: add an intro notebook for bbq AI functions * deprecate old ai notebooks * fix grammar * remove the project ID value --- notebooks/experimental/ai_operators.ipynb | 3106 +---------------- .../experimental/semantic_operators.ipynb | 4 +- notebooks/generative_ai/ai_functions.ipynb | 555 +++ 3 files changed, 560 insertions(+), 3105 deletions(-) create mode 100644 notebooks/generative_ai/ai_functions.ipynb diff --git a/notebooks/experimental/ai_operators.ipynb b/notebooks/experimental/ai_operators.ipynb index 977f7b9d74..8aaa3f4b7c 100644 --- a/notebooks/experimental/ai_operators.ipynb +++ b/notebooks/experimental/ai_operators.ipynb @@ -29,3111 +29,11 @@ "id": "rWJnGj2ViouP" }, "source": [ - "# BigFrames AI Operator Tutorial\n", + "All AI operators except for `ai.forecast` have been deprecated.\n", "\n", - "\n", + "The tutorial notebook for AI functions is located at https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/generative_ai/ai_functions.ipynb\n", "\n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"GitHub\n", - " View on GitHub\n", - " \n", - " \n", - " \n", - " \"BQ\n", - " Open in BQ Studio\n", - " \n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mgOrr256iouQ" - }, - "source": [ - "This notebook provides a hands-on preview of AI operator APIs powered by the Gemini model.\n", - "\n", - "The notebook is divided into two sections. The first section introduces the API syntax with examples, aiming to familiarize you with how AI operators work. The second section applies AI operators to a large real-world dataset and presents performance statistics.\n", - "\n", - "This work is inspired by [this paper](https://arxiv.org/pdf/2407.11418) and powered by BigQuery ML and Vertex AI." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2ymVbJV2iouQ" - }, - "source": [ - "# Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vvVzFzo3iouQ" - }, - "source": [ - "First, import the BigFrames modules.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "Jb9glT2ziouQ" - }, - "outputs": [], - "source": [ - "import bigframes\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xQiCWj7OiouQ" - }, - "source": [ - "Make sure the BigFrames version is at least `1.42.0`" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "LTPpI8IpiouQ" - }, - "outputs": [], - "source": [ - "from packaging.version import Version\n", - "\n", - "assert Version(bigframes.__version__) >= Version(\"1.42.0\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "agxLmtlbiouR" - }, - "source": [ - "Turn on the AI operator experiment. You will see a warning sign saying that these operators are still under experiments. If you don't turn on the experiment before using the operators, you will get `NotImplemenetedError`s." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "1wXqdDr8iouR" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:55: PreviewWarning: AI operators are still under experiments, and are subject to change in\n", - "the future.\n", - " warnings.warn(msg, category=bfe.PreviewWarning)\n" - ] - } - ], - "source": [ - "bigframes.options.experiments.ai_operators = True" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "W8TPUvnsqxhv" - }, - "source": [ - "Specify your GCP project and location." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "vCkraKOeqJFl" - }, - "outputs": [], - "source": [ - "bpd.options.bigquery.project = 'bigframes-dev'\n", - "bpd.options.bigquery.location = 'US'" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "n95MFlS0iouR" - }, - "source": [ - "**Optional**: turn off the display of progress bar so that only the operation results will be printed out" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "5r6ahx7MiouR" - }, - "outputs": [], - "source": [ - "bpd.options.display.progress_bar = None" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "93iYvp7niouR" - }, - "source": [ - "Create LLM instances. They will be passed in as parameters for each AI operator.\n", - "\n", - "This tutorial uses the \"gemini-2.0-flash-001\" model for text generation and \"text-embedding-005\" for embedding. 
While these are recommended, you can choose [other Vertex AI LLM models](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) based on your needs and availability. Ensure you have [sufficient quota](https://cloud.google.com/vertex-ai/generative-ai/docs/quotas) for your chosen models and adjust it if necessary." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "tHkymaLNiouR" - }, - "outputs": [], - "source": [ - "from bigframes.ml import llm\n", - "gemini_model = llm.GeminiTextGenerator(model_name=\"gemini-2.0-flash-001\")\n", - "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mbFDcvnPiouR" - }, - "source": [ - "**Note**: AI operators could be expensive over a large set of data. As a result, our team added this option `bigframes.options.compute.ai_ops_confirmation_threshold` at `version 1.42.0` so that the BigFrames will ask for your confirmation if the amount of data to be processed is too large. If the amount of rows exceeds your threshold, you will see a prompt for your keyboard input -- 'y' to proceed and 'n' to abort. If you abort the operation, no LLM processing will be done.\n", - "\n", - "The default threshold is 0, which means the operators will always ask for confirmations. You are free to adjust the value as needed. You can also set the threshold to `None` to disable this feature." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "F4dZm4b7iouR" - }, - "outputs": [], - "source": [ - "if Version(bigframes.__version__) >= Version(\"1.42.0\"):\n", - " bigframes.options.compute.ai_ops_confirmation_threshold = 1000" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_dEA3G9RiouR" - }, - "source": [ - "If you would like your operations to fail automatically when the data is too large, set `bigframes.options.compute.ai_ops_threshold_autofail` to `True`:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "BoUK-cpbiouS" - }, - "outputs": [], - "source": [ - "# if Version(bigframes.__version__) >= Version(\"1.42.0\"):\n", - "# bigframes.options.compute.ai_ops_threshold_autofail = True" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hQft3o3OiouS" - }, - "source": [ - "# API Examples" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dt5Kl-QGiouS" - }, - "source": [ - "You will learn about each AI operator by trying some examples." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "J7XAT459iouS" - }, - "source": [ - "## AI Filtering" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9d5HUIvliouS" - }, - "source": [ - "AI filtering allows you to filter your dataframe based on the instruction (i.e. prompt) you provided.\n", - "\n", - "First, create a dataframe:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "id": "NDpCRGd_iouS", - "outputId": "5048c935-06d3-4ef1-ad87-72e14a30b1b7" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
[3 rows x 2 columns in total]" - ], - "text/plain": [ - " country city\n", - "0 USA Seattle\n", - "1 Germany Berlin\n", - "2 Japan Kyoto\n", - "\n", - "[3 rows x 2 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = bpd.DataFrame({'country': ['USA', 'Germany', 'Japan'], 'city': ['Seattle', 'Berlin', 'Kyoto']})\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6AXmT7sniouS" - }, - "source": [ - "Now, filter this dataframe by keeping only the rows where the value in the `city` column is the capital of the value in the `country` column. Column references can be \"escaped\" by using a pair of braces in your instruction. In this example, your instruction should look like this:\n", - "```\n", - "The {city} is the capital of the {country}.\n", - "```\n", - "\n", - "Note that this is not a Python f-string, so you shouldn't prefix your instruction with an `f`." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 127 - }, - "id": "ipW3Z_l4iouS", - "outputId": "ad447459-225a-419c-d4c8-fedac4a9ed0f" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
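Before looking at the result, it may help to see roughly what `ai.filter` does with this instruction. The sketch below is illustrative only: the real operator batches its prompts through BigQuery ML rather than looping in Python, and `ask_model` is a hypothetical stand-in for a single LLM call.

```python
# Illustrative sketch only -- not the actual implementation.
template = "The {city} is the capital of the {country}"

def keep_row(city: str, country: str) -> bool:
    # Fill the braces with this row's column values.
    prompt = template.format(city=city, country=country)
    # ask_model is a hypothetical helper standing in for one LLM call.
    answer = ask_model(prompt + " Answer True or False.")
    # The operator keeps only the rows for which the model answers True.
    return answer.strip() == "True"
```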
[1 rows x 2 columns in total]" - ], - "text/plain": [ - " country city\n", - "1 Germany Berlin\n", - "\n", - "[1 rows x 2 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.ai.filter(\"The {city} is the capital of the {country}\", model=gemini_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "swKvgfm1iouS" - }, - "source": [ - "The filter operator extracts the information from the referenced columns to enrich your instruction with context. The instruction is then sent to the designated model for evaluation. For filtering operations, the LLM is asked to return only `True` or `False` for each row, and the operator removes the rows accordingly." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r_2AAGGoiouS" - }, - "source": [ - "## AI Mapping" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vT6skC57iouS" - }, - "source": [ - "AI mapping allows you to combine values from multiple columns into a single output based on your instruction.\n", - "\n", - "Here is an example:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "id": "BQ7xeUK3iouS", - "outputId": "33dcb742-77ed-4bea-8dbc-1cf775102a25" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
[3 rows x 2 columns in total]" - ], - "text/plain": [ - " ingredient_1 ingredient_2\n", - "0 Bun Beef Patty\n", - "1 Soy Bean Bittern\n", - "2 Sausage Long Bread\n", - "\n", - "[3 rows x 2 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = bpd.DataFrame({\n", - " \"ingredient_1\": [\"Bun\", \"Soy Bean\", \"Sausage\"],\n", - " \"ingredient_2\": [\"Beef Patty\", \"Bittern\", \"Long Bread\"]\n", - " })\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VFObP2aFiouS" - }, - "source": [ - "Now, you ask LLM what kind of food can be made from the two ingredients in each row. The column reference syntax in your instruction stays the same. In addition, you need to specify the output column name." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you are using BigFrames version `2.5.0` or later, the column name is specified with the `output_schema` parameter. This parameter expects a dictionary input in the form of `{'col_name': 'type_name'}`." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
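The type names also let you request non-string outputs. Below is a minimal sketch reusing the ingredients dataframe above; the second column is invented for illustration, and the exact set of supported type names may vary by BigFrames version.

```python
# Request two typed output columns in one call (the extra
# "is_vegetarian" column is hypothetical, for illustration).
df.ai.map(
    "What food is made from {ingredient_1} and {ingredient_2}, and is it vegetarian?",
    model=gemini_model,
    output_schema={"food": "string", "is_vegetarian": "bool"},
)
```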
[3 rows x 3 columns in total]" - ], - "text/plain": [ - " ingredient_1 ingredient_2 food\n", - "0 Bun Beef Patty Hamburger\n", - "1 Soy Bean Bittern Tofu\n", - "2 Sausage Long Bread Hotdog\n", - "\n", - "[3 rows x 3 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", model=gemini_model, output_schema={\"food\": \"string\"})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you are using BigFrames version 2.4.0 or prior, the column name is specified with the `output_column` parameter. The outputs are always strings." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 190 - }, - "id": "PpL24AQFiouS", - "outputId": "e7aff038-bf4b-4833-def8-fe2648e8885b" - }, - "outputs": [], - "source": [ - "# df.ai.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### AI Extraction\n", - "\n", - "AI mapping is also able to extract multiple pieces of information based on your prompt, because the output schema keys can carry semantic meanings:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
[2 rows x 3 columns in total]" - ], - "text/plain": [ - " text person address\n", - "0 Elmo lives at 123 Sesame Street. Elmo 123 Sesame Street\n", - "1 124 Conch Street is SpongeBob's home SpongeBob 124 Conch Street\n", - "\n", - "[2 rows x 3 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = bpd.DataFrame({\n", - " \"text\": [\n", - " \"Elmo lives at 123 Sesame Street.\", \n", - " \"124 Conch Street is SpongeBob's home\",\n", - " ]\n", - "})\n", - "df.ai.map(\"{text}\", model=gemini_model, output_schema={\"person\": \"string\", \"address\": \"string\"})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "70WTZZfdiouS" - }, - "source": [ - "## AI Joining" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "u93uieRaiouS" - }, - "source": [ - "AI joining can join two dataframes based on the instruction you provide.\n", - "\n", - "First, you prepare two dataframes:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "id": "dffIGEUEiouS" - }, - "outputs": [], - "source": [ - "cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']})\n", - "continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Hz0X-0RtiouS" - }, - "source": [ - "You want to join the `cities` with `continents` to form a new dataframe such that, in each row, the city from the `cities` dataframe is in the continent from the `continents` dataframe. You can re-use the aforementioned column reference syntax:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 221 - }, - "id": "WPIOHEwCiouT", - "outputId": "976586c3-b5db-4088-a46a-44dfbf822ecb" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
[4 rows x 2 columns in total]" - ], - "text/plain": [ - " city continent\n", - "0 Seattle North America\n", - "1 Ottawa North America\n", - "2 Shanghai Asia\n", - "3 New Delhi Asia\n", - "\n", - "[4 rows x 2 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cities.ai.join(continents, \"{city} is in {continent}\", model=gemini_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4Qc97GMWiouT" - }, - "source": [ - "!! **Important:** AI join can trigger prohibitively expensive operations! This operation first cross joins the two dataframes, then invokes AI filter on each row. That means if you have two dataframes of sizes `M` and `N`, the total number of queries sent to the LLM is on the scale of `M * N`." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MUEJXT1IiouT" - }, - "source": [ - "### Self Joins" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QvX-nCogiouT" - }, - "source": [ - "This self-join example demonstrates a special case: what happens when the joining columns exist in both data frames? It turns out that you need to provide extra information in your column references, by attaching \"left.\" and \"right.\" prefixes to your column names.\n", - "\n", - "Create an example data frame:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "OIGz5sqxiouW" - }, - "outputs": [], - "source": [ - "animals = bpd.DataFrame({'animal': ['cow', 'cat', 'spider', 'elephant']})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VmJbuWNniouX" - }, - "source": [ - "You want to compare the weights of these animals, and output all the pairs where the animal on the left is heavier than the animal on the right. In this case, you use `left.animal` and `right.animal` to differentiate the data sources:" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 284 - }, - "id": "UHfggdhBiouX", - "outputId": "a439e3aa-1382-4244-951f-127dc8da0fe3" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
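To make the `M * N` warning above concrete: conceptually, `ai.join` behaves like a cross join followed by `ai.filter`. Here is a sketch of that equivalence with the `cities` and `continents` frames, assuming your BigFrames version supports `how='cross'` in `merge`.

```python
# Conceptual equivalence (illustrative): every city is paired with
# every continent, then each pair costs one LLM evaluation.
pairs = cities.merge(continents, how="cross")  # M * N = 5 * 3 = 15 rows
result = pairs.ai.filter("{city} is in {continent}", model=gemini_model)
# Two 1,000-row inputs would instead generate 1,000,000 prompts.
```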
[6 rows x 2 columns in total]" - ], - "text/plain": [ - " animal_left animal_right\n", - "0 cow cat\n", - "1 cow spider\n", - "2 cat spider\n", - "3 elephant cow\n", - "4 elephant cat\n", - "5 elephant spider\n", - "\n", - "[6 rows x 2 columns]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "animals.ai.join(animals, \"{left.animal} generally weighs heavier than {right.animal}\", model=gemini_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sIszJ0zPiouX" - }, - "source": [ - "## AI Search" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e4ojHRKAiouX" - }, - "source": [ - "AI search finds the values most similar to your query within a single column. Here is an example:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 253 - }, - "id": "gnQSIZ5SiouX", - "outputId": "dd6e1ecb-1bad-4a7c-8065-e56c697d0863" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
[5 rows x 1 columns in total]" - ], - "text/plain": [ - " creatures\n", - "0 salmon\n", - "1 sea urchin\n", - "2 baboons\n", - "3 frog\n", - "4 chimpanzee\n", - "\n", - "[5 rows x 1 columns]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = bpd.DataFrame({\"creatures\": [\"salmon\", \"sea urchin\", \"baboons\", \"frog\", \"chimpanzee\"]})\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5apfIaZMiouX" - }, - "source": [ - "You want to get the top 2 creatures that are most similar to \"monkey\":" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 159 - }, - "id": "CkAuFgPYiouY", - "outputId": "723c7604-f53c-43d7-c754-4c91ec198dff" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n", - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n", - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
[2 rows x 2 columns in total]" - ], - "text/plain": [ - " creatures similarity score\n", - "2 baboons 0.708434\n", - "4 chimpanzee 0.635844\n", - "\n", - "[2 rows x 2 columns]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.ai.search(\"creatures\", query=\"monkey\", top_k = 2, model = text_embedding_model, score_column='similarity score')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GDZeVzFTiouY" - }, - "source": [ - "Note that you are using a text embedding model this time. This model generates embedding vectors for both your query as well as the values in the search space. The operator then uses BigQuery's built-in VECTOR_SEARCH function to find the nearest neighbors of your query.\n", - "\n", - "In addition, `score_column` is an optional parameter for storing the distances between the results and your query. If not set, the score column won't be attached to the result." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EXNutIXqiouY" - }, - "source": [ - "## AI Similarity Join" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BhWrhQMjiouY" - }, - "source": [ - "When you want to perform multiple similarity queries in the same value space, you could use similarity join to simplify your call. For example:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "id": "cUc7-8O6iouY" - }, - "outputs": [], - "source": [ - "df1 = bpd.DataFrame({'animal': ['monkey', 'spider', 'salmon', 'giraffe', 'sparrow']})\n", - "df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon', 'owl', 'elephant', 'tuna']})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "k96WerOviouY" - }, - "source": [ - "In this example, you want to pick the most related animal from `df2` for each value in `df1`." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 253 - }, - "id": "wPV5EkfpiouY", - "outputId": "4be1211d-0353-4b94-8c27-ebd568e8e104" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n", - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
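As described above, `ai.search` embeds both sides and delegates the lookup to `VECTOR_SEARCH`. Below is a rough sketch of those two steps written from memory, not the operator's actual code; the vector-search step is only described in comments rather than called, since its exact signature varies across versions.

```python
# Rough sketch of ai.search's two steps (not the actual implementation).

# 1. Embed the search space and the query with the same model.
space_embeddings = text_embedding_model.predict(df["creatures"])
query_embedding = text_embedding_model.predict(bpd.Series(["monkey"]))

# 2. ai.search then hands both embedding sets to BigQuery's built-in
#    VECTOR_SEARCH (exposed as bigframes.bigquery.vector_search) to find
#    the top_k nearest neighbours; the resulting distances are what the
#    optional score_column stores.
```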
[5 rows x 3 columns in total]" - ], - "text/plain": [ - " animal animal_1 distance\n", - "0 monkey baboon 0.620521\n", - "1 spider scorpion 0.728024\n", - "2 salmon tuna 0.782141\n", - "3 giraffe elephant 0.7135\n", - "4 sparrow owl 0.810864\n", - "\n", - "[5 rows x 3 columns]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1.ai.sim_join(df2, left_on='animal', right_on='animal', top_k=1, model=text_embedding_model, score_column='distance')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GplzD7v0iouY" - }, - "source": [ - "!! **Important:** Like AI join, this operator can also be very expensive. To guard against unexpected processing of large datasets, use the `bigframes.options.compute.ai_ops_confirmation_threshold` option to specify a threshold." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hgj8GoQhiouY" - }, - "source": [ - "# Performance Analyses" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EZomL0BciouY" - }, - "source": [ - "In this section, you will use BigQuery's public Hacker News data to perform some heavy work. We recommend reading the code without executing it in order to save time and money. The execution results are attached after each cell for your reference.\n", - "\n", - "First, load 3k rows from the table:" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 880 - }, - "id": "wRR0SrcSiouY", - "outputId": "3b25f3a3-09c7-4396-9107-4aa4cdb4b963" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
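One way to act on that warning before a large cross-product operation is to combine the two guard options introduced earlier in this notebook, so that oversized jobs fail fast instead of waiting for keyboard input. A sketch; pick a threshold that matches your budget:

```python
# Abort automatically, instead of prompting, whenever more than
# 1,000 rows would be sent to the LLM.
bigframes.options.compute.ai_ops_confirmation_threshold = 1000
bigframes.options.compute.ai_ops_threshold_autofail = True
```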
[3000 rows x 6 columns in total]" - ], - "text/plain": [ - " title \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 Workplace Wellness Programs Don’t Work Well. W... \n", - "4 \n", - "5 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 Taiwan’s Tech King to Nancy Pelosi: U.S. Is in... \n", - "11 Android’s new multitasking is terrible and sho... \n", - "12 \n", - "13 \n", - "14 \n", - "15 \n", - "16 \n", - "17 \n", - "18 \n", - "19 \n", - "20 \n", - "21 \n", - "22 \n", - "23 \n", - "24 \n", - "\n", - " text by score \\\n", - "0 \n", - "1 I'd agree about border control with a cav... bandrami \n", - "2 So 4 pickups? At least pickups are high margin... seanmcdirmid \n", - "3 anarbadalov 2 \n", - "4 Are you implying that to be a good developer y... ecesena \n", - "5 It pretty much works with other carriers. My s... toast0 \n", - "6 \n", - "7 "not operated for profit" and "... radford-neal \n", - "8 It's a good description of one applicatio... dkarl \n", - "9 Might be a bit high, but....
"For ex... tyingq \n", - "10 dlcmh 11 \n", - "11 wowamit 1 \n", - "12 SEEKING WORK | REMOTE | US Citizen
Location:... rasikjain \n", - "13 I had a very similar experience last month tea... tmaly \n", - "14 mrtweetyhack \n", - "15 > Just do what most American cities do with... AnthonyMouse \n", - "16 It's not a space. The l and the C are at ... antninja \n", - "17 I’ve knowingly paid the premium in the past, j... zwily \n", - "18 > Any sufficiently complicated C or Fortran... wavemode \n", - "19 It's similar to a lot of Japanese "t... TillE \n", - "20 Engineers are just people paid to code. If you... rchaud \n", - "21 So don't use it CyberDildonics \n", - "22 Sure, but there are degrees of these things. T... dang \n", - "23 I wish this would happen. There's a &quo... coredog64 \n", - "24 I’m not sure why responsible riders wouldn’t w... mjmahone17 \n", - "\n", - " timestamp type \n", - "0 2010-04-16 19:52:51+00:00 comment \n", - "1 2023-06-04 06:12:00+00:00 comment \n", - "2 2023-09-19 14:19:46+00:00 comment \n", - "3 2018-08-07 12:17:45+00:00 story \n", - "4 2016-06-10 19:38:25+00:00 comment \n", - "5 2024-08-13 03:11:32+00:00 comment \n", - "6 2020-06-07 22:43:03+00:00 comment \n", - "7 2020-03-19 00:24:47+00:00 comment \n", - "8 2024-10-07 13:38:18+00:00 comment \n", - "9 2017-01-23 19:49:15+00:00 comment \n", - "10 2023-02-18 02:51:11+00:00 story \n", - "11 2018-10-22 09:50:36+00:00 story \n", - "12 2024-08-01 16:56:49+00:00 comment \n", - "13 2020-01-22 18:26:36+00:00 comment \n", - "14 2022-02-26 19:34:00+00:00 comment \n", - "15 2021-10-04 23:10:50+00:00 comment \n", - "16 2013-07-13 09:48:34+00:00 comment \n", - "17 2020-06-17 14:26:43+00:00 comment \n", - "18 2025-02-07 06:42:53+00:00 comment \n", - "19 2022-11-06 17:15:10+00:00 comment \n", - "20 2023-04-12 14:31:42+00:00 comment \n", - "21 2015-12-29 22:01:16+00:00 comment \n", - "22 2021-11-11 23:42:12+00:00 comment \n", - "23 2018-02-12 16:03:37+00:00 comment \n", - "24 2021-11-09 01:36:01+00:00 comment \n", - "...\n", - "\n", - "[3000 rows x 6 columns]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hacker_news = bpd.read_gbq(\"bigquery-public-data.hacker_news.full\")[['title', 'text', 'by', 'score', 'timestamp', 'type']].head(3000)\n", - "hacker_news" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3e94DPOdiouY" - }, - "source": [ - "Then, keep only the rows that have text content:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mQl8hc1biouY", - "outputId": "2b4ffa85-9d95-4a20-9040-0420c67da2d4" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "2533" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hacker_news_with_texts = hacker_news[hacker_news['text'].isnull() == False]\n", - "len(hacker_news_with_texts)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JWalDtLDiouZ" - }, - "source": [ - "You can get an idea of the input token length by calculating the average string length." 
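A crude way to turn that average length into a token estimate is the rule of thumb of roughly four characters per English token; this is an assumption, not the model's actual tokenizer. Applied to the average computed in the next cell (about 393 characters), it suggests on the order of 100 tokens per row.

```python
avg_chars = hacker_news_with_texts["text"].str.len().mean()
# ~4 characters per token is a common back-of-the-envelope ratio.
print(f"roughly {avg_chars / 4:.0f} tokens per row")
```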
- ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PZeg4LCUiouZ", - "outputId": "05b67cac-6b3d-42ef-d6d6-b578a9734f4c" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "393.2356889064355" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hacker_news_with_texts['text'].str.len().mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2IXqskHHiouZ" - }, - "source": [ - "**Optional**: You can raise the confirmation threshold for a smoother experience." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EpjXQ4FViouZ" - }, - "outputs": [], - "source": [ - "if Version(bigframes.__version__) >= Version(\"1.42.0\"):\n", - " bigframes.options.compute.ai_ops_confirmation_threshold = 5000" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SYFB-X1RiouZ" - }, - "source": [ - "Now it's LLM's turn. You want to keep only the rows whose texts are talking about iPhone. This will take several minutes to finish." - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 253 - }, - "id": "rditQlmoiouZ", - "outputId": "2b44dcbf-2ef5-4119-ca05-9b082db9c0c1" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
[11 rows x 6 columns in total]" - ], - "text/plain": [ - " title text \\\n", - "445 If I want to manipulate a device, I'll bu... \n", - "967 I've had my 6S Plus now for 36 months and... \n", - "1253 Apple is far more closed and tyrannical with i... \n", - "1274 An iOS version was released earlier this year.... \n", - "1548 I’m not sure how that fits with Apple pursuing... \n", - "1630 Not sure if you’re being ironic, but I use an ... \n", - "1664 Quoting from the article I linked you:
>&... \n", - "1884 > Not all wireless headsets are the same, h... \n", - "2251 Will not buy any more apple product, iphone 4s... \n", - "2877 I've been an iPhone user since the OG in ... \n", - "\n", - " by score timestamp type \n", - "445 exelius <NA> 2017-09-21 17:39:37+00:00 comment \n", - "967 blinding-streak <NA> 2023-04-30 19:10:16+00:00 comment \n", - "975 throwaway427 <NA> 2019-01-03 18:06:33+00:00 comment \n", - "1253 RyanMcGreal <NA> 2012-12-21 00:45:40+00:00 comment \n", - "1274 pls2halp <NA> 2017-12-09 06:36:41+00:00 comment \n", - "1548 alphabettsy <NA> 2021-12-26 19:41:38+00:00 comment \n", - "1630 lxgr <NA> 2025-03-29 03:57:25+00:00 comment \n", - "1664 StreamBright <NA> 2017-09-11 19:57:34+00:00 comment \n", - "1884 cptskippy <NA> 2021-11-16 13:28:44+00:00 comment \n", - "2251 omi <NA> 2012-09-11 14:42:52+00:00 comment \n", - "2877 vsnf <NA> 2024-04-15 06:28:09+00:00 comment \n", - "\n", - "[11 rows x 6 columns]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "iphone_comments = hacker_news_with_texts.ai.filter(\"The {text} is mainly focused on iPhone\", gemini_model)\n", - "iphone_comments" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yl24sJFIiouZ" - }, - "source": [ - "The performance of the AI operators depends on the length of your input as well as your quota. Here are our benchmarks for running the previous operation with Gemini Flash 1.5 over data of different sizes, supposing your quota is [the default 200 requests per minute](https://cloud.google.com/vertex-ai/generative-ai/docs/quotas):\n", - "\n", - "* 800 Rows -> ~4m\n", - "* 2550 Rows -> ~13m\n", - "* 8500 Rows -> ~40m\n", - "\n", - "These numbers can give you a general idea of how fast the operators run." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eo4nfISuiouZ" - }, - "source": [ - "Now, use the LLM to summarize the sentiments towards iPhone:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 253 - }, - "id": "IlKBrNxUiouZ", - "outputId": "818d01e4-1cdf-42a2-9e02-61c4736a8905" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
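Those timings line up with simple quota arithmetic: at one request per row and 200 requests per minute, a rough runtime estimate is `rows / 200` minutes. A quick check against the numbers above:

```python
def estimated_minutes(rows: int, requests_per_minute: int = 200) -> float:
    # One LLM request per row, throttled by the per-minute quota.
    return rows / requests_per_minute

for rows in (800, 2550, 8500):
    print(f"{rows} rows -> ~{estimated_minutes(rows):.0f} minutes")
# 800 -> 4, 2550 -> ~13, 8500 -> ~43, in line with the observed timings.
```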
[11 rows x 7 columns in total]" - ], - "text/plain": [ - " title text \\\n", - "445 If I want to manipulate a device, I'll bu... \n", - "967
I've had my 6S Plus now for 36 months and... \n", - "1253 Apple is far more closed and tyrannical with i... \n", - "1274 An iOS version was released earlier this year.... \n", - "1548 I’m not sure how that fits with Apple pursuing... \n", - "1630 Not sure if you’re being ironic, but I use an ... \n", - "1664 Quoting from the article I linked you:
>&... \n", - "1884 > Not all wireless headsets are the same, h... \n", - "2251 Will not buy any more apple product, iphone 4s... \n", - "2877 I've been an iPhone user since the OG in ... \n", - "\n", - " by score timestamp type \\\n", - "445 exelius 2017-09-21 17:39:37+00:00 comment \n", - "967 blinding-streak 2023-04-30 19:10:16+00:00 comment \n", - "975 throwaway427 2019-01-03 18:06:33+00:00 comment \n", - "1253 RyanMcGreal 2012-12-21 00:45:40+00:00 comment \n", - "1274 pls2halp 2017-12-09 06:36:41+00:00 comment \n", - "1548 alphabettsy 2021-12-26 19:41:38+00:00 comment \n", - "1630 lxgr 2025-03-29 03:57:25+00:00 comment \n", - "1664 StreamBright 2017-09-11 19:57:34+00:00 comment \n", - "1884 cptskippy 2021-11-16 13:28:44+00:00 comment \n", - "2251 omi 2012-09-11 14:42:52+00:00 comment \n", - "2877 vsnf 2024-04-15 06:28:09+00:00 comment \n", - "\n", - " sentiment \n", - "445 Pragmatic, slightly annoyed\n", - " \n", - "967 I lack the ability to access external websites... \n", - "975 Generally positive, impressed.\n", - " \n", - "1253 Negative towards Apple\n", - " \n", - "1274 Neutral, factual statement.\n", - " \n", - "1548 Skeptical and critical.\n", - " \n", - "1630 Wants interoperability, frustrated.\n", - " \n", - "1664 Extremely positive review\n", - " \n", - "1884 Skeptical and critical\n", - " \n", - "2251 Negative, regretful.\n", - " \n", - "2877 Mildly annoyed, resigned\n", - " \n", - "\n", - "[11 rows x 7 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "iphone_comments.ai.map(\"Summarize the sentiment of the {text}. Your answer should have at most 3 words\", output_column=\"sentiment\", model=gemini_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y7_16T2xiouZ" - }, - "source": [ - "Here is another example: count the number of rows whose authors have animals in their names." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 880 - }, - "id": "CbGwc_uXiouZ", - "outputId": "138acca0-7fb9-495a-e797-0d42495d65e6" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3577: UserWarning: Reading cached table from 2025-04-02 18:00:55.801294+00:00 to avoid\n", - "incompatibilies with previous reads of this table. To read the latest\n", - "version, set `use_cache=False` or close the current session with\n", - "Session.close() or bigframes.pandas.close_session().\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n" - ] - }, - { - "data": { - "text/html": [ - "
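Since the goal here is a count rather than the rows themselves, take the length of the filtered frame. A sketch using the frames above; the run captured below found 112 matching rows:

```python
animal_authors = hacker_news.ai.filter("{by} contains animal name", model=gemini_model)
print(len(animal_authors))  # 112 in the run recorded in this notebook
```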
[3000 rows x 6 columns in total]" - ], - "text/plain": [ - " title \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 Workplace Wellness Programs Don’t Work Well. W... \n", - "4 \n", - "5 \n", - "6 \n", - "7 \n", - "8 \n", - "9 \n", - "10 Taiwan’s Tech King to Nancy Pelosi: U.S. Is in... \n", - "11 Android’s new multitasking is terrible and sho... \n", - "12 \n", - "13 \n", - "14 \n", - "15 \n", - "16 \n", - "17 \n", - "18 \n", - "19 \n", - "20 \n", - "21 \n", - "22 \n", - "23 \n", - "24 \n", - "\n", - " text by score \\\n", - "0 \n", - "1 I'd agree about border control with a cav... bandrami \n", - "2 So 4 pickups? At least pickups are high margin... seanmcdirmid \n", - "3 anarbadalov 2 \n", - "4 Are you implying that to be a good developer y... ecesena \n", - "5 It pretty much works with other carriers. My s... toast0 \n", - "6 \n", - "7 "not operated for profit" and "... radford-neal \n", - "8 It's a good description of one applicatio... dkarl \n", - "9 Might be a bit high, but....
"For ex... tyingq \n", - "10 dlcmh 11 \n", - "11 wowamit 1 \n", - "12 SEEKING WORK | REMOTE | US Citizen
Location:... rasikjain \n", - "13 I had a very similar experience last month tea... tmaly \n", - "14 mrtweetyhack \n", - "15 > Just do what most American cities do with... AnthonyMouse \n", - "16 It's not a space. The l and the C are at ... antninja \n", - "17 I’ve knowingly paid the premium in the past, j... zwily \n", - "18 > Any sufficiently complicated C or Fortran... wavemode \n", - "19 It's similar to a lot of Japanese "t... TillE \n", - "20 Engineers are just people paid to code. If you... rchaud \n", - "21 So don't use it CyberDildonics \n", - "22 Sure, but there are degrees of these things. T... dang \n", - "23 I wish this would happen. There's a &quo... coredog64 \n", - "24 I’m not sure why responsible riders wouldn’t w... mjmahone17 \n", - "\n", - " timestamp type \n", - "0 2010-04-16 19:52:51+00:00 comment \n", - "1 2023-06-04 06:12:00+00:00 comment \n", - "2 2023-09-19 14:19:46+00:00 comment \n", - "3 2018-08-07 12:17:45+00:00 story \n", - "4 2016-06-10 19:38:25+00:00 comment \n", - "5 2024-08-13 03:11:32+00:00 comment \n", - "6 2020-06-07 22:43:03+00:00 comment \n", - "7 2020-03-19 00:24:47+00:00 comment \n", - "8 2024-10-07 13:38:18+00:00 comment \n", - "9 2017-01-23 19:49:15+00:00 comment \n", - "10 2023-02-18 02:51:11+00:00 story \n", - "11 2018-10-22 09:50:36+00:00 story \n", - "12 2024-08-01 16:56:49+00:00 comment \n", - "13 2020-01-22 18:26:36+00:00 comment \n", - "14 2022-02-26 19:34:00+00:00 comment \n", - "15 2021-10-04 23:10:50+00:00 comment \n", - "16 2013-07-13 09:48:34+00:00 comment \n", - "17 2020-06-17 14:26:43+00:00 comment \n", - "18 2025-02-07 06:42:53+00:00 comment \n", - "19 2022-11-06 17:15:10+00:00 comment \n", - "20 2023-04-12 14:31:42+00:00 comment \n", - "21 2015-12-29 22:01:16+00:00 comment \n", - "22 2021-11-11 23:42:12+00:00 comment \n", - "23 2018-02-12 16:03:37+00:00 comment \n", - "24 2021-11-09 01:36:01+00:00 comment \n", - "...\n", - "\n", - "[3000 rows x 6 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hacker_news = bpd.read_gbq(\"bigquery-public-data.hacker_news.full\")[['title', 'text', 'by', 'score', 'timestamp', 'type']].head(3000)\n", - "hacker_news" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 880 - }, - "id": "9dzU8SNziouZ", - "outputId": "da8815c1-c411-4afc-d1ca-5e44c75b5b48" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
[112 rows x 6 columns in total]" - ], - "text/plain": [ - " title \\\n", - "15 \n", - "16 \n", - "23 \n", - "27 \n", - "36 \n", - "150 \n", - "160 \n", - "205 \n", - "231 \n", - "250 \n", - "320 Protest against Bill C-11, Canada's SOPA, plan... \n", - "344 \n", - "348 The flu vaccine this year is only 10% effective \n", - "360 \n", - "398 1 + 1 = 3 \n", - "407 \n", - "454 \n", - "457 Brazilian Rails Websites \n", - "472 \n", - "493 \n", - "497 \n", - "514 \n", - "535 \n", - "607 \n", - "672 \n", - "\n", - " text by \\\n", - "15 > Just do what most American cities do with... AnthonyMouse \n", - "16 It's not a space. The l and the C are at ... antninja \n", - "23 I wish this would happen. There's a &quo... coredog64 \n", - "27 Flash got close, but was too complex and expen... surfingdino \n", - "36 I think the "algo genius" type of de... poisonborz \n", - "150 No one will be doing anything practical with a... NeutralCrane \n", - "160 I think this is more semantics than anything.<... superb-owl \n", - "205 Interesting to think of sign language localisa... robin_reala \n", - "231 Probably because of their key location. ape4 \n", - "250 I realize this is a bit passe, but there were ... FeepingCreature \n", - "320 magikarp \n", - "344 What? Are you suggesting we cannot criticize p... chickenpotpie \n", - "348 maryfoxmarlow \n", - "360 Bomb ownership is okay AFAIK. Intent to commi... Ferret7446 \n", - "398 oscar-the-horse \n", - "407 No (almost certainly), but you will become fru... AnimalMuppet \n", - "454 48h is less than 5 kWh of batteries, one quart... tigershark \n", - "457 akitaonrails \n", - "472 > When most people start as programmers, th... PavlovsCat \n", - "493 Related anecdata + a study I found useful. Aft... TrainedMonkey \n", - "497 That "civilized" country has too man... rantanplan \n", - "514 The current Go 2 drafts do. tapirl \n", - "535 Having walked this same path, this blog resona... curiousllama \n", - "607 If people thought the reward for talking to a ... slapfrog \n", - "672 Given that you say you're 38 and looking ... 
strix_varius \n", - "\n", - " score timestamp type \n", - "15 2021-10-04 23:10:50+00:00 comment \n", - "16 2013-07-13 09:48:34+00:00 comment \n", - "23 2018-02-12 16:03:37+00:00 comment \n", - "27 2024-05-08 05:02:37+00:00 comment \n", - "36 2024-06-04 07:39:08+00:00 comment \n", - "150 2025-02-01 14:26:25+00:00 comment \n", - "160 2022-06-08 16:55:54+00:00 comment \n", - "205 2019-02-01 11:49:23+00:00 comment \n", - "231 2014-08-29 14:55:40+00:00 comment \n", - "250 2023-10-15 11:32:44+00:00 comment \n", - "320 1 2012-01-29 02:14:12+00:00 story \n", - "344 2020-12-02 18:24:19+00:00 comment \n", - "348 3 2018-02-02 02:19:42+00:00 story \n", - "360 2023-06-25 20:04:30+00:00 comment \n", - "398 2 2012-08-05 22:18:51+00:00 story \n", - "407 2023-09-15 16:11:08+00:00 comment \n", - "454 2021-07-23 05:12:52+00:00 comment \n", - "457 1 2008-07-27 17:27:47+00:00 story \n", - "472 2018-12-23 20:37:20+00:00 comment \n", - "493 2023-02-02 16:14:23+00:00 comment \n", - "497 2017-02-17 12:51:51+00:00 comment \n", - "514 2020-08-12 02:37:41+00:00 comment \n", - "535 2020-10-07 20:35:18+00:00 comment \n", - "607 2021-09-08 20:58:13+00:00 comment \n", - "672 2023-08-04 02:41:50+00:00 comment \n", - "...\n", - "\n", - "[112 rows x 6 columns]" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hacker_news.ai.filter(\"{by} contains animal name\", model=gemini_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3bpkaspoiouZ" - }, - "source": [ - "Here are the runtime numbers with 500 requests per minute [raised quota](https://cloud.google.com/vertex-ai/generative-ai/docs/quotas):\n", - "* 3000 rows -> ~6m\n", - "* 10000 rows -> ~26m" + "For `ai.forecast`, see https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/generative_ai/bq_dataframes_ai_forecast.ipynb" ] } ], diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb index fc46a43e7b..c32ac9042b 100644 --- a/notebooks/experimental/semantic_operators.ipynb +++ b/notebooks/experimental/semantic_operators.ipynb @@ -27,9 +27,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Semantic Operators have been deprecated since version 1.42.0. Please use AI Operators instead.\n", + "Semantic Operators have been deprecated since version 1.42.0. Please use AI functions instead.\n", "\n", - "The tutorial notebook for AI operators is located [here](https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/experimental/ai_operators.ipynb)." 
+ "The tutorial notebook for AI functions is located at https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/generative_ai/ai_functions.ipynb" ] } ], diff --git a/notebooks/generative_ai/ai_functions.ipynb b/notebooks/generative_ai/ai_functions.ipynb new file mode 100644 index 0000000000..9362e93b59 --- /dev/null +++ b/notebooks/generative_ai/ai_functions.ipynb @@ -0,0 +1,555 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "acd53f9d", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "id": "e75ce682", + "metadata": {}, + "source": [ + "# BigQuery DataFrames (BigFrames) AI Functions\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "aee05821", + "metadata": {}, + "source": [ + "This notebook provides a brief introduction to how to use BigFrames AI functions" + ] + }, + { + "cell_type": "markdown", + "id": "1232f400", + "metadata": {}, + "source": [ + "## Preparation\n", + "\n", + "First, set up your BigFrames environment:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9f924aa", + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd \n", + "\n", + "PROJECT_ID = \"\" # Your project ID here\n", + "\n", + "bpd.options.bigquery.project = PROJECT_ID\n", + "bpd.options.bigquery.ordering_mode = \"partial\"\n", + "bpd.options.display.progress_bar = None" + ] + }, + { + "cell_type": "markdown", + "id": "e2188773", + "metadata": {}, + "source": [ + "## ai.generate\n", + "\n", + "The `ai.generate` function lets you analyze any combination of text and unstructured data from BigQuery. You can mix BigFrames or Pandas series with string literals as your prompt in the form of a tuple. You are also allowed to provide only a series. Here is an example:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "471a47fe", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/plain": [ + "0 {'result': 'Salad\\n', 'full_response': '{\"cand...\n", + "1 {'result': 'Sausageroll\\n', 'full_response': '...\n", + "dtype: struct>, status: string>[pyarrow]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import bigframes.bigquery as bbq\n", + "\n", + "ingredients1 = bpd.Series([\"Lettuce\", \"Sausage\"])\n", + "ingredients2 = bpd.Series([\"Cucumber\", \"Long Bread\"])\n", + "\n", + "prompt = (\"What's the food made from \", ingredients1, \" and \", ingredients2, \" One word only\")\n", + "bbq.ai.generate(prompt)" + ] + }, + { + "cell_type": "markdown", + "id": "03953835", + "metadata": {}, + "source": [ + "The function returns a series of structs. The `'result'` field holds the answer, while more metadata can be found in the `'full_response'` field. The `'status'` field tells you whether LLM made a successful response for that specific row. " + ] + }, + { + "cell_type": "markdown", + "id": "b606c51f", + "metadata": {}, + "source": [ + "You can also include additional model parameters into your function call, as long as they satisfy the structure of `generateContent` [request body format](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.endpoints/generateContent#request-body). In the next example, you use `maxOutputTokens` to limite the length of the generated content." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4a3229a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Lettuce\n", + "1 The food\n", + "Name: result, dtype: string" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_params = {\n", + " \"generationConfig\": {\"maxOutputTokens\": 2}\n", + "}\n", + "\n", + "ingredients1 = bpd.Series([\"Lettuce\", \"Sausage\"])\n", + "ingredients2 = bpd.Series([\"Cucumber\", \"Long Bread\"])\n", + "\n", + "prompt = (\"What's the food made from \", ingredients1, \" and \", ingredients2)\n", + "bbq.ai.generate(prompt, model_params=model_params).struct.field(\"result\")" + ] + }, + { + "cell_type": "markdown", + "id": "3acba92d", + "metadata": {}, + "source": [ + "The answers are cut short as expected.\n", + "\n", + "In addition to `ai.generate`, you can use `ai.generate_bool`, `ai.generate_int`, and `ai.generate_double` for other output types." + ] + }, + { + "cell_type": "markdown", + "id": "0bf9f1de", + "metadata": {}, + "source": [ + "## ai.if_\n", + "\n", + "`ai.if_` generates a series of booleans, unlike `ai.generate_bool`, which gives you a series of structs. It's a handy tool for filtering your data, not only because it directly returns booleans, but also because it enables more optimization during data processing. Here is an example of using `ai.if_`:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "718c6622", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
creaturecategory
0Catmammal
1Salmonfish
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + "creature category\n", + " Cat mammal\n", + " Salmon fish\n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "creatures = bpd.DataFrame({\"creature\": [\"Cat\", \"Salmon\"]})\n", + "categories = bpd.DataFrame({\"category\": [\"mammal\", \"fish\"]})\n", + "\n", + "joined_df = creatures.merge(categories, how=\"cross\")\n", + "condition = bbq.ai.if_((joined_df[\"creature\"], \" is a \", joined_df[\"category\"]))\n", + "\n", + "# Filter our dataframe\n", + "joined_df = joined_df[condition]\n", + "joined_df" + ] + }, + { + "cell_type": "markdown", + "id": "bb0999df", + "metadata": {}, + "source": [ + "## ai.score" + ] + }, + { + "cell_type": "markdown", + "id": "63b5a59f", + "metadata": {}, + "source": [ + "`ai.score` ranks your input based on the prompt. You can then sort your data based on their ranks. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6875fe36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
animalsrelative_weight
1spider1.0
0tiger8.0
2blue whale10.0
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " animals relative_weight\n", + "1 spider 1.0\n", + "0 tiger 8.0\n", + "2 blue whale 10.0\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({'animals': ['tiger', 'spider', 'blue whale']})\n", + "\n", + "df['relative_weight'] = bbq.ai.score((\"Rank the relative weight of \", df['animals'], \" on the scale from 1 to 10\"))\n", + "df.sort_values(by='relative_weight')" + ] + }, + { + "cell_type": "markdown", + "id": "1ed0dff1", + "metadata": {}, + "source": [ + "## ai.classify" + ] + }, + { + "cell_type": "markdown", + "id": "c56b91cf", + "metadata": {}, + "source": [ + "`ai.classify` categorizes your inputs into the specified categories." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8cfb844b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
animalcategory
0tigermammal
1spiderarthropod
2blue whalemammal
3salmonfish
\n", + "

4 rows × 2 columns

\n", + "
[4 rows x 2 columns in total]" + ], + "text/plain": [ + " animal category\n", + "0 tiger mammal\n", + "1 spider arthropod\n", + "2 blue whale mammal\n", + "3 salmon fish\n", + "\n", + "[4 rows x 2 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({'animal': ['tiger', 'spider', 'blue whale', 'salmon']})\n", + "\n", + "df['category'] = bbq.ai.classify(df['animal'], categories=['mammal', 'fish', 'arthropod'])\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "9e4037bc", + "metadata": {}, + "source": [ + "Note that this function can only return the values that are present in your provided categories. If your categories do not cover all cases, you will get wrong answers:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2e66110a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
animalcategory
0tigermammal
1spidermammal
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " animal category\n", + "0 tiger mammal\n", + "1 spider mammal\n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({'animal': ['tiger', 'spider']})\n", + "\n", + "df['category'] = bbq.ai.classify(df['animal'], categories=['mammal', 'fish']) # Spider belongs to neither category\n", + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv (3.10.17)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 760000122dc190ac8a3303234cf4cbee1bbb9493 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 8 Oct 2025 13:42:50 -0700 Subject: [PATCH 11/22] feat: support string literal inputs for AI functions (#2152) * feat: support string literal inputs for AI functions * polish code * update pydoc --- bigframes/bigquery/_operations/ai.py | 20 ++++++++++++-------- tests/system/small/bigquery/test_ai.py | 23 +++++++++++++++++++++++ 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 0c5eba9496..5c001d4caf 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -28,6 +28,7 @@ from bigframes.operations import ai_ops, output_schemas PROMPT_TYPE = Union[ + str, series.Series, pd.Series, List[Union[str, series.Series, pd.Series]], @@ -73,7 +74,7 @@ def generate( dtype: struct>, status: string>[pyarrow] Args: - prompt (Series | List[str|Series] | Tuple[str|Series, ...]): + prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series or pandas Series. connection_id (str, optional): @@ -165,7 +166,7 @@ def generate_bool( Name: result, dtype: boolean Args: - prompt (Series | List[str|Series] | Tuple[str|Series, ...]): + prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series or pandas Series. connection_id (str, optional): @@ -240,7 +241,7 @@ def generate_int( Name: result, dtype: Int64 Args: - prompt (Series | List[str|Series] | Tuple[str|Series, ...]): + prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series or pandas Series. connection_id (str, optional): @@ -315,7 +316,7 @@ def generate_double( Name: result, dtype: Float64 Args: - prompt (Series | List[str|Series] | Tuple[str|Series, ...]): + prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series or pandas Series. connection_id (str, optional): @@ -386,7 +387,7 @@ def if_( dtype: string Args: - prompt (Series | List[str|Series] | Tuple[str|Series, ...]): + prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. 
The Series can be BigFrames Series or pandas Series. connection_id (str, optional): @@ -433,7 +434,7 @@ def classify( [2 rows x 2 columns] Args: - input (Series | List[str|Series] | Tuple[str|Series, ...]): + input (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the input to send to the model. The Series can be BigFrames Series or pandas Series. categories (tuple[str, ...] | list[str]): @@ -482,7 +483,7 @@ def score( dtype: Float64 Args: - prompt (Series | List[str|Series] | Tuple[str|Series, ...]): + prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series or pandas Series. connection_id (str, optional): @@ -514,9 +515,12 @@ def _separate_context_and_series( Input: ("str1", series1, "str2", "str3", series2) Output: ["str1", None, "str2", "str3", None], [series1, series2] """ - if not isinstance(prompt, (list, tuple, series.Series)): + if not isinstance(prompt, (str, list, tuple, series.Series)): raise ValueError(f"Unsupported prompt type: {type(prompt)}") + if isinstance(prompt, str): + return [None], [series.Series([prompt])] + if isinstance(prompt, series.Series): if prompt.dtype == dtypes.OBJ_REF_DTYPE: # Multi-model support diff --git a/tests/system/small/bigquery/test_ai.py b/tests/system/small/bigquery/test_ai.py index 2ccdb01944..203de616ee 100644 --- a/tests/system/small/bigquery/test_ai.py +++ b/tests/system/small/bigquery/test_ai.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + from packaging import version import pandas as pd import pyarrow as pa @@ -42,6 +44,27 @@ def test_ai_function_pandas_input(session): ) +def test_ai_function_string_input(session): + with mock.patch( + "bigframes.core.global_session.get_global_session" + ) as mock_get_session: + mock_get_session.return_value = session + prompt = "Is apple a fruit?" + + result = bbq.ai.generate_bool(prompt, endpoint="gemini-2.5-flash") + + assert _contains_no_nulls(result) + assert result.dtype == pd.ArrowDtype( + pa.struct( + ( + pa.field("result", pa.bool_()), + pa.field("full_response", dtypes.JSON_ARROW_TYPE), + pa.field("status", pa.string()), + ) + ) + ) + + def test_ai_function_compile_model_params(session): if version.Version(sqlglot.__version__) < version.Version("25.18.0"): pytest.skip( From a410d0ae43ef3b053b650804156eda0b1f569da9 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 9 Oct 2025 10:36:18 -0700 Subject: [PATCH 12/22] feat: Replace ML.GENERATE_TEXT with AI.GENERATE for audio transcription (#2151) * change to ai.generate * convert the input data type * remove default value setting --- bigframes/operations/blob.py | 28 +- .../multimodal/multimodal_dataframe.ipynb | 949 +++++++++++++++++- 2 files changed, 925 insertions(+), 52 deletions(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 63875ded99..7f419bc5d8 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -804,7 +804,6 @@ def audio_transcribe( raise ValueError("Must specify the engine, supported value is 'bigquery'.") import bigframes.bigquery as bbq - import bigframes.ml.llm as llm import bigframes.pandas as bpd # col name doesn't matter here. Rename to avoid column name conflicts @@ -812,27 +811,22 @@ def audio_transcribe( prompt_text = "**Task:** Transcribe the provided audio. 
**Instructions:** - Your response must contain only the verbatim transcription of the audio. - Do not include any introductory text, summaries, or conversational filler in your response. The output should begin directly with the first word of the audio." - llm_model = llm.GeminiTextGenerator( - model_name=model_name, - session=self._block.session, - connection_name=connection, - ) + # Convert the audio series to the runtime representation required by the model. + audio_runtime = audio_series.blob._get_runtime("R", with_metadata=True) - # transcribe audio using ML.GENERATE_TEXT - transcribed_results = llm_model.predict( - X=audio_series, - prompt=[prompt_text, audio_series], - temperature=0.0, + transcribed_results = bbq.ai.generate( + prompt=(prompt_text, audio_runtime), + connection_id=connection, + endpoint=model_name, + model_params={"generationConfig": {"temperature": 0.0}}, ) - transcribed_content_series = cast( - bpd.Series, transcribed_results["ml_generate_text_llm_result"] - ).rename("transcribed_content") + transcribed_content_series = transcribed_results.struct.field("result").rename( + "transcribed_content" + ) if verbose: - transcribed_status_series = cast( - bpd.Series, transcribed_results["ml_generate_text_status"] - ) + transcribed_status_series = transcribed_results.struct.field("status") results_df = bpd.DataFrame( { "status": transcribed_status_series, diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index f6f80b0009..c04463fc4c 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -131,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -139,7 +139,20 @@ "id": "fx6YcZJbeYru", "outputId": "d707954a-0dd0-4c50-b7bf-36b140cf76cf" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + } + ], "source": [ "# Create blob columns from wildcard path.\n", "df_image = bpd.from_glob_path(\n", @@ -155,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -164,7 +177,83 @@ "id": "HhCb8jRsLe9B", "outputId": "03081cf9-3a22-42c9-b38f-649f592fdada" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image
0
1
2
3
4
\n", + "

5 rows × 1 columns

\n", + "
[5 rows x 1 columns in total]" + ], + "text/plain": [ + " image\n", + "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", + "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", + "2 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", + "3 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", + "4 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n", + "\n", + "[5 rows x 1 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Take only the 5 images to deal with. Preview the content of the Mutimodal DataFrame\n", "df_image = df_image.head(5)\n", @@ -191,11 +280,143 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "YYYVn7NDH0Me" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "version. Use `json_query` instead.\n", + " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "version. Use `json_query` instead.\n", + " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "version. 
Use `json_query` instead.\n", + " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imageauthorcontent_typesizeupdated
0aliceimage/png15912402025-03-20 17:45:04+00:00
1bobimage/png11829512025-03-20 17:45:02+00:00
2bobimage/png15208842025-03-20 17:44:55+00:00
3aliceimage/png12354012025-03-20 17:45:19+00:00
4bobimage/png15919232025-03-20 17:44:47+00:00
\n", + "

5 rows × 5 columns

\n", + "
[5 rows x 5 columns in total]" + ], + "text/plain": [ + " image author content_type \\\n", + "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", + "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "2 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "3 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", + "4 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "\n", + " size updated \n", + "0 1591240 2025-03-20 17:45:04+00:00 \n", + "1 1182951 2025-03-20 17:45:02+00:00 \n", + "2 1520884 2025-03-20 17:44:55+00:00 \n", + "3 1235401 2025-03-20 17:45:19+00:00 \n", + "4 1591923 2025-03-20 17:44:47+00:00 \n", + "\n", + "[5 rows x 5 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Combine unstructured data with structured data\n", "df_image[\"author\"] = [\"alice\", \"bob\", \"bob\", \"alice\", \"bob\"] # type: ignore\n", @@ -216,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -225,7 +446,53 @@ "id": "UGuAk9PNDRF3", "outputId": "73feb33d-4a05-48fb-96e5-3c48c2a456f3" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "version. 
Use `json_query` instead.\n", + " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# filter images and display, you can also display audio and video types\n", "df_image[df_image[\"author\"] == \"alice\"][\"image\"].blob.display()" @@ -243,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -251,7 +518,32 @@ "id": "VWsl5BBPJ6N7", "outputId": "45d2356e-322b-4982-cfa7-42d034dc4344" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(*args, **kwargs)\n" + ] + } + ], "source": [ "df_image[\"blurred\"] = df_image[\"image\"].blob.image_blur(\n", " (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\", engine=\"opencv\"\n", @@ -270,7 +562,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -278,7 +570,20 @@ "id": "rWCAGC8w64vU", "outputId": "d7d456f0-8b56-492c-fe1b-967e9664d813" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(*args, **kwargs)\n" + ] + } + ], "source": [ "# You can also chain functions together\n", "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")" @@ -286,7 +591,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -295,7 +600,182 @@ "id": "6NGK6GYSU44B", "outputId": "859101c1-2ee4-4f9a-e250-e8947127420a" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imageauthorcontent_typesizeupdatedblurredresizednormalizedblur_resized
0aliceimage/png15912402025-03-20 17:45:04+00:00
1bobimage/png11829512025-03-20 17:45:02+00:00
2bobimage/png15208842025-03-20 17:44:55+00:00
3aliceimage/png12354012025-03-20 17:45:19+00:00
4bobimage/png15919232025-03-20 17:44:47+00:00
\n", + "

5 rows × 9 columns

\n", + "
[5 rows x 9 columns in total]" + ], + "text/plain": [ + " image author content_type \\\n", + "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", + "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "2 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "3 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n", + "4 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n", + "\n", + " size updated \\\n", + "0 1591240 2025-03-20 17:45:04+00:00 \n", + "1 1182951 2025-03-20 17:45:02+00:00 \n", + "2 1520884 2025-03-20 17:44:55+00:00 \n", + "3 1235401 2025-03-20 17:45:19+00:00 \n", + "4 1591923 2025-03-20 17:44:47+00:00 \n", + "\n", + " blurred \\\n", + "0 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", + "1 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", + "2 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", + "3 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", + "4 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n", + "\n", + " resized \\\n", + "0 {'uri': 'gs://bigframes_blob_test/image_resize... \n", + "1 {'uri': 'gs://bigframes_blob_test/image_resize... \n", + "2 {'uri': 'gs://bigframes_blob_test/image_resize... \n", + "3 {'uri': 'gs://bigframes_blob_test/image_resize... \n", + "4 {'uri': 'gs://bigframes_blob_test/image_resize... \n", + "\n", + " normalized \\\n", + "0 {'uri': 'gs://bigframes_blob_test/image_normal... \n", + "1 {'uri': 'gs://bigframes_blob_test/image_normal... \n", + "2 {'uri': 'gs://bigframes_blob_test/image_normal... \n", + "3 {'uri': 'gs://bigframes_blob_test/image_normal... \n", + "4 {'uri': 'gs://bigframes_blob_test/image_normal... \n", + "\n", + " blur_resized \n", + "0 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "1 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "2 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "3 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "4 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "\n", + "[5 rows x 9 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_image" ] @@ -311,11 +791,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "id": "mRUGfcaFVW-3" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "default model will be removed in BigFrames 3.0. 
Please supply an\n", + "explicit model to avoid this message.\n", + " return method(*args, **kwargs)\n" + ] + } + ], "source": [ "from bigframes.ml import llm\n", "gemini = llm.GeminiTextGenerator()" @@ -323,7 +814,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -332,7 +823,87 @@ "id": "DNFP7CbjWdR9", "outputId": "3f90a062-0abc-4bce-f53c-db57b06a14b9" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ml_generate_text_llm_resultimage
0The item is a tin of K9Guard Dog Paw Balm.
1The item is a bottle of K9 Guard Dog Hot Spot Spray.
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " ml_generate_text_llm_result \\\n", + "0 The item is a tin of K9Guard Dog Paw Balm. \n", + "1 The item is a bottle of K9 Guard Dog Hot Spot ... \n", + "\n", + " image \n", + "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", + "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Ask the same question on the images\n", "df_image = df_image.head(2)\n", @@ -342,11 +913,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "id": "IG3J3HsKhyBY" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + } + ], "source": [ "# Ask different questions\n", "df_image[\"question\"] = [\"what item is it?\", \"what color is the picture?\"]" @@ -354,7 +936,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -363,7 +945,87 @@ "id": "qKOb765IiVuD", "outputId": "731bafad-ea29-463f-c8c1-cb7acfd70e5d" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ml_generate_text_llm_resultimage
0The item is dog paw balm.
1The picture features a white bottle with a light blue spray nozzle and accents. The background is a neutral gray.\\n
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " ml_generate_text_llm_result \\\n", + "0 The item is dog paw balm. \n", + "1 The picture features a white bottle with a lig... \n", + "\n", + " image \n", + "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", + "1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "answer_alt = gemini.predict(df_image, prompt=[df_image[\"question\"], df_image[\"image\"]])\n", "answer_alt[[\"ml_generate_text_llm_result\", \"image\"]]" @@ -371,7 +1033,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -380,7 +1042,104 @@ "id": "KATVv2CO5RT1", "outputId": "6ec01f27-70b6-4f69-c545-e5e3c879480c" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "default model will be removed in BigFrames 3.0. Please supply an\n", + "explicit model to avoid this message.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ml_generate_embedding_resultml_generate_embedding_statusml_generate_embedding_start_secml_generate_embedding_end_seccontent
0[ 0.00638846 0.01666372 0.00451786 ... -0.02...<NA><NA>{\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2...
1[ 0.0097399 0.0214815 0.00244266 ... 0.00...<NA><NA>{\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2...
\n", + "

2 rows × 5 columns

\n", + "
[2 rows x 5 columns in total]" + ], + "text/plain": [ + " ml_generate_embedding_result \\\n", + "0 [ 0.00638846 0.01666372 0.00451786 ... -0.02... \n", + "1 [ 0.0097399 0.0214815 0.00244266 ... 0.00... \n", + "\n", + " ml_generate_embedding_status ml_generate_embedding_start_sec \\\n", + "0 \n", + "1 \n", + "\n", + " ml_generate_embedding_end_sec \\\n", + "0 \n", + "1 \n", + "\n", + " content \n", + "0 {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2... \n", + "1 {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2... \n", + "\n", + "[2 rows x 5 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Generate embeddings.\n", "embed_model = llm.MultimodalEmbeddingGenerator()\n", @@ -399,18 +1158,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "id": "oDDuYtUm5Yiy" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + } + ], "source": [ "df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -418,22 +1188,130 @@ "id": "7jLpMYaj7nj8", "outputId": "06d5456f-580f-4693-adff-2605104b056c" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:244: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", + "future version. 
Use `json_value_array` instead.\n", + " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" + ] + } + ], "source": [ "df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": { "id": "kaPvJATN7zlw" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "0 CritterCuisine Pro 5000 - Automatic Pet Feeder...\n", + "0 on a level, stable surface to prevent tipping....\n", + "0 included)\\nto maintain the schedule during pow...\n", + "0 digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n", + "0 paperclip) for 5\\nseconds. This will reset all...\n", + "0 unit with a damp cloth. Do not immerse the bas...\n", + "0 continues,\\ncontact customer support.\\nE2: Foo...\n", + "Name: chunked, dtype: string" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "chunked = df_pdf[\"chunked\"].explode()\n", "chunked" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6. Audio transcribe function" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + } + ], + "source": [ + "audio_gcs_path = \"gs://bigframes_blob_test/audio/*\"\n", + "df = bpd.from_glob_path(audio_gcs_path, name=\"audio\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "0 Now, as all books, not primarily intended as p...\n", + "Name: transcribed_content, dtype: string" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + 
], + "source": [ + "transcribed_series = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=False)\n", + "transcribed_series" + ] } ], "metadata": { @@ -441,7 +1319,8 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "venv", + "language": "python", "name": "python3" }, "language_info": { @@ -454,7 +1333,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.18" } }, "nbformat": 4, From 615a620dd512839df9ee72dfe623ecc37e198e8f Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 9 Oct 2025 11:20:28 -0700 Subject: [PATCH 13/22] refactor: support agg_ops.LastOp, LastNonNullOp, FirstOp, FirstNonNullOp in the sqlglot compiler (#2153) --- .../sqlglot/aggregations/nullary_compiler.py | 2 +- .../sqlglot/aggregations/unary_compiler.py | 50 ++++++++++++++- .../compile/sqlglot/aggregations/windows.py | 7 ++- .../test_unary_compiler/test_first/out.sql | 20 ++++++ .../test_first_non_null/out.sql | 16 +++++ .../test_unary_compiler/test_last/out.sql | 20 ++++++ .../test_last_non_null/out.sql | 16 +++++ .../aggregations/test_unary_compiler.py | 61 +++++++++++++++++++ 8 files changed, 186 insertions(+), 6 deletions(-) create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_first/out.sql create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_first_non_null/out.sql create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_last/out.sql create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_last_non_null/out.sql diff --git a/bigframes/core/compile/sqlglot/aggregations/nullary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/nullary_compiler.py index c6418591ba..95dad4ff3b 100644 --- a/bigframes/core/compile/sqlglot/aggregations/nullary_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregations/nullary_compiler.py @@ -50,4 +50,4 @@ def _( if window is None: # ROW_NUMBER always needs an OVER clause. return sge.Window(this=result) - return apply_window_if_present(result, window) + return apply_window_if_present(result, window, include_framing_clauses=False) diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py index 1e87fd1fc5..16bd3ef099 100644 --- a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py @@ -104,7 +104,51 @@ def _( column: typed_expr.TypedExpr, window: typing.Optional[window_spec.WindowSpec] = None, ) -> sge.Expression: - return apply_window_if_present(sge.func("DENSE_RANK"), window) + return apply_window_if_present( + sge.func("DENSE_RANK"), window, include_framing_clauses=False + ) + + +@UNARY_OP_REGISTRATION.register(agg_ops.FirstOp) +def _( + op: agg_ops.FirstOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + # FIRST_VALUE in BQ respects nulls by default. 
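+    # Note: only a bare FIRST_VALUE is emitted here; the null-safe CASE wrapper
+    # seen in the test_first snapshot is applied by a later step in the
+    # compilation pipeline rather than by this compiler function.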
+ return apply_window_if_present(sge.FirstValue(this=column.expr), window) + + +@UNARY_OP_REGISTRATION.register(agg_ops.FirstNonNullOp) +def _( + op: agg_ops.FirstNonNullOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + return apply_window_if_present( + sge.IgnoreNulls(this=sge.FirstValue(this=column.expr)), window + ) + + +@UNARY_OP_REGISTRATION.register(agg_ops.LastOp) +def _( + op: agg_ops.LastOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + # LAST_VALUE in BQ respects nulls by default. + return apply_window_if_present(sge.LastValue(this=column.expr), window) + + +@UNARY_OP_REGISTRATION.register(agg_ops.LastNonNullOp) +def _( + op: agg_ops.LastNonNullOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + return apply_window_if_present( + sge.IgnoreNulls(this=sge.LastValue(this=column.expr)), window + ) @UNARY_OP_REGISTRATION.register(agg_ops.MaxOp) @@ -182,7 +226,9 @@ def _( column: typed_expr.TypedExpr, window: typing.Optional[window_spec.WindowSpec] = None, ) -> sge.Expression: - return apply_window_if_present(sge.func("RANK"), window) + return apply_window_if_present( + sge.func("RANK"), window, include_framing_clauses=False + ) @UNARY_OP_REGISTRATION.register(agg_ops.SizeUnaryOp) diff --git a/bigframes/core/compile/sqlglot/aggregations/windows.py b/bigframes/core/compile/sqlglot/aggregations/windows.py index 5e38bf120e..41b4c674f9 100644 --- a/bigframes/core/compile/sqlglot/aggregations/windows.py +++ b/bigframes/core/compile/sqlglot/aggregations/windows.py @@ -25,6 +25,7 @@ def apply_window_if_present( value: sge.Expression, window: typing.Optional[window_spec.WindowSpec] = None, + include_framing_clauses: bool = True, ) -> sge.Expression: if window is None: return value @@ -64,11 +65,11 @@ def apply_window_if_present( if not window.bounds and not order: return sge.Window(this=value, partition_by=group_by) - if not window.bounds: + if not window.bounds and not include_framing_clauses: return sge.Window(this=value, partition_by=group_by, order=order) kind = ( - "ROWS" if isinstance(window.bounds, window_spec.RowsWindowBounds) else "RANGE" + "RANGE" if isinstance(window.bounds, window_spec.RangeWindowBounds) else "ROWS" ) start: typing.Union[int, float, None] = None @@ -125,7 +126,7 @@ def get_window_order_by( nulls_first=nulls_first, ) ) - elif not nulls_first and not desc: + elif (not nulls_first) and (not desc): order_by.append( sge.Ordered( this=is_null_expr, diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_first/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_first/out.sql new file mode 100644 index 0000000000..6c7d39c24a --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_first/out.sql @@ -0,0 +1,20 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE + WHEN `bfcol_0` IS NULL + THEN NULL + ELSE FIRST_VALUE(`bfcol_0`) OVER ( + ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + END AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `agg_int64` +FROM `bfcte_1` \ No newline at end of file diff --git 
a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_first_non_null/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_first_non_null/out.sql new file mode 100644 index 0000000000..ff90c6fcd9 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_first_non_null/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + FIRST_VALUE(`bfcol_0` IGNORE NULLS) OVER ( + ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `agg_int64` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_last/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_last/out.sql new file mode 100644 index 0000000000..788c5ba466 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_last/out.sql @@ -0,0 +1,20 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CASE + WHEN `bfcol_0` IS NULL + THEN NULL + ELSE LAST_VALUE(`bfcol_0`) OVER ( + ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + END AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `agg_int64` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_last_non_null/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_last_non_null/out.sql new file mode 100644 index 0000000000..17e7dbd446 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_last_non_null/out.sql @@ -0,0 +1,16 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + LAST_VALUE(`bfcol_0` IGNORE NULLS) OVER ( + ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `agg_int64` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py index ea7faca7fb..ea15f155ad 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
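+# `sys` is imported to gate the snapshot tests below on the Python version:
+# the generated SQL formatting differs on Python < 3.12 (see the pytest.skip
+# messages in this file).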
+import sys import typing import pytest @@ -126,6 +127,66 @@ def test_dense_rank(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_first(scalar_types_df: bpd.DataFrame, snapshot): + if sys.version_info < (3, 12): + pytest.skip( + "Skipping test due to inconsistent SQL formatting on Python < 3.12.", + ) + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + agg_expr = agg_exprs.UnaryAggregation(agg_ops.FirstOp(), expression.deref(col_name)) + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) + sql = _apply_unary_window_op(bf_df, agg_expr, window, "agg_int64") + + snapshot.assert_match(sql, "out.sql") + + +def test_first_non_null(scalar_types_df: bpd.DataFrame, snapshot): + if sys.version_info < (3, 12): + pytest.skip( + "Skipping test due to inconsistent SQL formatting on Python < 3.12.", + ) + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + agg_expr = agg_exprs.UnaryAggregation( + agg_ops.FirstNonNullOp(), expression.deref(col_name) + ) + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) + sql = _apply_unary_window_op(bf_df, agg_expr, window, "agg_int64") + + snapshot.assert_match(sql, "out.sql") + + +def test_last(scalar_types_df: bpd.DataFrame, snapshot): + if sys.version_info < (3, 12): + pytest.skip( + "Skipping test due to inconsistent SQL formatting on Python < 3.12.", + ) + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + agg_expr = agg_exprs.UnaryAggregation(agg_ops.LastOp(), expression.deref(col_name)) + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) + sql = _apply_unary_window_op(bf_df, agg_expr, window, "agg_int64") + + snapshot.assert_match(sql, "out.sql") + + +def test_last_non_null(scalar_types_df: bpd.DataFrame, snapshot): + if sys.version_info < (3, 12): + pytest.skip( + "Skipping test due to inconsistent SQL formatting on Python < 3.12.", + ) + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + agg_expr = agg_exprs.UnaryAggregation( + agg_ops.LastNonNullOp(), expression.deref(col_name) + ) + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) + sql = _apply_unary_window_op(bf_df, agg_expr, window, "agg_int64") + + snapshot.assert_match(sql, "out.sql") + + def test_max(scalar_types_df: bpd.DataFrame, snapshot): col_name = "int64_col" bf_df = scalar_types_df[[col_name]] From 5e1e8098ecf212c91d73fa80d722d1cb3e46668b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 9 Oct 2025 13:52:28 -0500 Subject: [PATCH 14/22] feat: create session-scoped `cut`, `DataFrame`, `MultiIndex`, `Index`, `Series`, `to_datetime`, and `to_timedelta` methods (#2157) * docs: remove import bigframes.pandas as bpd boilerplate from many samples Also, fixes several constructors that didn't take a session for compatibility with multi-session applications. 
* fix docs * fix unit tests * skip sklearn test * fix snapshot * plumb through session for from_tuples and from_arrays * add from_frame * make sure polars session isnt skipped on Kokoro * fix apply doctest * make doctest conftest available everywhere * add python version flexibility for to_dict * disambiguate explicit names * disambiguate explicit name none versus no name * fix for column name comparison in pandas bin op * avoid setting column labels in special case of Series(block) * revert doctest changes * revert doctest changes * revert df docstrings * add polars series unit tests * restore a test * Revert "restore a test" This reverts commit 765b678b34a7976aef1017d2a1fdb34d7a4cfbe4. * skip null * skip unsupported tests * revert more docs changes * revert more docs * revert more docs * fix unit tests python 3.13 * add test to reproduce name error * add tests for session scoped methods * fix mypy errors --- bigframes/core/indexes/base.py | 11 +- bigframes/core/indexes/multi.py | 48 ++++++- bigframes/core/log_adapter.py | 4 +- bigframes/core/reshape/tile.py | 7 +- bigframes/core/tools/datetimes.py | 10 +- bigframes/formatting_helpers.py | 10 +- bigframes/pandas/__init__.py | 17 +-- bigframes/pandas/core/tools/timedeltas.py | 2 +- bigframes/session/__init__.py | 124 +++++++++++++++-- tests/system/small/test_session_as_bpd.py | 154 ++++++++++++++++++++++ tests/unit/test_pandas.py | 26 ++-- 11 files changed, 375 insertions(+), 38 deletions(-) create mode 100644 tests/system/small/test_session_as_bpd.py diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 83dd11dacb..a258c01195 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -383,9 +383,16 @@ def to_series( name = self.name if name is None else name if index is None: - return bigframes.series.Series(data=self, index=self, name=name) + return bigframes.series.Series( + data=self, index=self, name=name, session=self._session + ) else: - return bigframes.series.Series(data=self, index=Index(index), name=name) + return bigframes.series.Series( + data=self, + index=Index(index, session=self._session), + name=name, + session=self._session, + ) def get_level_values(self, level) -> Index: level_n = level if isinstance(level, int) else self.names.index(level) diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py index a8b4b7dffe..a611442b88 100644 --- a/bigframes/core/indexes/multi.py +++ b/bigframes/core/indexes/multi.py @@ -14,7 +14,7 @@ from __future__ import annotations -from typing import cast, Hashable, Iterable, Sequence +from typing import cast, Hashable, Iterable, Optional, Sequence, TYPE_CHECKING import bigframes_vendored.pandas.core.indexes.multi as vendored_pandas_multindex import pandas @@ -23,6 +23,9 @@ from bigframes.core import expression as ex from bigframes.core.indexes.base import Index +if TYPE_CHECKING: + import bigframes.session + class MultiIndex(Index, vendored_pandas_multindex.MultiIndex): __doc__ = vendored_pandas_multindex.MultiIndex.__doc__ @@ -33,10 +36,12 @@ def from_tuples( tuples: Iterable[tuple[Hashable, ...]], sortorder: int | None = None, names: Sequence[Hashable] | Hashable | None = None, + *, + session: Optional[bigframes.session.Session] = None, ) -> MultiIndex: pd_index = pandas.MultiIndex.from_tuples(tuples, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return cast(MultiIndex, Index(pd_index)) + return cast(MultiIndex, Index(pd_index, session=session)) @classmethod def 
from_arrays( @@ -44,10 +49,12 @@ def from_arrays( arrays, sortorder: int | None = None, names=None, + *, + session: Optional[bigframes.session.Session] = None, ) -> MultiIndex: pd_index = pandas.MultiIndex.from_arrays(arrays, sortorder, names) # Index.__new__ should detect multiple levels and properly create a multiindex - return cast(MultiIndex, Index(pd_index)) + return cast(MultiIndex, Index(pd_index, session=session)) def __eq__(self, other) -> Index: # type: ignore import bigframes.operations as ops @@ -71,3 +78,38 @@ def __eq__(self, other) -> Index: # type: ignore index_labels=[None], ) ) + + +class MultiIndexAccessor: + """Proxy to MultiIndex constructors to allow a session to be passed in.""" + + def __init__(self, session: bigframes.session.Session): + self._session = session + + def __call__(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :class:`bigframes.pandas.MultiIndex`. + """ + return MultiIndex(*args, session=self._session, **kwargs) + + def from_arrays(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :func:`bigframes.pandas.MultiIndex.from_arrays`. + """ + return MultiIndex.from_arrays(*args, session=self._session, **kwargs) + + def from_frame(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :func:`bigframes.pandas.MultiIndex.from_frame`. + """ + return cast(MultiIndex, MultiIndex.from_frame(*args, **kwargs)) + + def from_tuples(self, *args, **kwargs) -> MultiIndex: + """Construct a MultiIndex using the associated Session. + + See :func:`bigframes.pandas.MultiIndex.from_tuples`. + """ + return MultiIndex.from_tuples(*args, session=self._session, **kwargs) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 3ec1e86dc7..8179ffbeed 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -155,7 +155,9 @@ def method_logger(method=None, /, *, custom_base_name: Optional[str] = None): def outer_wrapper(method): @functools.wraps(method) def wrapper(*args, **kwargs): - api_method_name = getattr(method, LOG_OVERRIDE_NAME, method.__name__) + api_method_name = getattr( + method, LOG_OVERRIDE_NAME, method.__name__ + ).lower() if custom_base_name is None: qualname_parts = getattr(method, "__qualname__", method.__name__).split( "." 
diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py index 74a941be54..a2efa8f927 100644 --- a/bigframes/core/reshape/tile.py +++ b/bigframes/core/reshape/tile.py @@ -15,6 +15,7 @@ from __future__ import annotations import typing +from typing import Optional, TYPE_CHECKING import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile @@ -31,6 +32,9 @@ import bigframes.operations.aggregations as agg_ops import bigframes.series +if TYPE_CHECKING: + import bigframes.session + def cut( x, @@ -42,6 +46,7 @@ def cut( *, right: typing.Optional[bool] = True, labels: typing.Union[typing.Iterable[str], bool, None] = None, + session: Optional[bigframes.session.Session] = None, ) -> bigframes.series.Series: if ( labels is not None @@ -65,7 +70,7 @@ def cut( raise ValueError("Cannot cut empty array.") if not isinstance(x, bigframes.series.Series): - x = bigframes.series.Series(x) + x = bigframes.series.Series(x, session=session) if isinstance(bins, int): if bins <= 0: diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 7edf2fa2e4..0e5594d498 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + from collections.abc import Mapping from datetime import date, datetime -from typing import Optional, Union +from typing import Optional, TYPE_CHECKING, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes @@ -25,6 +27,9 @@ import bigframes.operations as ops import bigframes.series +if TYPE_CHECKING: + import bigframes.session + def to_datetime( arg: Union[ @@ -37,6 +42,7 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, + session: Optional[bigframes.session.Session] = None, ) -> Union[pd.Timestamp, datetime, bigframes.series.Series]: if isinstance(arg, (int, float, str, datetime, date)): return pd.to_datetime( @@ -52,7 +58,7 @@ def to_datetime( f"to datetime is not implemented. {constants.FEEDBACK_LINK}" ) - arg = bigframes.series.Series(arg) + arg = bigframes.series.Series(arg, session=session) if format and unit and arg.dtype in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore raise ValueError("cannot specify both format and unit") diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index f75394c47d..55731069a3 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -105,8 +105,14 @@ def progress_callback( """Displays a progress bar while the query is running""" global current_display, current_display_id, previous_display_html - import bigframes._config - import bigframes.core.events + try: + import bigframes._config + import bigframes.core.events + except ImportError: + # Since this gets called from __del__, skip if the import fails to avoid + # ImportError: sys.meta_path is None, Python is likely shutting down. + # This will allow cleanup to continue. 
+ return progress_bar = bigframes._config.options.display.progress_bar diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 2455637b0a..6fcb71f0d8 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -16,8 +16,8 @@ from __future__ import annotations -from collections import namedtuple -from datetime import date, datetime +import collections +import datetime import inspect import sys import typing @@ -198,18 +198,18 @@ def to_datetime( @typing.overload def to_datetime( - arg: Union[int, float, str, datetime, date], + arg: Union[int, float, str, datetime.datetime, datetime.date], *, utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -) -> Union[pandas.Timestamp, datetime]: +) -> Union[pandas.Timestamp, datetime.datetime]: ... def to_datetime( arg: Union[ - Union[int, float, str, datetime, date], + Union[int, float, str, datetime.datetime, datetime.date], vendored_pandas_datetimes.local_iterables, bigframes.series.Series, bigframes.dataframe.DataFrame, @@ -218,8 +218,9 @@ def to_datetime( utc: bool = False, format: Optional[str] = None, unit: Optional[str] = None, -) -> Union[pandas.Timestamp, datetime, bigframes.series.Series]: - return bigframes.core.tools.to_datetime( +) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: + return global_session.with_default_session( + bigframes.session.Session.to_datetime, arg, utc=utc, format=format, @@ -322,7 +323,7 @@ def clean_up_by_session_id( __version__ = bigframes.version.__version__ # Other public pandas attributes -NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) +NamedAgg = collections.namedtuple("NamedAgg", ["column", "aggfunc"]) options = config.options """Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" diff --git a/bigframes/pandas/core/tools/timedeltas.py b/bigframes/pandas/core/tools/timedeltas.py index 070a41d62d..eb01f9f846 100644 --- a/bigframes/pandas/core/tools/timedeltas.py +++ b/bigframes/pandas/core/tools/timedeltas.py @@ -35,7 +35,7 @@ def to_timedelta( return arg._apply_unary_op(ops.ToTimedeltaOp(canonical_unit)) if pdtypes.is_list_like(arg): - return to_timedelta(series.Series(arg), unit, session=session) + return to_timedelta(series.Series(arg, session=session), unit, session=session) return pd.to_timedelta(arg, unit) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 46fb56b88e..886072b884 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -68,6 +68,8 @@ import bigframes.core from bigframes.core import blocks, log_adapter, utils import bigframes.core.events +import bigframes.core.indexes +import bigframes.core.indexes.multi import bigframes.core.pyformat import bigframes.formatting_helpers import bigframes.functions._function_session as bff_session @@ -79,7 +81,6 @@ # Avoid circular imports. if typing.TYPE_CHECKING: - import bigframes.core.indexes import bigframes.dataframe as dataframe import bigframes.series import bigframes.streaming.dataframe as streaming_dataframe @@ -320,6 +321,15 @@ def bqconnectionmanager(self): ) return self._bq_connection_manager + @property + def options(self) -> bigframes._config.Options: + """Options for configuring BigQuery DataFrames. + + Included for compatibility between bpd and Session. + """ + # TODO(tswast): Consider making a separate session-level options object. 
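+        # For now this returns the process-wide options singleton, so every
+        # session observes the same configuration.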
+        return bigframes._config.options
+
     @property
     def session_id(self):
         return self._session_id
@@ -1826,7 +1836,7 @@ def udf(
             Turning an arbitrary python function into a BigQuery managed python udf:

                 >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f")
-                >>> @bpd.udf(dataset="bigfranes_testing", name=bq_name)
+                >>> @bpd.udf(dataset="bigfranes_testing", name=bq_name) # doctest: +SKIP
                 ... def minutes_to_hours(x: int) -> float:
                 ...     return x/60

@@ -1839,8 +1849,8 @@ def udf(
                 4    120
                 dtype: Int64

-                >>> hours = minutes.apply(minutes_to_hours)
-                >>> hours
+                >>> hours = minutes.apply(minutes_to_hours) # doctest: +SKIP
+                >>> hours # doctest: +SKIP
                 0    0.0
                 1    0.5
                 2    1.0
@@ -1853,7 +1863,7 @@ def udf(
             packages (optionally with the package version) via `packages` param.

                 >>> bq_name = datetime.datetime.now().strftime("bigframes_%Y%m%d%H%M%S%f")
-                >>> @bpd.udf(
+                >>> @bpd.udf( # doctest: +SKIP
                 ...     dataset="bigfranes_testing",
                 ...     name=bq_name,
                 ...     packages=["cryptography"]
@@ -1870,14 +1880,14 @@ def udf(
                 ...     return f.encrypt(input.encode()).decode()

                 >>> names = bpd.Series(["Alice", "Bob"])
-                >>> hashes = names.apply(get_hash)
+                >>> hashes = names.apply(get_hash) # doctest: +SKIP

             You can clean-up the BigQuery functions created above using the BigQuery
             client from the BigQuery DataFrames session:

                 >>> session = bpd.get_global_session()
-                >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function)
-                >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function)
+                >>> session.bqclient.delete_routine(minutes_to_hours.bigframes_bigquery_function) # doctest: +SKIP
+                >>> session.bqclient.delete_routine(get_hash.bigframes_bigquery_function) # doctest: +SKIP

         Args:
             input_types (type or sequence(type), Optional):
@@ -2297,6 +2307,104 @@ def read_gbq_object_table(
         s = self._loader.read_gbq_table(object_table)["uri"].str.to_blob(connection)
         return s.rename(name).to_frame()

+    # =========================================================================
+    # bigframes.pandas attributes
+    #
+    # These are included so that Session and bigframes.pandas can be used
+    # interchangeably.
+    # =========================================================================
+    def cut(self, *args, **kwargs) -> bigframes.series.Series:
+        """Cuts a BigQuery DataFrames object.
+
+        Included for compatibility between bpd and Session.
+
+        See :func:`bigframes.pandas.cut` for full documentation.
+        """
+        import bigframes.core.reshape.tile
+
+        return bigframes.core.reshape.tile.cut(
+            *args,
+            session=self,
+            **kwargs,
+        )
+
+    def DataFrame(self, *args, **kwargs):
+        """Constructs a DataFrame.
+
+        Included for compatibility between bpd and Session.
+
+        See :class:`bigframes.pandas.DataFrame` for full documentation.
+        """
+        import bigframes.dataframe
+
+        return bigframes.dataframe.DataFrame(*args, session=self, **kwargs)
+
+    @property
+    def MultiIndex(self) -> bigframes.core.indexes.multi.MultiIndexAccessor:
+        """Constructs a MultiIndex.
+
+        Included for compatibility between bpd and Session.
+
+        See :class:`bigframes.pandas.MultiIndex` for full documentation.
+        """
+        import bigframes.core.indexes.multi
+
+        return bigframes.core.indexes.multi.MultiIndexAccessor(self)
+
+    def Index(self, *args, **kwargs):
+        """Constructs an Index.
+
+        Included for compatibility between bpd and Session.
+
+        See :class:`bigframes.pandas.Index` for full documentation.
+ """ + import bigframes.core.indexes + + return bigframes.core.indexes.Index(*args, session=self, **kwargs) + + def Series(self, *args, **kwargs): + """Constructs a Series. + + Included for compatibility between bpd and Session. + + See :class:`bigframes.pandas.Series` for full documentation. + """ + import bigframes.series + + return bigframes.series.Series(*args, session=self, **kwargs) + + def to_datetime( + self, *args, **kwargs + ) -> Union[pandas.Timestamp, datetime.datetime, bigframes.series.Series]: + """Converts a BigQuery DataFrames object to datetime dtype. + + Included for compatibility between bpd and Session. + + See :func:`bigframes.pandas.to_datetime` for full documentation. + """ + import bigframes.core.tools + + return bigframes.core.tools.to_datetime( + *args, + session=self, + **kwargs, + ) + + def to_timedelta(self, *args, **kwargs): + """Converts a BigQuery DataFrames object to timedelta/duration dtype. + + Included for compatibility between bpd and Session. + + See :func:`bigframes.pandas.to_timedelta` for full documentation. + """ + import bigframes.pandas.core.tools.timedeltas + + return bigframes.pandas.core.tools.timedeltas.to_timedelta( + *args, + session=self, + **kwargs, + ) + def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/tests/system/small/test_session_as_bpd.py b/tests/system/small/test_session_as_bpd.py new file mode 100644 index 0000000000..e280c551cb --- /dev/null +++ b/tests/system/small/test_session_as_bpd.py @@ -0,0 +1,154 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""Check that bpd and Session can be used interchangeably."""
+
+from __future__ import annotations
+
+from typing import cast
+
+import numpy as np
+import pandas.testing
+
+import bigframes.pandas as bpd
+import bigframes.session
+
+
+def test_cut(session: bigframes.session.Session):
+    sc = [30, 80, 40, 90, 60, 45, 95, 75, 55, 100, 65, 85]
+    x = [20, 40, 60, 80, 100]
+
+    bpd_result = bpd.cut(sc, x)
+    session_result = session.cut(sc, x)
+
+    global_session = bpd.get_global_session()
+    assert global_session is not session
+    assert bpd_result._session is global_session
+    assert session_result._session is session
+
+    bpd_pd = bpd_result.to_pandas()
+    session_pd = session_result.to_pandas()
+    pandas.testing.assert_series_equal(bpd_pd, session_pd)
+
+
+def test_dataframe(session: bigframes.session.Session):
+    data = {"col": ["local", None, "data"]}
+
+    bpd_result = bpd.DataFrame(data)
+    session_result = session.DataFrame(data)
+
+    global_session = bpd.get_global_session()
+    assert global_session is not session
+    assert bpd_result._session is global_session
+    assert session_result._session is session
+
+    bpd_pd = bpd_result.to_pandas()
+    session_pd = session_result.to_pandas()
+    pandas.testing.assert_frame_equal(bpd_pd, session_pd)
+
+
+def test_multiindex_from_arrays(session: bigframes.session.Session):
+    arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
+
+    bpd_result = bpd.MultiIndex.from_arrays(arrays, names=("number", "color"))
+    session_result = session.MultiIndex.from_arrays(arrays, names=("number", "color"))
+
+    global_session = bpd.get_global_session()
+    assert global_session is not session
+    assert bpd_result._session is global_session
+    assert session_result._session is session
+
+    bpd_pd = bpd_result.to_pandas()
+    session_pd = session_result.to_pandas()
+    pandas.testing.assert_index_equal(bpd_pd, session_pd)
+
+
+def test_multiindex_from_tuples(session: bigframes.session.Session):
+    tuples = [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")]
+
+    bpd_result = bpd.MultiIndex.from_tuples(tuples, names=("number", "color"))
+    session_result = session.MultiIndex.from_tuples(tuples, names=("number", "color"))
+
+    global_session = bpd.get_global_session()
+    assert global_session is not session
+    assert bpd_result._session is global_session
+    assert session_result._session is session
+
+    bpd_pd = bpd_result.to_pandas()
+    session_pd = session_result.to_pandas()
+    pandas.testing.assert_index_equal(bpd_pd, session_pd)
+
+
+def test_index(session: bigframes.session.Session):
+    index = [1, 2, 3]
+
+    bpd_result = bpd.Index(index)
+    session_result = session.Index(index)
+
+    global_session = bpd.get_global_session()
+    assert global_session is not session
+    assert bpd_result._session is global_session
+    assert session_result._session is session
+
+    bpd_pd = bpd_result.to_pandas()
+    session_pd = session_result.to_pandas()
+    pandas.testing.assert_index_equal(bpd_pd, session_pd)
+
+
+def test_series(session: bigframes.session.Session):
+    series = [1, 2, 3]
+
+    bpd_result = bpd.Series(series)
+    session_result = session.Series(series)
+
+    global_session = bpd.get_global_session()
+    assert global_session is not session
+    assert bpd_result._session is global_session
+    assert session_result._session is session
+
+    bpd_pd = bpd_result.to_pandas()
+    session_pd = session_result.to_pandas()
+    pandas.testing.assert_series_equal(bpd_pd, session_pd)
+
+
+def test_to_datetime(session: bigframes.session.Session):
+    datetimes = ["2018-10-26 12:00:00", "2018-10-26 13:00:15"]
+
+    bpd_result = 
bpd.to_datetime(datetimes) + session_result = cast(bpd.Series, session.to_datetime(datetimes)) + + global_session = bpd.get_global_session() + assert global_session is not session + assert bpd_result._session is global_session + assert session_result._session is session + + bpd_pd = bpd_result.to_pandas() + session_pd = session_result.to_pandas() + pandas.testing.assert_series_equal(bpd_pd, session_pd) + + +def test_to_timedelta(session: bigframes.session.Session): + offsets = np.arange(5) + + bpd_result = bpd.to_timedelta(offsets, unit="s") + session_result = session.to_timedelta(offsets, unit="s") + + global_session = bpd.get_global_session() + assert global_session is not session + assert bpd_result._session is global_session + assert session_result._session is session + + bpd_pd = bpd_result.to_pandas() + session_pd = session_result.to_pandas() + pandas.testing.assert_series_equal(bpd_pd, session_pd) diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 73e0b7f2d6..5e75e6b20f 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -64,8 +64,12 @@ def test_method_matches_session(method_name: str): pandas_method = getattr(bigframes.pandas, method_name) pandas_doc = inspect.getdoc(pandas_method) assert pandas_doc is not None, "docstrings are required" - assert re.sub(leading_whitespace, "", pandas_doc) == re.sub( - leading_whitespace, "", session_doc + + pandas_doc_stripped = re.sub(leading_whitespace, "", pandas_doc) + session_doc_stripped = re.sub(leading_whitespace, "", session_doc) + assert ( + pandas_doc_stripped == session_doc_stripped + or ":`bigframes.pandas" in session_doc_stripped ) # Add `eval_str = True` so that deferred annotations are turned into their @@ -75,18 +79,20 @@ def test_method_matches_session(method_name: str): eval_str=True, globals={**vars(bigframes.session), **{"dataframe": bigframes.dataframe}}, ) - pandas_signature = inspect.signature(pandas_method, eval_str=True) - assert [ - # Kind includes position, which will be an offset. - parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) - for parameter in pandas_signature.parameters.values() - ] == [ + session_args = [ # Kind includes position, which will be an offset. parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) for parameter in session_signature.parameters.values() # Don't include the first parameter, which is `self: Session` - ][ - 1: + ][1:] + pandas_signature = inspect.signature(pandas_method, eval_str=True) + pandas_args = [ + # Kind includes position, which will be an offset. 
+ parameter.replace(kind=inspect.Parameter.POSITIONAL_ONLY) + for parameter in pandas_signature.parameters.values() + ] + assert session_args == pandas_args or ["args", "kwargs"] == [ + parameter.name for parameter in session_args ] assert pandas_signature.return_annotation == session_signature.return_annotation From 5cc3c5b1391a7dfa062b1d77f001726b013f6337 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 9 Oct 2025 13:33:19 -0700 Subject: [PATCH 15/22] feat: Add barh, pie plot types (#2146) --- bigframes/operations/_matplotlib/__init__.py | 2 + bigframes/operations/_matplotlib/core.py | 32 ++++-- bigframes/operations/plotting.py | 19 +++- .../system/small/operations/test_plotting.py | 36 ++++++ .../pandas/plotting/_core.py | 103 ++++++++++++++++++ 5 files changed, 182 insertions(+), 10 deletions(-) diff --git a/bigframes/operations/_matplotlib/__init__.py b/bigframes/operations/_matplotlib/__init__.py index 5f99d3b50a..caacadf5fe 100644 --- a/bigframes/operations/_matplotlib/__init__.py +++ b/bigframes/operations/_matplotlib/__init__.py @@ -22,6 +22,8 @@ PLOT_CLASSES: dict[str, PLOT_TYPES] = { "area": core.AreaPlot, "bar": core.BarPlot, + "barh": core.BarhPlot, + "pie": core.PiePlot, "line": core.LinePlot, "scatter": core.ScatterPlot, "hist": hist.HistPlot, diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py index a5f53b9f64..06fb5235d7 100644 --- a/bigframes/operations/_matplotlib/core.py +++ b/bigframes/operations/_matplotlib/core.py @@ -55,7 +55,12 @@ def _kind(self): @property def _sampling_warning_msg(self) -> typing.Optional[str]: - return None + return ( + "To optimize plotting performance, your data has been downsampled to {sampling_n} " + "rows from the original {total_n} rows. This may result in some data points " + "not being displayed. For a more comprehensive view, consider pre-processing " + "your data by aggregating it or selecting the top categories." + ) def __init__(self, data, **kwargs) -> None: self.kwargs = kwargs @@ -92,6 +97,10 @@ def _compute_plot_data(self): class AreaPlot(SamplingPlot): + @property + def _sampling_warning_msg(self) -> typing.Optional[str]: + return None + @property def _kind(self) -> typing.Literal["area"]: return "area" @@ -102,14 +111,17 @@ class BarPlot(SamplingPlot): def _kind(self) -> typing.Literal["bar"]: return "bar" + +class BarhPlot(SamplingPlot): @property - def _sampling_warning_msg(self) -> typing.Optional[str]: - return ( - "To optimize plotting performance, your data has been downsampled to {sampling_n} " - "rows from the original {total_n} rows. This may result in some data points " - "not being displayed. For a more comprehensive view, consider pre-processing " - "your data by aggregating it or selecting the top categories." 
-        )
+    def _kind(self) -> typing.Literal["barh"]:
+        return "barh"
+
+
+class PiePlot(SamplingPlot):
+    @property
+    def _kind(self) -> typing.Literal["pie"]:
+        return "pie"
 
 
 class LinePlot(SamplingPlot):
@@ -123,6 +135,10 @@ class ScatterPlot(SamplingPlot):
     def _kind(self) -> typing.Literal["scatter"]:
         return "scatter"
 
+    @property
+    def _sampling_warning_msg(self) -> typing.Optional[str]:
+        return None
+
     def __init__(self, data, **kwargs) -> None:
         super().__init__(data, **kwargs)
 
diff --git a/bigframes/operations/plotting.py b/bigframes/operations/plotting.py
index a741ed5dd9..df0c138f0f 100644
--- a/bigframes/operations/plotting.py
+++ b/bigframes/operations/plotting.py
@@ -25,8 +25,8 @@ class PlotAccessor(vendordt.PlotAccessor):
 
     __doc__ = vendordt.PlotAccessor.__doc__
 
-    _common_kinds = ("line", "area", "hist", "bar")
-    _dataframe_kinds = ("scatter",)
+    _common_kinds = ("line", "area", "hist", "bar", "barh", "pie")
+    _dataframe_kinds = ("scatter", "hexbin")
     _all_kinds = _common_kinds + _dataframe_kinds
 
     def __call__(self, **kwargs):
@@ -82,6 +82,21 @@ def bar(
     ):
         return self(kind="bar", x=x, y=y, **kwargs)
 
+    def barh(
+        self,
+        x: typing.Optional[typing.Hashable] = None,
+        y: typing.Optional[typing.Hashable] = None,
+        **kwargs,
+    ):
+        return self(kind="barh", x=x, y=y, **kwargs)
+
+    def pie(
+        self,
+        y: typing.Optional[typing.Hashable] = None,
+        **kwargs,
+    ):
+        return self(kind="pie", y=y, **kwargs)
+
     def scatter(
         self,
         x: typing.Optional[typing.Hashable] = None,
diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py
index c2f3ba423f..2585ac8e81 100644
--- a/tests/system/small/operations/test_plotting.py
+++ b/tests/system/small/operations/test_plotting.py
@@ -264,6 +264,42 @@ def test_bar(scalars_dfs, col_names, alias):
         tm.assert_almost_equal(line.get_data()[1], pd_line.get_data()[1])
 
 
+@pytest.mark.parametrize(
+    ("col_names",),
+    [
+        pytest.param(["int64_col", "float64_col", "int64_too"], id="df"),
+        pytest.param(["int64_col"], id="series"),
+    ],
+)
+def test_barh(scalars_dfs, col_names):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    ax = scalars_df[col_names].plot.barh()
+    pd_ax = scalars_pandas_df[col_names].plot.barh()
+    tm.assert_almost_equal(ax.get_xticks(), pd_ax.get_xticks())
+    tm.assert_almost_equal(ax.get_yticks(), pd_ax.get_yticks())
+    for line, pd_line in zip(ax.lines, pd_ax.lines):
+        # Compare y coordinates between the lines
+        tm.assert_almost_equal(line.get_data()[1], pd_line.get_data()[1])
+
+
+@pytest.mark.parametrize(
+    ("col_names",),
+    [
+        pytest.param(["int64_col", "float64_col", "int64_too"], id="df"),
+        pytest.param(["int64_col"], id="series"),
+    ],
+)
+def test_pie(scalars_dfs, col_names):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    ax = scalars_df[col_names].abs().plot.pie(y="int64_col")
+    pd_ax = scalars_pandas_df[col_names].abs().plot.pie(y="int64_col")
+    tm.assert_almost_equal(ax.get_xticks(), pd_ax.get_xticks())
+    tm.assert_almost_equal(ax.get_yticks(), pd_ax.get_yticks())
+    for line, pd_line in zip(ax.lines, pd_ax.lines):
+        # Compare y coordinates between the lines
+        tm.assert_almost_equal(line.get_data()[1], pd_line.get_data()[1])
+
+
 @pytest.mark.parametrize(
     ("col_names", "alias"),
     [
diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py
index 4ed5c8eb0b..b0c28ddfe9 100644
--- a/third_party/bigframes_vendored/pandas/plotting/_core.py
+++ b/third_party/bigframes_vendored/pandas/plotting/_core.py
@@ -275,6 +275,109 @@ def bar(
         """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def barh(
+        self,
+        x: typing.Optional[typing.Hashable] = None,
+        y: typing.Optional[typing.Hashable] = None,
+        **kwargs,
+    ):
+        """
+        Draw a horizontal bar plot.
+
+        This function calls `pandas.plot` to generate a plot with a random sample
+        of items. For consistent results, the random sampling is reproducible.
+        Use the `sampling_random_state` parameter to modify the sampling seed.
+
+        **Examples:**
+
+        Basic plot.
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> df = bpd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]})
+            >>> ax = df.plot.barh(x='lab', y='val', rot=0)
+
+        Plot a whole dataframe to a barh plot. Each column is assigned a distinct color,
+        and each row is nested in a group along the vertical axis.
+
+            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
+            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
+            >>> index = ['snail', 'pig', 'elephant',
+            ...          'rabbit', 'giraffe', 'coyote', 'horse']
+            >>> df = bpd.DataFrame({'speed': speed, 'lifespan': lifespan}, index=index)
+            >>> ax = df.plot.barh(rot=0)
+
+        Plot stacked barh charts for the DataFrame.
+
+            >>> ax = df.plot.barh(stacked=True)
+
+        If you don’t like the default colours, you can specify how you’d like each column
+        to be colored.
+
+            >>> axes = df.plot.barh(
+            ...     rot=0, subplots=True, color={"speed": "red", "lifespan": "green"}
+            ... )
+
+        Args:
+            x (label or position, optional):
+                Allows plotting of one column versus another. If not specified, the index
+                of the DataFrame is used.
+            y (label or position, optional):
+                Allows plotting of one column versus another. If not specified, all numerical
+                columns are used.
+            **kwargs:
+                Additional keyword arguments are documented in
+                :meth:`DataFrame.plot`.
+
+        Returns:
+            matplotlib.axes.Axes or numpy.ndarray:
+                Horizontal bar plot, or array of horizontal bar plots if subplots is True.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def pie(
+        self,
+        y: typing.Optional[typing.Hashable] = None,
+        **kwargs,
+    ):
+        """
+        Generate a pie plot.
+
+        A pie plot is a proportional representation of the numerical data in a
+        column. This function wraps :meth:`matplotlib.pyplot.pie` for the
+        specified column. If no column reference is passed and
+        ``subplots=True`` a pie plot is drawn for each numerical column
+        independently.
+
+        **Examples:**
+
+        In the example below we have a DataFrame with the information about
+        planet's mass and radius. We pass the 'mass' column to the
+        pie function to get a pie plot.
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'mass': [0.330, 4.87 , 5.97],
+            ...                     'radius': [2439.7, 6051.8, 6378.1]},
+            ...                    index=['Mercury', 'Venus', 'Earth'])
+            >>> plot = df.plot.pie(y='mass', figsize=(5, 5))
+
+            >>> plot = df.plot.pie(subplots=True, figsize=(11, 6))
+
+        Args:
+            y (int or label, optional):
+                Label or position of the column to plot.
+                If not provided, ``subplots=True`` argument must be passed.
+            **kwargs:
+                Keyword arguments to pass on to :meth:`DataFrame.plot`.
+
+        Returns:
+            matplotlib.axes.Axes or np.ndarray:
+                A NumPy array is returned when `subplots` is True.
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def scatter( self, x: typing.Optional[typing.Hashable] = None, From 35c1c33b85d1b92e402aab73677df3ffe43a51b4 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 9 Oct 2025 16:21:20 -0700 Subject: [PATCH 16/22] fix: Fix too many cluster columns requested by caching (#2155) --- bigframes/session/bq_caching_executor.py | 8 ++++++-- tests/system/small/test_dataframe.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index d4cfa13aa4..c830ca1e29 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -637,14 +637,18 @@ def _execute_plan_gbq( create_table = True if not cache_spec.cluster_cols: - assert len(cache_spec.cluster_cols) <= _MAX_CLUSTER_COLUMNS offsets_id = bigframes.core.identifiers.ColumnId( bigframes.core.guid.generate_guid() ) plan = nodes.PromoteOffsetsNode(plan, offsets_id) cluster_cols = [offsets_id.sql] else: - cluster_cols = cache_spec.cluster_cols + cluster_cols = [ + col + for col in cache_spec.cluster_cols + if bigframes.dtypes.is_clusterable(plan.schema.get_type(col)) + ] + cluster_cols = cluster_cols[:_MAX_CLUSTER_COLUMNS] compiled = compile.compile_sql( compile.CompileRequest( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index d0847eee4e..1e6151b7f4 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5537,6 +5537,23 @@ def test_df_cached(scalars_df_index): pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) +def test_df_cached_many_index_cols(scalars_df_index): + index_cols = [ + "int64_too", + "time_col", + "int64_col", + "bool_col", + "date_col", + "timestamp_col", + "string_col", + ] + df = scalars_df_index.set_index(index_cols) + df = df[df["rowindex_2"] % 2 == 0] + + df_cached_copy = df.cache() + pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) + + def test_assign_after_binop_row_joins(): pd_df = pd.DataFrame( { From 7cb9e476b9742f59a7b00b43df1f5697903da2be Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 10 Oct 2025 07:49:30 -0700 Subject: [PATCH 17/22] test: fix test_read_gbq_query timeout on g3 python 3.13 tests (#2160) --- tests/unit/session/test_read_gbq_query.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/session/test_read_gbq_query.py b/tests/unit/session/test_read_gbq_query.py index 1f9d2fb945..d078c64af7 100644 --- a/tests/unit/session/test_read_gbq_query.py +++ b/tests/unit/session/test_read_gbq_query.py @@ -35,3 +35,4 @@ def test_read_gbq_query_sets_destination_table(): assert query == "SELECT 'my-test-query';" assert config.destination is not None + session.close() From 8714977aa21567264e304d01965a7bb7e34b09e5 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 10 Oct 2025 10:26:03 -0700 Subject: [PATCH 18/22] refactor: support agg_ops.ShiftOp and DiffOp for the sqlglot compiler (#2156) --- .../sqlglot/aggregations/unary_compiler.py | 38 ++++++++++++++ .../test_diff/diff_bool.sql | 13 +++++ .../test_diff/diff_int.sql | 13 +++++ .../test_unary_compiler/test_shift/lag.sql | 13 +++++ .../test_unary_compiler/test_shift/lead.sql | 13 +++++ .../test_unary_compiler/test_shift/noop.sql | 13 +++++ .../aggregations/test_unary_compiler.py | 49 +++++++++++++++++++ 7 files changed, 152 insertions(+) create mode 100644 
tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_diff/diff_bool.sql create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_diff/diff_int.sql create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/lag.sql create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/lead.sql create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/noop.sql diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py index 16bd3ef099..cfa27909c6 100644 --- a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py @@ -151,6 +151,23 @@ def _( ) +@UNARY_OP_REGISTRATION.register(agg_ops.DiffOp) +def _( + op: agg_ops.DiffOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + shift_op_impl = UNARY_OP_REGISTRATION[agg_ops.ShiftOp(0)] + shifted = shift_op_impl(agg_ops.ShiftOp(op.periods), column, window) + if column.dtype in (dtypes.BOOL_DTYPE, dtypes.INT_DTYPE, dtypes.FLOAT_DTYPE): + if column.dtype == dtypes.BOOL_DTYPE: + return sge.NEQ(this=column.expr, expression=shifted) + else: + return sge.Sub(this=column.expr, expression=shifted) + else: + raise TypeError(f"Cannot perform diff on type {column.dtype}") + + @UNARY_OP_REGISTRATION.register(agg_ops.MaxOp) def _( op: agg_ops.MaxOp, @@ -240,6 +257,27 @@ def _( return apply_window_if_present(sge.func("COUNT", sge.convert(1)), window) +@UNARY_OP_REGISTRATION.register(agg_ops.ShiftOp) +def _( + op: agg_ops.ShiftOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + if op.periods == 0: # No-op + return column.expr + if op.periods > 0: + return apply_window_if_present( + sge.func("LAG", column.expr, sge.convert(op.periods)), + window, + include_framing_clauses=False, + ) + return apply_window_if_present( + sge.func("LEAD", column.expr, sge.convert(-op.periods)), + window, + include_framing_clauses=False, + ) + + @UNARY_OP_REGISTRATION.register(agg_ops.SumOp) def _( op: agg_ops.SumOp, diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_diff/diff_bool.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_diff/diff_bool.sql new file mode 100644 index 0000000000..6c7d37c037 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_diff/diff_bool.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` <> LAG(`bfcol_0`, 1) OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `diff_bool` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_diff/diff_int.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_diff/diff_int.sql new file mode 100644 index 0000000000..1ce4953d87 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_diff/diff_int.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM 
`bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` - LAG(`bfcol_0`, 1) OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `diff_int` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/lag.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/lag.sql new file mode 100644 index 0000000000..59e2c47edf --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/lag.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + LAG(`bfcol_0`, 1) OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `lag` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/lead.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/lead.sql new file mode 100644 index 0000000000..5c82b5db39 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/lead.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + LEAD(`bfcol_0`, 1) OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `lead` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/noop.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/noop.sql new file mode 100644 index 0000000000..fef4a2bde8 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_shift/noop.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_0` AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `noop` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py index ea15f155ad..a83a494e55 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py @@ -127,6 +127,28 @@ def test_dense_rank(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_diff(scalar_types_df: bpd.DataFrame, snapshot): + # Test integer + int_col = "int64_col" + bf_df_int = scalar_types_df[[int_col]] + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(int_col),)) + int_op = agg_exprs.UnaryAggregation( + agg_ops.DiffOp(periods=1), expression.deref(int_col) + ) + int_sql = _apply_unary_window_op(bf_df_int, int_op, window, "diff_int") + snapshot.assert_match(int_sql, "diff_int.sql") + + # Test boolean + bool_col = "bool_col" + bf_df_bool = scalar_types_df[[bool_col]] + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(bool_col),)) + bool_op = agg_exprs.UnaryAggregation( + agg_ops.DiffOp(periods=1), 
expression.deref(bool_col) + ) + bool_sql = _apply_unary_window_op(bf_df_bool, bool_op, window, "diff_bool") + snapshot.assert_match(bool_sql, "diff_bool.sql") + + def test_first(scalar_types_df: bpd.DataFrame, snapshot): if sys.version_info < (3, 12): pytest.skip( @@ -271,6 +293,33 @@ def test_rank(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_shift(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "int64_col" + bf_df = scalar_types_df[[col_name]] + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) + + # Test lag + lag_op = agg_exprs.UnaryAggregation( + agg_ops.ShiftOp(periods=1), expression.deref(col_name) + ) + lag_sql = _apply_unary_window_op(bf_df, lag_op, window, "lag") + snapshot.assert_match(lag_sql, "lag.sql") + + # Test lead + lead_op = agg_exprs.UnaryAggregation( + agg_ops.ShiftOp(periods=-1), expression.deref(col_name) + ) + lead_sql = _apply_unary_window_op(bf_df, lead_op, window, "lead") + snapshot.assert_match(lead_sql, "lead.sql") + + # Test no-op + noop_op = agg_exprs.UnaryAggregation( + agg_ops.ShiftOp(periods=0), expression.deref(col_name) + ) + noop_sql = _apply_unary_window_op(bf_df, noop_op, window, "noop") + snapshot.assert_match(noop_sql, "noop.sql") + + def test_sum(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "bool_col"]] agg_ops_map = { From e0aa9cc0d8ea032cbb0d8cd5907a97feee0e6165 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 10 Oct 2025 10:45:20 -0700 Subject: [PATCH 19/22] chore: improve wording of ai notebook (#2161) --- notebooks/generative_ai/ai_functions.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/notebooks/generative_ai/ai_functions.ipynb b/notebooks/generative_ai/ai_functions.ipynb index 9362e93b59..3783ad8365 100644 --- a/notebooks/generative_ai/ai_functions.ipynb +++ b/notebooks/generative_ai/ai_functions.ipynb @@ -56,7 +56,7 @@ "id": "aee05821", "metadata": {}, "source": [ - "This notebook provides a brief introduction to how to use BigFrames AI functions" + "This notebook provides a brief introduction to AI functions in BigQuery Dataframes." ] }, { @@ -145,7 +145,7 @@ "id": "b606c51f", "metadata": {}, "source": [ - "You can also include additional model parameters into your function call, as long as they satisfy the structure of `generateContent` [request body format](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.endpoints/generateContent#request-body). In the next example, you use `maxOutputTokens` to limite the length of the generated content." + "You can also include additional model parameters into your function call, as long as they conform to the structure of `generateContent` [request body format](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.endpoints/generateContent#request-body). In the next example, you use `maxOutputTokens` to limit the length of the generated content." ] }, { @@ -186,7 +186,7 @@ "source": [ "The answers are cut short as expected.\n", "\n", - "In addition to `ai.generate`, you can use `ai.generate_bool`, `ai.generate_int`, and `ai.generate_double` for other type of outputs." + "In addition to `ai.generate`, you can use `ai.generate_bool`, `ai.generate_int`, and `ai.generate_double` for other output types." ] }, { @@ -196,7 +196,7 @@ "source": [ "## ai.if_\n", "\n", - "`ai.if_` generates a series of booleans, unlike `ai.generate_bool` where you get a series of structs. 
It's a handy tool for filtering your data. not only because it directly returns a boolean, but also because it provides more optimization during data processing. Here is an example of using `ai.if_`:" + "`ai.if_` generates a series of booleans. It's a handy tool for joining and filtering your data, not only because it directly returns boolean values, but also because it provides more optimization during data processing. Here is an example of using `ai.if_`:" ] }, { @@ -284,7 +284,7 @@ "id": "63b5a59f", "metadata": {}, "source": [ - "`ai.score` ranks your input based on the prompt. You can then sort your data based on their ranks. For example:" + "`ai.score` ranks your input based on the prompt and assigns a double value (i.e. a score) to each item. You can then sort your data based on their scores. For example:" ] }, { @@ -460,7 +460,7 @@ "id": "9e4037bc", "metadata": {}, "source": [ - "Note that this function can only return the values that are present in your provided categories. If your categories do not cover all cases, your will get wrong answers:" + "Note that this function can only return the values that are provided in the `categories` argument. If your categories do not cover all cases, your may get wrong answers:" ] }, { From 8f9cbc3f35ef345009c632aadbcbce0d98402241 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 10 Oct 2025 13:16:21 -0700 Subject: [PATCH 20/22] refactor: add agg_ops.TimeSeriesDiffOp and DateSeriesDiffOp to sqlglot compiler (#2164) --- .../sqlglot/aggregations/unary_compiler.py | 38 +++++++++++++++++++ .../test_date_series_diff/out.sql | 17 +++++++++ .../test_time_series_diff/out.sql | 17 +++++++++ .../aggregations/test_unary_compiler.py | 22 +++++++++++ 4 files changed, 94 insertions(+) create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_date_series_diff/out.sql create mode 100644 tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_time_series_diff/out.sql diff --git a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py index cfa27909c6..d157f07df2 100644 --- a/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py +++ b/bigframes/core/compile/sqlglot/aggregations/unary_compiler.py @@ -98,6 +98,27 @@ def _( return apply_window_if_present(sge.func("COUNT", column.expr), window) +@UNARY_OP_REGISTRATION.register(agg_ops.DateSeriesDiffOp) +def _( + op: agg_ops.DateSeriesDiffOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + if column.dtype != dtypes.DATE_DTYPE: + raise TypeError(f"Cannot perform date series diff on type {column.dtype}") + shift_op_impl = UNARY_OP_REGISTRATION[agg_ops.ShiftOp(0)] + shifted = shift_op_impl(agg_ops.ShiftOp(op.periods), column, window) + # Conversion factor from days to microseconds + conversion_factor = 24 * 60 * 60 * 1_000_000 + return sge.Cast( + this=sge.DateDiff( + this=column.expr, expression=shifted, unit=sge.Identifier(this="DAY") + ) + * sge.convert(conversion_factor), + to="INT64", + ) + + @UNARY_OP_REGISTRATION.register(agg_ops.DenseRankOp) def _( op: agg_ops.DenseRankOp, @@ -293,3 +314,20 @@ def _( # Will be null if all inputs are null. Pandas defaults to zero sum though. 
zero = pd.to_timedelta(0) if column.dtype == dtypes.TIMEDELTA_DTYPE else 0 return sge.func("IFNULL", expr, ir._literal(zero, column.dtype)) + + +@UNARY_OP_REGISTRATION.register(agg_ops.TimeSeriesDiffOp) +def _( + op: agg_ops.TimeSeriesDiffOp, + column: typed_expr.TypedExpr, + window: typing.Optional[window_spec.WindowSpec] = None, +) -> sge.Expression: + if column.dtype != dtypes.TIMESTAMP_DTYPE: + raise TypeError(f"Cannot perform time series diff on type {column.dtype}") + shift_op_impl = UNARY_OP_REGISTRATION[agg_ops.ShiftOp(0)] + shifted = shift_op_impl(agg_ops.ShiftOp(op.periods), column, window) + return sge.TimestampDiff( + this=column.expr, + expression=shifted, + unit=sge.Identifier(this="MICROSECOND"), + ) diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_date_series_diff/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_date_series_diff/out.sql new file mode 100644 index 0000000000..599d8333c9 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_date_series_diff/out.sql @@ -0,0 +1,17 @@ +WITH `bfcte_0` AS ( + SELECT + `date_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CAST(DATE_DIFF( + `bfcol_0`, + LAG(`bfcol_0`, 1) OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST), + DAY + ) * 86400000000 AS INT64) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `diff_date` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_time_series_diff/out.sql b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_time_series_diff/out.sql new file mode 100644 index 0000000000..8ed95b3c07 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/aggregations/snapshots/test_unary_compiler/test_time_series_diff/out.sql @@ -0,0 +1,17 @@ +WITH `bfcte_0` AS ( + SELECT + `timestamp_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + TIMESTAMP_DIFF( + `bfcol_0`, + LAG(`bfcol_0`, 1) OVER (ORDER BY `bfcol_0` IS NULL ASC NULLS LAST, `bfcol_0` ASC NULLS LAST), + MICROSECOND + ) AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `diff_time` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py index a83a494e55..da388ccad1 100644 --- a/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py +++ b/tests/unit/core/compile/sqlglot/aggregations/test_unary_compiler.py @@ -127,6 +127,17 @@ def test_dense_rank(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_date_series_diff(scalar_types_df: bpd.DataFrame, snapshot): + col_name = "date_col" + bf_df = scalar_types_df[[col_name]] + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) + op = agg_exprs.UnaryAggregation( + agg_ops.DateSeriesDiffOp(periods=1), expression.deref(col_name) + ) + sql = _apply_unary_window_op(bf_df, op, window, "diff_date") + snapshot.assert_match(sql, "out.sql") + + def test_diff(scalar_types_df: bpd.DataFrame, snapshot): # Test integer int_col = "int64_col" @@ -331,3 +342,14 @@ def test_sum(scalar_types_df: bpd.DataFrame, snapshot): ) snapshot.assert_match(sql, "out.sql") + + +def test_time_series_diff(scalar_types_df: bpd.DataFrame, snapshot): + 
col_name = "timestamp_col" + bf_df = scalar_types_df[[col_name]] + window = window_spec.WindowSpec(ordering=(ordering.ascending_over(col_name),)) + op = agg_exprs.UnaryAggregation( + agg_ops.TimeSeriesDiffOp(periods=1), expression.deref(col_name) + ) + sql = _apply_unary_window_op(bf_df, op, window, "diff_time") + snapshot.assert_match(sql, "out.sql") From 93a0749392b84f27162654fe5ea5baa329a23f99 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 13 Oct 2025 12:08:53 -0700 Subject: [PATCH 21/22] docs: fix ai function related docs (#2149) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * doc: fix ai function related docs * fix docs * fix format * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- bigframes/bigquery/_operations/ai.py | 52 +++++++++++++++++++++++++++- docs/templates/toc.yml | 3 +- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 5c001d4caf..f4302f8ece 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -65,7 +65,7 @@ def generate( 1 Ottawa\\n Name: result, dtype: string - You get structured output when the `output_schema` parameter is set: + You get structured output when the `output_schema` parameter is set: >>> animals = bpd.Series(["Rabbit", "Spider"]) >>> bbq.ai.generate(animals, output_schema={"number_of_legs": "INT64", "is_herbivore": "BOOL"}) @@ -73,6 +73,13 @@ def generate( 1 {'is_herbivore': False, 'number_of_legs': 8, '... dtype: struct>, status: string>[pyarrow] + .. note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + Args: prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series @@ -165,6 +172,13 @@ def generate_bool( 2 False Name: result, dtype: boolean + .. note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + Args: prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series @@ -240,6 +254,13 @@ def generate_int( 2 8 Name: result, dtype: Int64 + .. note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. 
For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + Args: prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series @@ -315,6 +336,13 @@ def generate_double( 2 8.0 Name: result, dtype: Float64 + .. note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + Args: prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series @@ -371,6 +399,7 @@ def if_( provides optimization such that not all rows are evaluated with the LLM. **Examples:** + >>> import bigframes.pandas as bpd >>> import bigframes.bigquery as bbq >>> bpd.options.display.progress_bar = None @@ -386,6 +415,13 @@ def if_( 1 Illinois dtype: string + .. note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + Args: prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. The Series can be BigFrames Series @@ -433,6 +469,13 @@ def classify( [2 rows x 2 columns] + .. note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + Args: input (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the input to send to the model. The Series can be BigFrames Series @@ -482,6 +525,13 @@ def score( 2 3.0 dtype: Float64 + .. note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + Args: prompt (str | Series | List[str|Series] | Tuple[str|Series, ...]): A mixture of Series and string literals that specifies the prompt to send to the model. 
The Series can be BigFrames Series diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index ad96977152..f368cf21ae 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -219,7 +219,8 @@ - name: BigQuery built-in functions uid: bigframes.bigquery - name: BigQuery AI Functions - uid: bigframes.bigquery.ai + uid: bigframes.bigquery._operations.ai + status: beta name: bigframes.bigquery - items: - name: GeoSeries From bbfdb207a5595d3621048fac9d8138bbf736fb7b Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Mon, 13 Oct 2025 13:25:41 -0700 Subject: [PATCH 22/22] chore(main): release 2.25.0 (#2145) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 28 +++++++++++++++++++++++ bigframes/version.py | 4 ++-- third_party/bigframes_vendored/version.py | 4 ++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4b00fd956d..86d7315896 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,34 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.25.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.24.0...v2.25.0) (2025-10-13) + + +### Features + +* Add barh, pie plot types ([#2146](https://github.com/googleapis/python-bigquery-dataframes/issues/2146)) ([5cc3c5b](https://github.com/googleapis/python-bigquery-dataframes/commit/5cc3c5b1391a7dfa062b1d77f001726b013f6337)) +* Add Index.__eq__ for consts, aligned objects ([#2141](https://github.com/googleapis/python-bigquery-dataframes/issues/2141)) ([8514200](https://github.com/googleapis/python-bigquery-dataframes/commit/85142008ec895fa078d192bbab942d0257f70df3)) +* Add output_schema parameter to ai.generate() ([#2139](https://github.com/googleapis/python-bigquery-dataframes/issues/2139)) ([ef0b0b7](https://github.com/googleapis/python-bigquery-dataframes/commit/ef0b0b73843da2a93baf08e4cd5457fbb590b89c)) +* Create session-scoped `cut`, `DataFrame`, `MultiIndex`, `Index`, `Series`, `to_datetime`, and `to_timedelta` methods ([#2157](https://github.com/googleapis/python-bigquery-dataframes/issues/2157)) ([5e1e809](https://github.com/googleapis/python-bigquery-dataframes/commit/5e1e8098ecf212c91d73fa80d722d1cb3e46668b)) +* Replace ML.GENERATE_TEXT with AI.GENERATE for audio transcription ([#2151](https://github.com/googleapis/python-bigquery-dataframes/issues/2151)) ([a410d0a](https://github.com/googleapis/python-bigquery-dataframes/commit/a410d0ae43ef3b053b650804156eda0b1f569da9)) +* Support string literal inputs for AI functions ([#2152](https://github.com/googleapis/python-bigquery-dataframes/issues/2152)) ([7600001](https://github.com/googleapis/python-bigquery-dataframes/commit/760000122dc190ac8a3303234cf4cbee1bbb9493)) + + +### Bug Fixes + +* Address typo in error message ([#2142](https://github.com/googleapis/python-bigquery-dataframes/issues/2142)) ([cdf2dd5](https://github.com/googleapis/python-bigquery-dataframes/commit/cdf2dd55a0c03da50ab92de09788cafac0abf6f6)) +* Avoid possible circular imports in global session ([#2115](https://github.com/googleapis/python-bigquery-dataframes/issues/2115)) ([095c0b8](https://github.com/googleapis/python-bigquery-dataframes/commit/095c0b85a25a2e51087880909597cc62a0341c93)) +* Fix too many cluster columns requested by caching ([#2155](https://github.com/googleapis/python-bigquery-dataframes/issues/2155)) 
([35c1c33](https://github.com/googleapis/python-bigquery-dataframes/commit/35c1c33b85d1b92e402aab73677df3ffe43a51b4)) +* Show progress even in job optional queries ([#2119](https://github.com/googleapis/python-bigquery-dataframes/issues/2119)) ([1f48d3a](https://github.com/googleapis/python-bigquery-dataframes/commit/1f48d3a62e7e6dac4acb39e911daf766b8e2fe62)) +* Yield row count from read session if otherwise unknown ([#2148](https://github.com/googleapis/python-bigquery-dataframes/issues/2148)) ([8997d4d](https://github.com/googleapis/python-bigquery-dataframes/commit/8997d4d7d9965e473195f98c550c80657035b7e1)) + + +### Documentation + +* Add a brief intro notebook for bbq AI functions ([#2150](https://github.com/googleapis/python-bigquery-dataframes/issues/2150)) ([1f434fb](https://github.com/googleapis/python-bigquery-dataframes/commit/1f434fb5c7c00601654b3ab19c6ad7fceb258bd6)) +* Fix ai function related docs ([#2149](https://github.com/googleapis/python-bigquery-dataframes/issues/2149)) ([93a0749](https://github.com/googleapis/python-bigquery-dataframes/commit/93a0749392b84f27162654fe5ea5baa329a23f99)) +* Remove progress bar from getting started template ([#2143](https://github.com/googleapis/python-bigquery-dataframes/issues/2143)) ([d13abad](https://github.com/googleapis/python-bigquery-dataframes/commit/d13abadbcd68d03997e8dc11bb7a2b14bbd57fcc)) + ## [2.24.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.23.0...v2.24.0) (2025-10-07) diff --git a/bigframes/version.py b/bigframes/version.py index 93445c0c0d..0236e8236e 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.24.0" +__version__ = "2.25.0" # {x-release-please-start-date} -__release_date__ = "2025-10-07" +__release_date__ = "2025-10-13" # {x-release-please-end} diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 93445c0c0d..0236e8236e 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.24.0" +__version__ = "2.25.0" # {x-release-please-start-date} -__release_date__ = "2025-10-07" +__release_date__ = "2025-10-13" # {x-release-please-end}
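
As a plain-pandas reference for the shift/diff semantics that PATCH 18 and PATCH 20 lower to SQL (LAG/LEAD for ShiftOp, subtraction or inequality against the shifted column for DiffOp, and DATE_DIFF rescaled to microseconds for DateSeriesDiffOp), here is a short, self-contained sketch. It uses made-up values and the public pandas API only, not bigframes internals:

import pandas as pd

# Numeric diff compiles to `col - LAG(col, 1) OVER (...)`.
s_int = pd.Series([1, 3, 6], dtype="Int64")
assert s_int.diff(1).equals(s_int - s_int.shift(1))

# Boolean diff compiles to `col <> LAG(col, 1) OVER (...)`; pandas uses XOR
# for boolean diffs, which is equivalent to != on nullable booleans.
s_bool = pd.Series([True, True, False], dtype="boolean")
assert s_bool.diff(1).equals(s_bool != s_bool.shift(1))

# Date diff compiles to CAST(DATE_DIFF(col, LAG(col, 1), DAY) * 86400000000
# AS INT64): whole days re-expressed in microseconds (24 * 60 * 60 * 1_000_000).
s_date = pd.Series(pd.to_datetime(["2025-01-01", "2025-01-03"]))
micros = s_date.diff(1).dt.days * 86_400_000_000  # [NaN, 172800000000.0]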