From 7fcd4b656380cdfb0a8e409d7bc1baa46748f10d Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Tue, 22 Jul 2025 21:33:36 +0000 Subject: [PATCH 1/8] I am working on adding support for pyarrow.Scalar to infer_literal_method. --- bigframes/dtypes.py | 2 ++ setup.py | 1 + tests/unit/core/test_dtypes.py | 12 ++++++++++++ 3 files changed, 15 insertions(+) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 0be31505df..a58619dc21 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -659,6 +659,8 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]: def infer_literal_type(literal) -> typing.Optional[Dtype]: # Maybe also normalize literal to canonical python representation to remove this burden from compilers? + if isinstance(literal, pa.Scalar): + return arrow_dtype_to_bigframes_dtype(literal.type) if pd.api.types.is_list_like(literal): element_types = [infer_literal_type(i) for i in literal] common_type = lcd_type(*element_types) diff --git a/setup.py b/setup.py index 63d019caa0..9b777ed8d9 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,7 @@ "matplotlib >=3.7.1", "db-dtypes >=1.4.2", # For vendored ibis-framework. 
+ "ibis-framework==6.2.0", "atpublic>=2.3,<6", "python-dateutil>=2.8.2,<3", "pytz>=2022.7", diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py index 77392bea2f..3cce7f6f02 100644 --- a/tests/unit/core/test_dtypes.py +++ b/tests/unit/core/test_dtypes.py @@ -272,3 +272,15 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal(): ValueError, ): bigframes.core.compile.ibis_types.literal_to_ibis_scalar({"mykey": "myval"}) + + +@pytest.mark.parametrize( + ["scalar", "expected_dtype"], + [ + (pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE), + (pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE), + (pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE), + ], +) +def test_infer_literal_type_arrow_scalar(scalar, expected_dtype): + assert bigframes.dtypes.infer_literal_type(scalar) == expected_dtype From a083fa1faed0ecfa404adbbb192536332d71c196 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 22 Jul 2025 16:36:01 -0500 Subject: [PATCH 2/8] Update setup.py --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 9b777ed8d9..63d019caa0 100644 --- a/setup.py +++ b/setup.py @@ -61,7 +61,6 @@ "matplotlib >=3.7.1", "db-dtypes >=1.4.2", # For vendored ibis-framework. 
- "ibis-framework==6.2.0", "atpublic>=2.3,<6", "python-dateutil>=2.8.2,<3", "pytz>=2022.7", From 64c8872034cff05262a42134c80cc964122695bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 22 Jul 2025 16:37:35 -0500 Subject: [PATCH 3/8] Update tests/unit/core/test_dtypes.py --- tests/unit/core/test_dtypes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py index 3cce7f6f02..cd23614bbf 100644 --- a/tests/unit/core/test_dtypes.py +++ b/tests/unit/core/test_dtypes.py @@ -280,6 +280,10 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal(): (pa.scalar(1_000_000_000, type=pa.int64()), bigframes.dtypes.INT_DTYPE), (pa.scalar(True, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE), (pa.scalar("hello", type=pa.string()), bigframes.dtypes.STRING_DTYPE), + # Support NULL scalars. + (pa.scalar(None, type=pa.int64()), bigframes.dtypes.INT_DTYPE), + (pa.scalar(None, type=pa.bool_()), bigframes.dtypes.BOOL_DTYPE), + (pa.scalar(None, type=pa.string()), bigframes.dtypes.STRING_DTYPE), ], ) def test_infer_literal_type_arrow_scalar(scalar, expected_dtype): From 29bdab54f43dec517bb05798db064f92fea36f0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 23 Jul 2025 11:43:20 -0500 Subject: [PATCH 4/8] patch ibis --- tests/system/small/test_dataframe.py | 50 ++++++++++++++++--- .../ibis/common/temporal.py | 5 ++ .../ibis/expr/datatypes/value.py | 12 +++++ .../ibis/formats/pyarrow.py | 2 - 4 files changed, 61 insertions(+), 8 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index caf39bd9e9..bc773d05b2 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -906,15 +906,53 @@ def test_df_to_pandas_batches(scalars_dfs): assert_pandas_df_equal(pd.concat(filtered_batches), pd_result) -def test_assign_new_column(scalars_dfs): +@pytest.mark.parametrize( + ("literal", 
"expected_dtype"), + ( + pytest.param( + 2, + dtypes.INT_DTYPE, + id="INT64", + ), + # ==================================================================== + # NULL values + # + # These are regression tests for b/428999884. It needs to be possible to + # set a column to NULL with a desired type (not just the pandas default + # of float64). + # ==================================================================== + pytest.param(None, dtypes.FLOAT_DTYPE, id="NULL-None"), + pytest.param( + pa.scalar(None, type=pa.int64()), + dtypes.INT_DTYPE, + id="NULL-pyarrow-INT64", + ), + pytest.param( + pa.scalar(None, type=pa.timestamp("us", tz="UTC")), + dtypes.TIMESTAMP_DTYPE, + id="NULL-pyarrow-TIMESTAMP", + ), + pytest.param( + pa.scalar(None, type=pa.timestamp("us")), + dtypes.DATETIME_DTYPE, + id="NULL-pyarrow-DATETIME", + ), + ), +) +def test_assign_new_column_w_literal(scalars_dfs, literal, expected_dtype): scalars_df, scalars_pandas_df = scalars_dfs - kwargs = {"new_col": 2} - df = scalars_df.assign(**kwargs) + df = scalars_df.assign(new_col=literal) bf_result = df.to_pandas() - pd_result = scalars_pandas_df.assign(**kwargs) - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") + new_col_pd = literal + if isinstance(literal, pa.Scalar): + # PyArrow integer scalars aren't yet supported in pandas Int64Dtype. + new_col_pd = literal.as_py() + + # Pandas might not pick the same dtype as BigFrames, but it should at least + # be castable to it. 
+ pd_result = scalars_pandas_df.assign(new_col=new_col_pd) + pd_result["new_col"] = pd_result["new_col"].astype(expected_dtype) assert_pandas_df_equal(bf_result, pd_result) diff --git a/third_party/bigframes_vendored/ibis/common/temporal.py b/third_party/bigframes_vendored/ibis/common/temporal.py index 1b0e4fa985..8d84caf5a1 100644 --- a/third_party/bigframes_vendored/ibis/common/temporal.py +++ b/third_party/bigframes_vendored/ibis/common/temporal.py @@ -260,3 +260,8 @@ def _from_numpy_datetime64(value): raise TypeError("Unable to convert np.datetime64 without pandas") else: return pd.Timestamp(value).to_pydatetime() + + +@normalize_datetime.register("pyarrow.Scalar") +def _from_pyarrow_scalar(value): + return value.as_py() diff --git a/third_party/bigframes_vendored/ibis/expr/datatypes/value.py b/third_party/bigframes_vendored/ibis/expr/datatypes/value.py index e390cea02c..85be0ac749 100644 --- a/third_party/bigframes_vendored/ibis/expr/datatypes/value.py +++ b/third_party/bigframes_vendored/ibis/expr/datatypes/value.py @@ -27,6 +27,7 @@ import bigframes_vendored.ibis.expr.datatypes as dt from bigframes_vendored.ibis.expr.datatypes.cast import highest_precedence from public import public +import pyarrow as pa import toolz @@ -71,6 +72,14 @@ def infer_list(values: Sequence[Any]) -> dt.Array: return dt.Array(highest_precedence(map(infer, values))) +@infer.register("pyarrow.Scalar") +def infer_pyarrow_scalar(value: "pa.Scalar"): + """Infer the type of a PyArrow Scalar value.""" + import bigframes_vendored.ibis.formats.pyarrow + + return bigframes_vendored.ibis.formats.pyarrow.PyArrowType.to_ibis(value.type) + + @infer.register(datetime.time) def infer_time(value: datetime.time) -> dt.Time: return dt.time @@ -253,6 +262,9 @@ def infer_shapely_multipolygon(value) -> dt.MultiPolygon: def normalize(typ, value): """Ensure that the Python type underlying a literal resolves to a single type.""" + if pa is not None and isinstance(value, pa.Scalar): + value = value.as_py() 
+ dtype = dt.dtype(typ) if value is None: if not dtype.nullable: diff --git a/third_party/bigframes_vendored/ibis/formats/pyarrow.py b/third_party/bigframes_vendored/ibis/formats/pyarrow.py index a6861b52e1..491e551ec1 100644 --- a/third_party/bigframes_vendored/ibis/formats/pyarrow.py +++ b/third_party/bigframes_vendored/ibis/formats/pyarrow.py @@ -24,7 +24,6 @@ @functools.cache def _from_pyarrow_types(): import pyarrow as pa - import pyarrow_hotfix # noqa: F401 return { pa.int8(): dt.Int8, @@ -87,7 +86,6 @@ class PyArrowType(TypeMapper): def to_ibis(cls, typ: pa.DataType, nullable=True) -> dt.DataType: """Convert a pyarrow type to an ibis type.""" import pyarrow as pa - import pyarrow_hotfix # noqa: F401 if pa.types.is_null(typ): return dt.null From 526d472166cfe876edf75702442387fa4dcb6b7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 23 Jul 2025 12:13:20 -0500 Subject: [PATCH 5/8] increase timeout --- tests/system/small/test_session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 4bb1c6589a..f48304afc6 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -606,7 +606,7 @@ def test_read_gbq_wildcard( "query": { "useQueryCache": True, "maximumBytesBilled": "1000000000", - "timeoutMs": 10000, + "timeoutMs":120_000, } }, pytest.param( From 81d17579923c1f4056c715af8bd7a359058863b4 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Wed, 23 Jul 2025 17:15:40 +0000 Subject: [PATCH 6/8] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- tests/system/small/test_session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_session.py 
b/tests/system/small/test_session.py index f48304afc6..a04da64af0 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -606,7 +606,7 @@ def test_read_gbq_wildcard( "query": { "useQueryCache": True, "maximumBytesBilled": "1000000000", - "timeoutMs":120_000, + "timeoutMs": 120_000, } }, pytest.param( From 2598b7333563303827ee1635b9aab64be14ad081 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Wed, 23 Jul 2025 17:16:36 +0000 Subject: [PATCH 7/8] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- tests/system/small/test_session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index f48304afc6..a04da64af0 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -606,7 +606,7 @@ def test_read_gbq_wildcard( "query": { "useQueryCache": True, "maximumBytesBilled": "1000000000", - "timeoutMs":120_000, + "timeoutMs": 120_000, } }, pytest.param( From ea0792a4d426d72d519ce6d7699f4cd7d924982e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Wed, 23 Jul 2025 12:17:04 -0500 Subject: [PATCH 8/8] lint --- tests/system/small/test_session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index f48304afc6..a04da64af0 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -606,7 +606,7 @@ def test_read_gbq_wildcard( "query": { "useQueryCache": True, "maximumBytesBilled": "1000000000", - "timeoutMs":120_000, + "timeoutMs": 120_000, } }, pytest.param(