From 21b044f63d58c73d26080e6e1559d70f808fc70d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 28 Sep 2023 17:28:20 -0500 Subject: [PATCH 1/9] feat: support STRUCT data type with `Series.struct.field` to extract subfields --- bigframes/dtypes.py | 66 +++++++++++++++++-- bigframes/operations/structs.py | 47 +++++++++++++ .../pandas/core/arrays/__init__.py | 0 .../pandas/core/arrays/arrow/__init__.py | 0 .../pandas/core/arrays/arrow/accessors.py | 63 ++++++++++++++++++ 5 files changed, 172 insertions(+), 4 deletions(-) create mode 100644 bigframes/operations/structs.py create mode 100644 third_party/bigframes_vendored/pandas/core/arrays/__init__.py create mode 100644 third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py create mode 100644 third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 271b8aa2f2..644fda0913 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -84,10 +84,10 @@ BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, Dtype]] = ( (ibis_dtypes.boolean, pd.BooleanDtype()), + (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), (ibis_dtypes.float64, pd.Float64Dtype()), (ibis_dtypes.int64, pd.Int64Dtype()), (ibis_dtypes.string, pd.StringDtype(storage="pyarrow")), - (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), (ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))), (ibis_dtypes.Timestamp(timezone=None), pd.ArrowDtype(pa.timestamp("us"))), ( @@ -100,6 +100,19 @@ pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS } +IBIS_TO_ARROW: Dict[ibis_dtypes.DataType, pa.DataType] = { + ibis_dtypes.boolean: pa.bool_(), + ibis_dtypes.date: pa.date32(), + ibis_dtypes.float64: pa.float64(), + ibis_dtypes.int64: pa.int64(), + ibis_dtypes.string: pa.string(), + ibis_dtypes.time: pa.time64("us"), + ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"), + ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"), +} + +ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} + IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, Union[Dtype, np.dtype[Any]]] = { ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS } @@ -148,11 +161,12 @@ def ibis_dtype_to_bigframes_dtype( # Special cases: Ibis supports variations on these types, but currently # our IO returns them as objects. Eventually, we should support them as # ArrowDType (and update the IO accordingly) - if isinstance(ibis_dtype, ibis_dtypes.Array) or isinstance( - ibis_dtype, ibis_dtypes.Struct - ): + if isinstance(ibis_dtype, ibis_dtypes.Array): return np.dtype("O") + if isinstance(ibis_dtype, ibis_dtypes.Struct): + return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) + if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] elif isinstance(ibis_dtype, ibis_dtypes.Null): @@ -164,6 +178,29 @@ def ibis_dtype_to_bigframes_dtype( ) +def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: + if isinstance(ibis_dtype, ibis_dtypes.Array): + return pa.list_(ibis_dtype_to_arrow_dtype(ibis_dtype.value_type)) + + if isinstance(ibis_dtype, ibis_dtypes.Struct): + return pa.struct( + [ + (name, ibis_dtype_to_arrow_dtype(dtype)) + for name, dtype in ibis_dtype.fields.items() + ] + ) + + if ibis_dtype in IBIS_TO_ARROW: + return IBIS_TO_ARROW[ibis_dtype] + elif isinstance(ibis_dtype, ibis_dtypes.Null): + # Fallback to STRING for NULL values for most flexibility in SQL. + return IBIS_TO_ARROW[ibis_dtypes.string] + else: + raise ValueError( + f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}" + ) + + def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: """Converts an Ibis expression to canonical type. @@ -187,6 +224,24 @@ def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table: return table.select(*casted_columns) +def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType: + if pa.types.is_struct(arrow_dtype): + struct_dtype = typing.cast(pa.StructType, arrow_dtype) + return ibis_dtypes.Struct.from_tuples( + [ + (field.name, arrow_dtype_to_ibis_dtype(field.type)) + for field in struct_dtype + ] + ) + + if arrow_dtype in ARROW_TO_IBIS: + return ARROW_TO_IBIS[arrow_dtype] + else: + raise ValueError( + f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}" + ) + + def bigframes_dtype_to_ibis_dtype( bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] ) -> ibis_dtypes.DataType: @@ -202,6 +257,9 @@ def bigframes_dtype_to_ibis_dtype( Raises: ValueError: If passed a dtype not supported by BigQuery DataFrames. """ + if isinstance(bigframes_dtype, pd.ArrowDtype): + return arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype) + type_string = str(bigframes_dtype) if type_string in BIGFRAMES_STRING_TO_BIGFRAMES: bigframes_dtype = BIGFRAMES_STRING_TO_BIGFRAMES[ diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py new file mode 100644 index 0000000000..f12c97981b --- /dev/null +++ b/bigframes/operations/structs.py @@ -0,0 +1,47 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import typing + +import ibis.expr.types as ibis_types + +import bigframes.dataframe +import bigframes.operations +import bigframes.operations.base +import bigframes.series +import third_party.bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors + + +class StructField(bigframes.operations.UnaryOp): + def __init__(self, name_or_index: str | int): + self._name_or_index = name_or_index + + def _as_ibis(self, x: ibis_types.Value): + struct_value = typing.cast(ibis_types.StructValue, x) + if isinstance(self._name_or_index, str): + name = self._name_or_index + else: + name = struct_value.names[self._name_or_index] + return struct_value[name] + + +class StructAccessor( + bigframes.operations.base.SeriesMethods, vendoracessors.StructAccessor +): + __doc__ = vendoracessors.StructAccessor.__doc__ + + def field(self, name_or_index: str | int) -> bigframes.series.Series: + return self._apply_unary_op(StructField(name_or_index)) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/__init__.py b/third_party/bigframes_vendored/pandas/core/arrays/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py new file mode 100644 index 0000000000..cabb3566ee --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -0,0 +1,63 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arrays/arrow/accessors.py +"""Accessors for arrow-backed data.""" + +from __future__ import annotations + +from bigframes import constants + + +class StructAccessor: + """ + Accessor object for structured data properties of the Series values. + """ + + def field(self, name_or_index: str | int): + """ + Extract a child field of a struct as a Series. + + Parameters + ---------- + name_or_index : str | int + Name or index of the child field to extract. + + Returns + ------- + pandas.Series + The data corresponding to the selected child field. + + See Also + -------- + Series.struct.explode : Return all child fields as a DataFrame. + + Examples + -------- + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract by field name. + + >>> s.struct.field("project") + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + Extract by field index. + + >>> s.struct.field(0) + 0 1 + 1 2 + 2 1 + Name: version, dtype: int64[pyarrow] + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 05105a8de1bd9e1510fa62def8e16849a725c8d8 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 29 Sep 2023 10:55:02 -0500 Subject: [PATCH 2/9] implement explode --- bigframes/dataframe.py | 10 +++++- bigframes/operations/base.py | 10 +++++- bigframes/operations/structs.py | 18 ++++++++-- bigframes/series.py | 5 +++ .../pandas/core/arrays/arrow/accessors.py | 36 +++++++++++++++++++ 5 files changed, 75 insertions(+), 4 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0d357e7c3d..5a3834f84f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -161,7 +161,15 @@ def __init__( columns=columns, # type:ignore dtype=dtype, # type:ignore ) - if pd_dataframe.size < MAX_INLINE_DF_SIZE: + if ( + pd_dataframe.size < MAX_INLINE_DF_SIZE + # TODO(swast): Workaround data types limitation in inline data. + and not any( + dt.pyarrow_dtype + for dt in pd_dataframe.dtypes + if isinstance(dt, pandas.ArrowDtype) + ) + ): self._block = blocks.block_from_local( pd_dataframe, session or bigframes.pandas.get_global_session() ) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index add6af57f4..51eaad18b9 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -86,7 +86,15 @@ def __init__( if pd_series.name is None: # to_frame will set default numeric column label if unnamed, but we do not support int column label, so must rename pd_dataframe = pd_dataframe.set_axis(["unnamed_col"], axis=1) - if pd_dataframe.size < MAX_INLINE_SERIES_SIZE: + if ( + pd_dataframe.size < MAX_INLINE_SERIES_SIZE + # TODO(swast): Workaround data types limitation in inline data. + and not any( + dt.pyarrow_dtype + for dt in pd_dataframe.dtypes + if isinstance(dt, pd.ArrowDtype) + ) + ): self._block = blocks.block_from_local( pd_dataframe, session or bigframes.pandas.get_global_session() ) diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index f12c97981b..80d51115d0 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -35,7 +35,7 @@ def _as_ibis(self, x: ibis_types.Value): name = self._name_or_index else: name = struct_value.names[self._name_or_index] - return struct_value[name] + return struct_value[name].name(name) class StructAccessor( @@ -44,4 +44,18 @@ class StructAccessor( __doc__ = vendoracessors.StructAccessor.__doc__ def field(self, name_or_index: str | int) -> bigframes.series.Series: - return self._apply_unary_op(StructField(name_or_index)) + series = self._apply_unary_op(StructField(name_or_index)) + if isinstance(name_or_index, str): + name = name_or_index + else: + struct_field = self._dtype.pyarrow_dtype[name_or_index] + name = struct_field.name + return series.rename(name) + + def explode(self) -> bigframes.dataframe.DataFrame: + import bigframes.pandas + + pa_type = self._dtype.pyarrow_dtype + return bigframes.pandas.concat( + [self.field(i) for i in range(pa_type.num_fields)], axis="columns" + ) diff --git a/bigframes/series.py b/bigframes/series.py index c1c0cb0537..5efe7b3365 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -51,6 +51,7 @@ import bigframes.operations.base import bigframes.operations.datetimes as dt import bigframes.operations.strings as strings +import bigframes.operations.structs as structs import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series LevelType = typing.Union[str, int] @@ -118,6 +119,10 @@ def query_job(self) -> Optional[bigquery.QueryJob]: self._set_internal_query_job(self._compute_dry_run()) return self._query_job + @property + def struct(self) -> structs.StructAccessor: + return structs.StructAccessor(self._block) + def _set_internal_query_job(self, query_job: bigquery.QueryJob): self._query_job = query_job diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index cabb3566ee..7268775f25 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -61,3 +61,39 @@ def field(self, name_or_index: str | int): Name: version, dtype: int64[pyarrow] """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def explode(self): + """ + Extract all child fields of a struct as a DataFrame. + + Returns + ------- + pandas.DataFrame + The data corresponding to all child fields. + + See Also + -------- + Series.struct.field : Return a single child field as a Series. + + Examples + -------- + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + >>> s.struct.explode() + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 31290425af4b161fc0c395d133540d4592b1c2e4 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 10:46:23 -0500 Subject: [PATCH 3/9] fix docstrings --- noxfile.py | 2 +- .../pandas/core/arrays/arrow/accessors.py | 129 +++++++++--------- .../bigframes_vendored/sklearn/__init__.py | 0 .../sklearn/ensemble/__init__.py | 0 .../bigframes_vendored/xgboost/__init__.py | 0 5 files changed, 63 insertions(+), 68 deletions(-) create mode 100644 third_party/bigframes_vendored/sklearn/__init__.py create mode 100644 third_party/bigframes_vendored/sklearn/ensemble/__init__.py create mode 100644 third_party/bigframes_vendored/xgboost/__init__.py diff --git a/noxfile.py b/noxfile.py index 033bbfefe4..da9dff92fe 100644 --- a/noxfile.py +++ b/noxfile.py @@ -362,7 +362,7 @@ def doctest(session: nox.sessions.Session): run_system( session=session, prefix_name="doctest", - extra_pytest_options=("--doctest-modules",), + extra_pytest_options=("--doctest-modules", "third_party"), test_folder="bigframes", check_cov=True, ) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index 7268775f25..8e3ea06a3d 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -15,50 +15,45 @@ def field(self, name_or_index: str | int): """ Extract a child field of a struct as a Series. - Parameters - ---------- - name_or_index : str | int - Name or index of the child field to extract. - - Returns - ------- - pandas.Series - The data corresponding to the selected child field. - - See Also - -------- - Series.struct.explode : Return all child fields as a DataFrame. - - Examples - -------- - >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> s = bpd.Series( - ... [ - ... {"version": 1, "project": "pandas"}, - ... {"version": 2, "project": "pandas"}, - ... {"version": 1, "project": "numpy"}, - ... ], - ... dtype=bpd.ArrowDtype(pa.struct( - ... [("version", pa.int64()), ("project", pa.string())] - ... )) - ... ) + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) Extract by field name. - >>> s.struct.field("project") - 0 pandas - 1 pandas - 2 numpy - Name: project, dtype: string[pyarrow] + >>> s.struct.field("project") + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string Extract by field index. - >>> s.struct.field(0) - 0 1 - 1 2 - 2 1 - Name: version, dtype: int64[pyarrow] + >>> s.struct.field(0) + 0 1 + 1 2 + 2 1 + Name: version, dtype: Int64 + + Args: + name_or_index: + Name (str) or index (int) of the child field to extract. + + Returns: + Series: + The data corresponding to the selected child field. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -66,34 +61,34 @@ def explode(self): """ Extract all child fields of a struct as a DataFrame. - Returns - ------- - pandas.DataFrame - The data corresponding to all child fields. - - See Also - -------- - Series.struct.field : Return a single child field as a Series. - - Examples - -------- - >>> import bigframes.pandas as bpd - >>> import pyarrow as pa - >>> s = bpd.Series( - ... [ - ... {"version": 1, "project": "pandas"}, - ... {"version": 2, "project": "pandas"}, - ... {"version": 1, "project": "numpy"}, - ... ], - ... dtype=bpd.ArrowDtype(pa.struct( - ... [("version", pa.int64()), ("project", pa.string())] - ... )) - ... ) - - >>> s.struct.explode() - version project - 0 1 pandas - 1 2 pandas - 2 1 numpy + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract all child fields. + + >>> s.struct.explode() + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy + + [3 rows x 2 columns] + + Returns: + DataFrame: + The data corresponding to all child fields. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/__init__.py b/third_party/bigframes_vendored/sklearn/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/sklearn/ensemble/__init__.py b/third_party/bigframes_vendored/sklearn/ensemble/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/xgboost/__init__.py b/third_party/bigframes_vendored/xgboost/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From f4671fce4134bbcd88715a6beff639f47978f478 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 10:57:52 -0500 Subject: [PATCH 4/9] add unit tests --- tests/unit/test_dtypes.py | 64 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index bb8ae570dc..3baff2e1f5 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -85,6 +85,70 @@ def test_ibis_float32_raises_unexpected_datatype(): bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_dtypes.float32) +IBIS_ARROW_DTYPES = ( + (ibis_dtypes.boolean, pa.bool_()), + (ibis_dtypes.date, pa.date32()), + (ibis_dtypes.Timestamp(), pa.timestamp("us")), + (ibis_dtypes.float64, pa.float64()), + ( + ibis_dtypes.Timestamp(timezone="UTC"), + pa.timestamp("us", tz="UTC"), + ), + ( + ibis_dtypes.Struct.from_tuples( + [ + ("name", ibis_dtypes.string()), + ("version", ibis_dtypes.int64()), + ] + ), + pa.struct( + [ + ("name", pa.string()), + ("version", pa.int64()), + ] + ), + ), + ( + ibis_dtypes.Struct.from_tuples( + [ + ( + "nested", + ibis_dtypes.Struct.from_tuples( + [ + ("field", ibis_dtypes.string()), + ] + ), + ), + ] + ), + pa.struct( + [ + ( + "nested", + pa.struct( + [ + ("field", pa.string()), + ] + ), + ), + ] + ), + ), +) + + +@pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES) +def test_arrow_dtype_to_ibis_dtype(ibis_dtype, arrow_dtype): + result = bigframes.dtypes.arrow_dtype_to_ibis_dtype(arrow_dtype) + assert result == ibis_dtype + + +@pytest.mark.parametrize(("ibis_dtype", "arrow_dtype"), IBIS_ARROW_DTYPES) +def test_ibis_dtype_to_arrow_dtype(ibis_dtype, arrow_dtype): + result = bigframes.dtypes.ibis_dtype_to_arrow_dtype(ibis_dtype) + assert result == arrow_dtype + + @pytest.mark.parametrize( ["bigframes_dtype", "ibis_dtype"], [ From a18370888ef720235b2b75d9bbc788ff727cf123 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 11:37:09 -0500 Subject: [PATCH 5/9] update struct dtype tests --- tests/system/small/test_dataframe.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e71b1430e6..43dfbed426 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -884,7 +884,19 @@ def test_get_dtypes_array_struct(session): dtypes = df.dtypes pd.testing.assert_series_equal( dtypes, - pd.Series({"array_column": np.dtype("O"), "struct_column": np.dtype("O")}), + pd.Series( + { + "array_column": np.dtype("O"), + "struct_column": pd.ArrowDtype( + pa.struct( + [ + ("string_field", pa.string()), + ("float_field", pa.float64()), + ] + ) + ), + } + ), ) From d600a1c7e53a686f14f3959f150be78b5edb4241 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 12:49:36 -0500 Subject: [PATCH 6/9] cleanup before doctest --- noxfile.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/noxfile.py b/noxfile.py index a113e1fcde..1ce3965d6e 100644 --- a/noxfile.py +++ b/noxfile.py @@ -275,6 +275,20 @@ def install_systemtest_dependencies(session, install_test_extra, *constraints): session.install("-e", ".", *constraints) +def clean_pycache(): + paths = CURRENT_DIRECTORY.glob("**/__pycache__/**/*") + for path in paths: + path.unlink() + + paths = CURRENT_DIRECTORY.glob("**/__pycache__") + for path in paths: + path.rmdir() + + paths = CURRENT_DIRECTORY.glob("**/*.pyc") + for path in paths: + path.unlink() + + def run_system( session: nox.sessions.Session, prefix_name, @@ -286,6 +300,7 @@ def run_system( extra_pytest_options=(), ): """Run the system test suite.""" + clean_pycache() constraints_path = str( CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" ) From 9e90b1919401dd9a3869f07a317f2797b7868423 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 12:58:49 -0500 Subject: [PATCH 7/9] alternative workaround for mismatch import error --- noxfile.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/noxfile.py b/noxfile.py index 1ce3965d6e..15c87746f4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -275,20 +275,6 @@ def install_systemtest_dependencies(session, install_test_extra, *constraints): session.install("-e", ".", *constraints) -def clean_pycache(): - paths = CURRENT_DIRECTORY.glob("**/__pycache__/**/*") - for path in paths: - path.unlink() - - paths = CURRENT_DIRECTORY.glob("**/__pycache__") - for path in paths: - path.rmdir() - - paths = CURRENT_DIRECTORY.glob("**/*.pyc") - for path in paths: - path.unlink() - - def run_system( session: nox.sessions.Session, prefix_name, @@ -300,7 +286,6 @@ def run_system( extra_pytest_options=(), ): """Run the system test suite.""" - clean_pycache() constraints_path = str( CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" ) @@ -374,6 +359,9 @@ def system_noextras(session: nox.sessions.Session): @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS[-1]) def doctest(session: nox.sessions.Session): """Run the system test suite.""" + # Workaround https://github.com/pytest-dev/pytest/issues/9567 + os.environ["PY_IGNORE_IMPORTMISMATCH"] = "1" + run_system( session=session, prefix_name="doctest", From e55be81e42a8c8a7f92fe15241ce6929947a558e Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 13:00:30 -0500 Subject: [PATCH 8/9] alternative workaround for mismatch import error --- .kokoro/build.sh | 3 +++ noxfile.py | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.kokoro/build.sh b/.kokoro/build.sh index a0fa4bc787..58eaa7fedf 100755 --- a/.kokoro/build.sh +++ b/.kokoro/build.sh @@ -26,6 +26,9 @@ cd "${PROJECT_ROOT}" # Disable buffering, so that the logs stream through. export PYTHONUNBUFFERED=1 +# Workaround https://github.com/pytest-dev/pytest/issues/9567 +export PY_IGNORE_IMPORTMISMATCH=1 + # Debug: show build environment env | grep KOKORO diff --git a/noxfile.py b/noxfile.py index 15c87746f4..a113e1fcde 100644 --- a/noxfile.py +++ b/noxfile.py @@ -359,9 +359,6 @@ def system_noextras(session: nox.sessions.Session): @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS[-1]) def doctest(session: nox.sessions.Session): """Run the system test suite.""" - # Workaround https://github.com/pytest-dev/pytest/issues/9567 - os.environ["PY_IGNORE_IMPORTMISMATCH"] = "1" - run_system( session=session, prefix_name="doctest", From 6c133143f450949295424da5c03b3f96eac7529a Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 2 Oct 2023 17:07:30 -0500 Subject: [PATCH 9/9] remove dead ibis null to arrow check --- bigframes/dtypes.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 85b473ebbd..46a7a1cb50 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -192,9 +192,6 @@ def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: if ibis_dtype in IBIS_TO_ARROW: return IBIS_TO_ARROW[ibis_dtype] - elif isinstance(ibis_dtype, ibis_dtypes.Null): - # Fallback to STRING for NULL values for most flexibility in SQL. - return IBIS_TO_ARROW[ibis_dtypes.string] else: raise ValueError( f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}"