From 14367943bb7e33987da4b701b8eeaf9efe2e6182 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 9 May 2024 22:37:54 +0000 Subject: [PATCH 1/3] feat: Series.str.split --- bigframes/core/compile/scalar_op_compiler.py | 5 ++ bigframes/dtypes.py | 6 +++ bigframes/operations/__init__.py | 17 +++++-- bigframes/operations/strings.py | 12 +++++ tests/system/small/operations/test_strings.py | 13 +++++ .../pandas/core/strings/accessor.py | 48 +++++++++++++++++++ 6 files changed, 98 insertions(+), 3 deletions(-) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index a65ff6fe0c..06d889beaa 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -588,6 +588,11 @@ def endswith_op_impl(x: ibis_types.Value, op: ops.EndsWithOp): return any_match if any_match is not None else ibis_types.literal(False) +@scalar_op_compiler.register_unary_op(ops.StringSplitOp, pass_op=True) +def stringsplit_op_impl(x: ibis_types.Value, op: ops.StringSplitOp): + return typing.cast(ibis_types.StringValue, x).split(op.pat) + + @scalar_op_compiler.register_unary_op(ops.ZfillOp, pass_op=True) def zfill_op_impl(x: ibis_types.Value, op: ops.ZfillOp): str_value = typing.cast(ibis_types.StringValue, x) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index d2dc210e0d..2a344aff2d 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -405,6 +405,12 @@ def bigframes_dtype_to_ibis_dtype( return BIGFRAMES_TO_IBIS[bigframes_dtype] +def bigframes_dtype_to_arrow_dtype( + bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] +) -> pa.DataType: + return ibis_dtype_to_arrow_dtype(bigframes_dtype_to_ibis_dtype(bigframes_dtype)) + + def literal_to_ibis_scalar( literal, force_dtype: typing.Optional[Dtype] = None, validate: bool = True ): diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index a7c385a2b8..ce8b38d536 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -366,6 +366,19 @@ def output_type(self, *input_types): return op_typing.STRING_PREDICATE.output_type(input_types[0]) +@dataclasses.dataclass(frozen=True) +class StringSplitOp(UnaryOp): + name: typing.ClassVar[str] = "str_split" + pat: typing.Sequence[str] + + def output_type(self, *input_types): + input_type = input_types[0] + if not isinstance(input_type, pd.StringDtype): + raise TypeError("field accessor input must be a string type") + arrow_type = dtypes.bigframes_dtype_to_arrow_dtype(input_type) + return pd.ArrowDtype(pa.list_(arrow_type)) + + @dataclasses.dataclass(frozen=True) class EndsWithOp(UnaryOp): name: typing.ClassVar[str] = "str_endswith" @@ -443,9 +456,7 @@ def output_type(self, *input_types): raise TypeError("field accessor input must be a struct type") pa_result_type = pa_type[self.name_or_index].type - # TODO: Directly convert from arrow to pandas type - ibis_result_type = dtypes.arrow_dtype_to_ibis_dtype(pa_result_type) - return dtypes.ibis_dtype_to_bigframes_dtype(ibis_result_type) + return dtypes.arrow_dtype_to_bigframes_dtype(pa_result_type) @dataclasses.dataclass(frozen=True) diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 883d19a1e3..4f4c3594a6 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -247,6 +247,18 @@ def endswith( pat = (pat,) return self._apply_unary_op(ops.EndsWithOp(pat=pat)) + def split( + self, + pat: str = " ", + regex: bool | None = None, + ) -> series.Series: + if regex is True or (regex is None and len(pat) > 1): + raise NotImplementedError( + "Regular expressions aren't currently supported. Please set " + + f"`regex=False` and try again. {constants.FEEDBACK_LINK}" + ) + return self._apply_unary_op(ops.StringSplitOp(pat=pat)) + def zfill(self, width: int) -> series.Series: return self._apply_unary_op(ops.ZfillOp(width=width)) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 9654c77ec4..4f6c71df98 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -531,3 +531,16 @@ def test_str_rjust(scalars_dfs): pd_result, bf_result, ) + + +def test_str_split(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_series: bigframes.series.Series = scalars_df[col_name] + bf_result = bf_series.str.split(" ").to_pandas() + pd_result = scalars_pandas_df[col_name].str.split(" ") + + # TODO(b/336880368): Allow for NULL values for ARRAY columns in BigQuery. + pd_result = pd_result.apply(lambda x: [] if pd.isnull(x) is True else x) + + assert_series_equal(pd_result, bf_result, check_dtype=False) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index 5bb69dc1f2..fc360a4e50 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -940,6 +940,54 @@ def endswith( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def split( + self, + pat: str = " ", + regex: bool | None = None, + ): + """ + Split strings around given separator/delimiter. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series( + ... [ + ... "a regular sentence", + ... "https://docs.python.org/index.html", + ... np.nan + ... ] + ... ) + >>> s.str.split() + 0 ['a' 'regular' 'sentence'] + 1 ['https://docs.python.org/index.html'] + 2 [] + dtype: list[pyarrow] + + The pat parameter can be used to split by other characters. + + >>> s.str.split("//", regex=False) + 0 ['a regular sentence'] + 1 ['https:' 'docs.python.org/index.html'] + 2 [] + dtype: list[pyarrow] + + Args: + pat (str, default " "): + String to split on. If not specified, split on whitespace. + regex (bool, default None): + Determines if the passed-in pattern is a regular expression. Regular + expressions aren't currently supported. Please set `regex=False` when + `pat` length is not 1. + + Returns: + bigframes.series.Series: Type matches caller. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def match(self, pat: str, case: bool = True, flags: int = 0): """ Determine if each string starts with a match of a regular expression. From eb05e3d101bb41ffc444df7678ddd8f826ab913c Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 9 May 2024 23:17:54 +0000 Subject: [PATCH 2/3] add more tests --- tests/system/small/operations/test_strings.py | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 4f6c71df98..b8a8ad2d1e 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -533,12 +533,30 @@ def test_str_rjust(scalars_dfs): ) -def test_str_split(scalars_dfs): +@pytest.mark.parametrize( + ("pat", "regex"), + [ + pytest.param(" ", None, id="one_char"), + pytest.param("ll", False, id="two_chars"), + pytest.param( + " ", + True, + id="one_char_reg", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + pytest.param( + "ll", + None, + id="two_chars_reg", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + ], +) +def test_str_split_raise_errors(scalars_dfs, pat, regex): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" - bf_series: bigframes.series.Series = scalars_df[col_name] - bf_result = bf_series.str.split(" ").to_pandas() - pd_result = scalars_pandas_df[col_name].str.split(" ") + bf_result = scalars_df[col_name].str.split(pat=pat, regex=regex).to_pandas() + pd_result = scalars_pandas_df[col_name].str.split(pat=pat, regex=regex) # TODO(b/336880368): Allow for NULL values for ARRAY columns in BigQuery. pd_result = pd_result.apply(lambda x: [] if pd.isnull(x) is True else x) From 354bf9339ccd38525387df1c483347583f8d073a Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 9 May 2024 23:45:05 +0000 Subject: [PATCH 3/3] format fix --- bigframes/operations/strings.py | 2 +- third_party/bigframes_vendored/pandas/core/strings/accessor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 4f4c3594a6..22c325d7e0 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -250,7 +250,7 @@ def endswith( def split( self, pat: str = " ", - regex: bool | None = None, + regex: Union[bool, None] = None, ) -> series.Series: if regex is True or (regex is None and len(pat) > 1): raise NotImplementedError( diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index fc360a4e50..b02c23f945 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -943,7 +943,7 @@ def endswith( def split( self, pat: str = " ", - regex: bool | None = None, + regex: typing.Union[bool, None] = None, ): """ Split strings around given separator/delimiter.