feat: Series.str.split by chelsea-lin · Pull Request #675 · googleapis/python-bigquery-dataframes · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,11 @@ def endswith_op_impl(x: ibis_types.Value, op: ops.EndsWithOp):
return any_match if any_match is not None else ibis_types.literal(False)


@scalar_op_compiler.register_unary_op(ops.StringSplitOp, pass_op=True)
def stringsplit_op_impl(x: ibis_types.Value, op: ops.StringSplitOp):
    """Compile ``StringSplitOp`` by splitting the string value on ``op.pat``."""
    # The cast only narrows the static type so that `.split` is visible to
    # type checkers; it has no runtime effect.
    str_value = typing.cast(ibis_types.StringValue, x)
    return str_value.split(op.pat)


@scalar_op_compiler.register_unary_op(ops.ZfillOp, pass_op=True)
def zfill_op_impl(x: ibis_types.Value, op: ops.ZfillOp):
str_value = typing.cast(ibis_types.StringValue, x)
Expand Down
6 changes: 6 additions & 0 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,12 @@ def bigframes_dtype_to_ibis_dtype(
return BIGFRAMES_TO_IBIS[bigframes_dtype]


def bigframes_dtype_to_arrow_dtype(
    bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]]
) -> pa.DataType:
    """Convert a bigframes dtype to a pyarrow data type.

    The conversion is done in two hops: bigframes dtype -> ibis dtype ->
    arrow dtype, reusing the existing converters for each hop.

    Args:
        bigframes_dtype: The bigframes dtype (or dtype string / numpy dtype)
            to convert.

    Returns:
        The pyarrow data type produced from the intermediate ibis dtype.
    """
    ibis_dtype = bigframes_dtype_to_ibis_dtype(bigframes_dtype)
    return ibis_dtype_to_arrow_dtype(ibis_dtype)


def literal_to_ibis_scalar(
literal, force_dtype: typing.Optional[Dtype] = None, validate: bool = True
):
Expand Down
17 changes: 14 additions & 3 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,19 @@ def output_type(self, *input_types):
return op_typing.STRING_PREDICATE.output_type(input_types[0])


@dataclasses.dataclass(frozen=True)
class StringSplitOp(UnaryOp):
    """Unary op that splits each string around a literal delimiter.

    Attributes are documented inline below.
    """

    name: typing.ClassVar[str] = "str_split"
    # Literal delimiter to split on. This is a single string (not a tuple of
    # alternatives): it is handed to the compiler as one separator.
    pat: str

    def output_type(self, *input_types):
        """Return the result dtype: an Arrow-backed list of the input string type.

        Args:
            *input_types: Dtypes of the op's inputs; only the first is used.

        Returns:
            A ``pd.ArrowDtype`` wrapping ``list<arrow type of the input>``.

        Raises:
            TypeError: If the input is not a pandas ``StringDtype``.
        """
        input_type = input_types[0]
        if not isinstance(input_type, pd.StringDtype):
            raise TypeError("str_split input must be a string type")
        arrow_type = dtypes.bigframes_dtype_to_arrow_dtype(input_type)
        return pd.ArrowDtype(pa.list_(arrow_type))


@dataclasses.dataclass(frozen=True)
class EndsWithOp(UnaryOp):
name: typing.ClassVar[str] = "str_endswith"
Expand Down Expand Up @@ -463,9 +476,7 @@ def output_type(self, *input_types):
raise TypeError("field accessor input must be a struct type")

pa_result_type = pa_type[self.name_or_index].type
# TODO: Directly convert from arrow to pandas type
ibis_result_type = dtypes.arrow_dtype_to_ibis_dtype(pa_result_type)
return dtypes.ibis_dtype_to_bigframes_dtype(ibis_result_type)
return dtypes.arrow_dtype_to_bigframes_dtype(pa_result_type)


@dataclasses.dataclass(frozen=True)
Expand Down
12 changes: 12 additions & 0 deletions bigframes/operations/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,18 @@ def endswith(
pat = (pat,)
return self._apply_unary_op(ops.EndsWithOp(pat=pat))

def split(
    self,
    pat: str = " ",
    regex: Union[bool, None] = None,
) -> series.Series:
    # A pattern counts as a regular expression when regex=True, or when
    # regex is left as None and the pattern is longer than one character.
    # Neither case is supported, so reject them before building the op.
    wants_regex = regex is True or (regex is None and len(pat) > 1)
    if wants_regex:
        raise NotImplementedError(
            "Regular expressions aren't currently supported. Please set "
            + f"`regex=False` and try again. {constants.FEEDBACK_LINK}"
        )

    split_op = ops.StringSplitOp(pat=pat)
    return self._apply_unary_op(split_op)

def zfill(self, width: int) -> series.Series:
    # Delegate to the unary ZfillOp, carrying the requested width.
    zfill_op = ops.ZfillOp(width=width)
    return self._apply_unary_op(zfill_op)

Expand Down
31 changes: 31 additions & 0 deletions tests/system/small/operations/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,3 +531,34 @@ def test_str_rjust(scalars_dfs):
pd_result,
bf_result,
)


# Cases that request (or imply) regex splitting are expected to fail with
# NotImplementedError.
_SPLIT_REGEX_XFAIL = pytest.mark.xfail(raises=NotImplementedError)


@pytest.mark.parametrize(
    ("pat", "regex"),
    [
        pytest.param(" ", None, id="one_char"),
        pytest.param("ll", False, id="two_chars"),
        pytest.param(" ", True, id="one_char_reg", marks=_SPLIT_REGEX_XFAIL),
        pytest.param("ll", None, id="two_chars_reg", marks=_SPLIT_REGEX_XFAIL),
    ],
)
def test_str_split_raise_errors(scalars_dfs, pat, regex):
    """Compare ``Series.str.split`` against pandas for literal patterns.

    The regex-flavoured parameter sets are marked xfail because they raise
    ``NotImplementedError``.
    """
    scalars_df, scalars_pandas_df = scalars_dfs
    col_name = "string_col"

    # Evaluate the bigframes side first so unsupported cases raise before
    # pandas is consulted.
    bf_series = scalars_df[col_name]
    bf_result = bf_series.str.split(pat=pat, regex=regex).to_pandas()

    pd_series = scalars_pandas_df[col_name]
    pd_result = pd_series.str.split(pat=pat, regex=regex)

    def _null_to_empty_list(x):
        # `is True` matters: for a list, pd.isnull returns an array, which
        # must not be treated as a truthy scalar.
        if pd.isnull(x) is True:
            return []
        return x

    # TODO(b/336880368): Allow for NULL values for ARRAY columns in BigQuery.
    pd_result = pd_result.apply(_null_to_empty_list)

    assert_series_equal(pd_result, bf_result, check_dtype=False)
48 changes: 48 additions & 0 deletions third_party/bigframes_vendored/pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,6 +940,54 @@ def endswith(
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def split(
    self,
    pat: str = " ",
    regex: typing.Union[bool, None] = None,
):
    """
    Split strings around given separator/delimiter.

    Each string is split on the literal separator ``pat`` and the pieces
    are returned as a list. Null input values produce an empty list.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import numpy as np
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(
        ...     [
        ...         "a regular sentence",
        ...         "https://docs.python.org/index.html",
        ...         np.nan
        ...     ]
        ... )
        >>> s.str.split()
        0                ['a' 'regular' 'sentence']
        1    ['https://docs.python.org/index.html']
        2                                        []
        dtype: list<item: string>[pyarrow]

    The pat parameter can be used to split by other characters.

        >>> s.str.split("//", regex=False)
        0                     ['a regular sentence']
        1    ['https:' 'docs.python.org/index.html']
        2                                         []
        dtype: list<item: string>[pyarrow]

    Args:
        pat (str, default " "):
            String to split on. If not specified, split on a single space.
        regex (bool, default None):
            Determines if the passed-in pattern is a regular expression.
            Regular expressions aren't currently supported. Please set
            `regex=False` when `pat` is longer than one character.

    Returns:
        bigframes.series.Series: Series whose values are lists of strings.

    Raises:
        NotImplementedError: If `regex` is True, or if `regex` is None and
            `pat` is longer than one character.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def match(self, pat: str, case: bool = True, flags: int = 0):
"""
Determine if each string starts with a match of a regular expression.
Expand Down
0