feat: Add bbq.json_value_array and deprecate bbq.json_extract_string_array by chelsea-lin · Pull Request #1818 · googleapis/python-bigquery-dataframes · GitHub
[go: up one dir, main page]

Skip to content

feat: Add bbq.json_value_array and deprecate bbq.json_extract_string_array #1818

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
json_query_array,
json_set,
json_value,
json_value_array,
parse_json,
)
from bigframes.bigquery._operations.search import create_vector_index, vector_search
Expand Down Expand Up @@ -71,6 +72,7 @@
"json_query_array",
"json_set",
"json_value",
"json_value_array",
"parse_json",
# search ops
"create_vector_index",
Expand Down
66 changes: 65 additions & 1 deletion bigframes/bigquery/_operations/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,10 @@ def json_extract_string_array(
values in the array. This function uses single quotes and brackets to escape
invalid JSONPath characters in JSON keys.

.. deprecated:: 2.6.0
The ``json_extract_string_array`` is deprecated and will be removed in a future version.
Use ``json_value_array`` instead.

**Examples:**

>>> import bigframes.pandas as bpd
Expand Down Expand Up @@ -233,6 +237,11 @@ def json_extract_string_array(
Returns:
bigframes.series.Series: A new Series with the parsed arrays from the input.
"""
msg = (
"The `json_extract_string_array` is deprecated and will be removed in a future version. "
"Use `json_value_array` instead."
)
warnings.warn(bfe.format_message(msg), category=UserWarning)
array_series = input._apply_unary_op(
ops.JSONExtractStringArray(json_path=json_path)
)
Expand Down Expand Up @@ -334,7 +343,7 @@ def json_query_array(

def json_value(
input: series.Series,
json_path: str,
json_path: str = "$",
) -> series.Series:
"""Extracts a JSON scalar value and converts it to a SQL ``STRING`` value. In
addition, this function:
Expand Down Expand Up @@ -366,6 +375,61 @@ def json_value(
return input._apply_unary_op(ops.JSONValue(json_path=json_path))


def json_value_array(
    input: series.Series,
    json_path: str = "$",
) -> series.Series:
    """
    Extracts a JSON array of scalar values and converts it to a SQL ``ARRAY<STRING>``
    value. In addition, this function:

    - Removes the outermost quotes and unescapes the values.
    - Returns a SQL ``NULL`` if the selected value isn't an array, or is an array
      that contains non-scalar values.
    - Uses double quotes to escape invalid ``JSON_PATH`` characters in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
        >>> bbq.json_value_array(s)
        0    ['1' '2' '3']
        1        ['4' '5']
        dtype: list<item: string>[pyarrow]

        >>> s = bpd.Series([
        ...   '{"fruits": ["apples", "oranges", "grapes"]}',
        ...   '{"fruits": ["guava", "grapes"]}'
        ... ])
        >>> bbq.json_value_array(s, "$.fruits")
        0    ['apples' 'oranges' 'grapes']
        1               ['guava' 'grapes']
        dtype: list<item: string>[pyarrow]

        >>> s = bpd.Series([
        ...   '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
        ...   '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
        ... ])
        >>> bbq.json_value_array(s, "$.fruits.names")
        0    ['apple' 'cherry']
        1    ['guava' 'grapes']
        dtype: list<item: string>[pyarrow]

    Args:
        input (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
        json_path (str, default "$"):
            The JSON path identifying the data that you want to obtain from the input.
            The default ``"$"`` selects the root of each JSON document.

    Returns:
        bigframes.series.Series: A new Series with the parsed arrays from the input.

    Raises:
        TypeError: If the input Series is neither a JSON nor a JSON-formatted
            string type.
    """
    return input._apply_unary_op(ops.JSONValueArray(json_path=json_path))


@utils.preview(name="The JSON-related API `parse_json`")
def parse_json(
input: series.Series,
Expand Down
12 changes: 12 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1448,6 +1448,11 @@ def json_value_op_impl(x: ibis_types.Value, op: ops.JSONValue):
return json_value(json_obj=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONValueArray, pass_op=True)
def json_value_array_op_impl(x: ibis_types.Value, op: ops.JSONValueArray):
    """Compile ``JSONValueArray`` by applying the ``json_value_array`` builtin.

    The op instance is passed in (``pass_op=True``) so that its ``json_path``
    can be forwarded to the builtin together with the operand ``x``.
    """
    path = op.json_path
    return json_value_array(json_obj=x, json_path=path)


# Blob Ops
@scalar_op_compiler.register_unary_op(ops.obj_fetch_metadata_op)
def obj_fetch_metadata_op_impl(obj_ref: ibis_types.Value):
Expand Down Expand Up @@ -2157,6 +2162,13 @@ def json_value( # type: ignore[empty-body]
"""Retrieve value of a JSON field as plain STRING."""


@ibis_udf.scalar.builtin(name="json_value_array")
def json_value_array( # type: ignore[empty-body]
    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
) -> ibis_dtypes.Array[ibis_dtypes.String]:
    """Extracts a JSON array and converts it to a SQL ARRAY of STRINGs.

    Declared through ``ibis_udf.scalar.builtin`` under the name
    ``json_value_array``. The Python body is intentionally empty (hence the
    ``empty-body`` type-ignore); only the signature is declared here.
    NOTE(review): presumably this resolves to the backend's own
    ``JSON_VALUE_ARRAY`` function at SQL-compile time -- confirm against ibis.

    Args:
        json_obj: The JSON expression to read from.
        json_path: The JSON path selecting the array inside ``json_obj``.

    Returns:
        An array-of-strings expression.
    """


@ibis_udf.scalar.builtin(name="INT64")
def cast_json_to_int64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body]
"""Converts a JSON number to a SQL INT64 value."""
Expand Down
2 changes: 2 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@
JSONQueryArray,
JSONSet,
JSONValue,
JSONValueArray,
ParseJSON,
ToJSONString,
)
Expand Down Expand Up @@ -363,6 +364,7 @@
"JSONQueryArray",
"JSONSet",
"JSONValue",
"JSONValueArray",
"ParseJSON",
"ToJSONString",
# Bool ops
Expand Down
17 changes: 17 additions & 0 deletions bigframes/operations/json_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,23 @@ def output_type(self, *input_types):
return dtypes.STRING_DTYPE


@dataclasses.dataclass(frozen=True)
class JSONValueArray(base_ops.UnaryOp):
    """Unary op selecting a JSON array at ``json_path`` as a list of strings."""

    name: typing.ClassVar[str] = "json_value_array"
    # JSON path that identifies the array to extract from each input value.
    json_path: str

    def output_type(self, *input_types):
        """Return the list-of-strings Arrow dtype for a JSON-like operand.

        Raises:
            TypeError: If the first input type is not JSON-like.
        """
        operand_type = input_types[0]
        if dtypes.is_json_like(operand_type):
            string_arrow_type = dtypes.bigframes_dtype_to_arrow_dtype(
                dtypes.STRING_DTYPE
            )
            return pd.ArrowDtype(pa.list_(string_arrow_type))

        raise TypeError(
            "Input type must be a valid JSON object or JSON-formatted string type."
            f" Received type: {operand_type}"
        )


@dataclasses.dataclass(frozen=True)
class JSONQuery(base_ops.UnaryOp):
name: typing.ClassVar[str] = "json_query"
Expand Down
52 changes: 51 additions & 1 deletion tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,10 @@ def test_json_extract_array_w_invalid_series_type():

def test_json_extract_string_array_from_json_strings():
    """The deprecated API still parses string arrays and warns on use."""
    s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
    # Call the deprecated function only inside ``pytest.warns`` so the
    # deprecation warning is asserted here instead of leaking into the test run.
    with pytest.warns(
        UserWarning, match="The `json_extract_string_array` is deprecated"
    ):
        actual = bbq.json_extract_string_array(s, "$.a")
    expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]])

    pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())
Expand Down Expand Up @@ -214,6 +217,53 @@ def test_json_extract_string_array_w_invalid_series_type():
bbq.json_extract_string_array(s)


def test_json_value_array_from_json_strings():
    """``json_value_array`` reads the array under ``$.a`` from JSON strings."""
    json_strings = bpd.Series(
        ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']
    )
    result = bbq.json_value_array(json_strings, "$.a")

    # JSON_VALUE_ARRAY produces ARRAY<STRING>, i.e. an Arrow list of strings.
    string_list_dtype = pd.ArrowDtype(pa.list_(pa.string()))
    expected = bpd.Series(
        [["ab", "2", "3 xy"], [], ["4", "5"]], dtype=string_list_dtype
    )

    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())


def test_json_value_array_from_array_strings():
    """With the default path, top-level JSON arrays are returned as string lists."""
    array_strings = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
    result = bbq.json_value_array(array_strings)

    string_list_dtype = pd.ArrowDtype(pa.list_(pa.string()))
    expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]], dtype=string_list_dtype)

    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())


def test_json_value_array_w_invalid_series_type():
    """An integer Series is not JSON-like, so the call must raise ``TypeError``."""
    int_series = bpd.Series([1, 2], dtype=dtypes.INT_DTYPE)

    with pytest.raises(TypeError):
        bbq.json_value_array(int_series)


def test_json_value_array_from_json_native():
    """Native JSON input yields string lists, and a missing value when the key is absent."""
    native_json = bpd.Series(
        [
            '{"key": ["hello", "world"]}',
            '{"key": ["123", "45.6"]}',
            '{"key": []}',
            "{}",  # case with missing key
        ],
        dtype=dtypes.JSON_DTYPE,
    )
    result = bbq.json_value_array(native_json, json_path="$.key")

    expected = bpd.Series(
        [["hello", "world"], ["123", "45.6"], [], None],
        dtype=pd.ArrowDtype(pa.list_(pa.string())),
    ).fillna(pd.NA)

    # Normalise missing values on the result side as well before comparing.
    pd.testing.assert_series_equal(
        result.to_pandas().fillna(pd.NA), expected.to_pandas()
    )


def test_json_query_from_json():
s = bpd.Series(
['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'],
Expand Down
0