diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index cdc3718893..7ca7fb693b 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -43,6 +43,7 @@
     json_query_array,
     json_set,
     json_value,
+    json_value_array,
     parse_json,
 )
 from bigframes.bigquery._operations.search import create_vector_index, vector_search
@@ -71,6 +72,7 @@
     "json_query_array",
     "json_set",
     "json_value",
+    "json_value_array",
     "parse_json",
     # search ops
     "create_vector_index",
diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py
index 00d230d684..7ad7855dba 100644
--- a/bigframes/bigquery/_operations/json.py
+++ b/bigframes/bigquery/_operations/json.py
@@ -196,6 +196,10 @@ def json_extract_string_array(
     values in the array. This function uses single quotes and brackets to escape
     invalid JSONPath characters in JSON keys.
 
+    .. deprecated:: 2.6.0
+        The ``json_extract_string_array`` is deprecated and will be removed in a future version.
+        Use ``json_value_array`` instead.
+
     **Examples:**
 
         >>> import bigframes.pandas as bpd
@@ -233,6 +237,11 @@ def json_extract_string_array(
     Returns:
         bigframes.series.Series: A new Series with the parsed arrays from the input.
     """
+    msg = (
+        "The `json_extract_string_array` is deprecated and will be removed in a future version. "
+        "Use `json_value_array` instead."
+    )
+    warnings.warn(bfe.format_message(msg), category=UserWarning)
     array_series = input._apply_unary_op(
         ops.JSONExtractStringArray(json_path=json_path)
     )
@@ -334,7 +343,7 @@ def json_query_array(
 
 def json_value(
     input: series.Series,
-    json_path: str,
+    json_path: str = "$",
 ) -> series.Series:
     """Extracts a JSON scalar value and converts it to a SQL ``STRING`` value.
     In addtion, this function:
@@ -366,6 +375,61 @@ def json_value(
     return input._apply_unary_op(ops.JSONValue(json_path=json_path))
 
 
+def json_value_array(
+    input: series.Series,
+    json_path: str = "$",
+) -> series.Series:
+    """
+    Extracts a JSON array of scalar values and converts it to a SQL ``ARRAY<STRING>``
+    value. In addition, this function:
+
+    - Removes the outermost quotes and unescapes the values.
+    - Returns a SQL ``NULL`` if the selected value isn't an array or not an array
+      containing only scalar values.
+    - Uses double quotes to escape invalid ``JSON_PATH`` characters in JSON keys.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
+        >>> bbq.json_value_array(s)
+        0    ['1' '2' '3']
+        1        ['4' '5']
+        dtype: list<item: string>[pyarrow]
+
+        >>> s = bpd.Series([
+        ...     '{"fruits": ["apples", "oranges", "grapes"]}',
+        ...     '{"fruits": ["guava", "grapes"]}'
+        ... ])
+        >>> bbq.json_value_array(s, "$.fruits")
+        0    ['apples' 'oranges' 'grapes']
+        1               ['guava' 'grapes']
+        dtype: list<item: string>[pyarrow]
+
+        >>> s = bpd.Series([
+        ...     '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
+        ...     '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
+        ... ])
+        >>> bbq.json_value_array(s, "$.fruits.names")
+        0    ['apple' 'cherry']
+        1    ['guava' 'grapes']
+        dtype: list<item: string>[pyarrow]
+
+    Args:
+        input (bigframes.series.Series):
+            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
+        json_path (str):
+            The JSON path identifying the data that you want to obtain from the input.
+
+    Returns:
+        bigframes.series.Series: A new Series with the parsed arrays from the input.
+    """
+    return input._apply_unary_op(ops.JSONValueArray(json_path=json_path))
+
+
 @utils.preview(name="The JSON-related API `parse_json`")
 def parse_json(
     input: series.Series,
diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index b819b1c4e2..075089bb7a 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -1448,6 +1448,11 @@ def json_value_op_impl(x: ibis_types.Value, op: ops.JSONValue):
     return json_value(json_obj=x, json_path=op.json_path)
 
 
+@scalar_op_compiler.register_unary_op(ops.JSONValueArray, pass_op=True)
+def json_value_array_op_impl(x: ibis_types.Value, op: ops.JSONValueArray):
+    return json_value_array(json_obj=x, json_path=op.json_path)
+
+
 # Blob Ops
 @scalar_op_compiler.register_unary_op(ops.obj_fetch_metadata_op)
 def obj_fetch_metadata_op_impl(obj_ref: ibis_types.Value):
@@ -2157,6 +2162,13 @@ def json_value(  # type: ignore[empty-body]
     """Retrieve value of a JSON field as plain STRING."""
 
 
+@ibis_udf.scalar.builtin(name="json_value_array")
+def json_value_array(  # type: ignore[empty-body]
+    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
+) -> ibis_dtypes.Array[ibis_dtypes.String]:
+    """Extracts a JSON array and converts it to a SQL ARRAY of STRINGs."""
+
+
 @ibis_udf.scalar.builtin(name="INT64")
 def cast_json_to_int64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64:  # type: ignore[empty-body]
     """Converts a JSON number to a SQL INT64 value."""
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index 291bf17fa5..86098d47cf 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -112,6 +112,7 @@
     JSONQueryArray,
     JSONSet,
     JSONValue,
+    JSONValueArray,
     ParseJSON,
     ToJSONString,
 )
@@ -363,6 +364,7 @@
     "JSONQueryArray",
     "JSONSet",
     "JSONValue",
+    "JSONValueArray",
     "ParseJSON",
     "ToJSONString",
     # Bool ops
diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py
index 95a47dcadb..81f00c39ce 100644
--- a/bigframes/operations/json_ops.py
+++ b/bigframes/operations/json_ops.py
@@ -153,6 +153,23 @@ def output_type(self, *input_types):
         return dtypes.STRING_DTYPE
 
 
+@dataclasses.dataclass(frozen=True)
+class JSONValueArray(base_ops.UnaryOp):
+    name: typing.ClassVar[str] = "json_value_array"
+    json_path: str
+
+    def output_type(self, *input_types):
+        input_type = input_types[0]
+        if not dtypes.is_json_like(input_type):
+            raise TypeError(
+                "Input type must be a valid JSON object or JSON-formatted string type."
+                + f" Received type: {input_type}"
+            )
+        return pd.ArrowDtype(
+            pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE))
+        )
+
+
 @dataclasses.dataclass(frozen=True)
 class JSONQuery(base_ops.UnaryOp):
     name: typing.ClassVar[str] = "json_query"
diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py
index 4ad16d6cc8..4ecbd01318 100644
--- a/tests/system/small/bigquery/test_json.py
+++ b/tests/system/small/bigquery/test_json.py
@@ -186,7 +186,10 @@ def test_json_extract_array_w_invalid_series_type():
 
 def test_json_extract_string_array_from_json_strings():
     s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
-    actual = bbq.json_extract_string_array(s, "$.a")
+    with pytest.warns(
+        UserWarning, match="The `json_extract_string_array` is deprecated"
+    ):
+        actual = bbq.json_extract_string_array(s, "$.a")
     expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]])
     pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())
 
@@ -214,6 +217,53 @@ def test_json_extract_string_array_w_invalid_series_type():
         bbq.json_extract_string_array(s)
 
 
+def test_json_value_array_from_json_strings():
+    s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
+    actual = bbq.json_value_array(s, "$.a")
+    expected_data = [["ab", "2", "3 xy"], [], ["4", "5"]]
+    # Expected dtype after JSON_VALUE_ARRAY is ARRAY<STRING>
+    expected = bpd.Series(expected_data, dtype=pd.ArrowDtype(pa.list_(pa.string())))
+    pd.testing.assert_series_equal(
+        actual.to_pandas(),
+        expected.to_pandas(),
+    )
+
+
+def test_json_value_array_from_array_strings():
+    s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
+    actual = bbq.json_value_array(s)
+    expected_data = [["1", "2", "3"], [], ["4", "5"]]
+    expected = bpd.Series(expected_data, dtype=pd.ArrowDtype(pa.list_(pa.string())))
+    pd.testing.assert_series_equal(
+        actual.to_pandas(),
+        expected.to_pandas(),
+    )
+
+
+def test_json_value_array_w_invalid_series_type():
+    s = bpd.Series([1, 2], dtype=dtypes.INT_DTYPE)  # Not a JSON-like string
+    with pytest.raises(TypeError):
+        bbq.json_value_array(s)
+
+
+def test_json_value_array_from_json_native():
+    json_data = [
+        '{"key": ["hello", "world"]}',
+        '{"key": ["123", "45.6"]}',
+        '{"key": []}',
+        "{}",  # case with missing key
+    ]
+    s = bpd.Series(json_data, dtype=dtypes.JSON_DTYPE)
+    actual = bbq.json_value_array(s, json_path="$.key")
+
+    expected_data_pandas = [["hello", "world"], ["123", "45.6"], [], None]
+    expected = bpd.Series(
+        expected_data_pandas, dtype=pd.ArrowDtype(pa.list_(pa.string()))
+    ).fillna(pd.NA)
+    result_pd = actual.to_pandas().fillna(pd.NA)
+    pd.testing.assert_series_equal(result_pd, expected.to_pandas())
+
+
 def test_json_query_from_json():
     s = bpd.Series(
         ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'],