From 8a2c0227060f02d6e6e3b3bae1e3cfffc8739caa Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 7 May 2025 21:24:35 +0000 Subject: [PATCH] feat: add blob.exif function support --- bigframes/blob/_functions.py | 32 +++++++++++++++ bigframes/operations/blob.py | 40 +++++++++++++++++++ .../experimental/multimodal_dataframe.ipynb | 2 +- tests/system/large/blob/test_function.py | 24 +++++++++++ 4 files changed, 97 insertions(+), 1 deletion(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index f7f035bff4..f8fdb21946 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -108,6 +108,38 @@ def udf(self): return self._session.read_gbq_function(udf_name) +def exif_func(src_obj_ref_rt: str) -> str: + import io + import json + + from PIL import ExifTags, Image + import requests + from requests import adapters + + session = requests.Session() + session.mount("https://", adapters.HTTPAdapter(max_retries=3)) + + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) + + src_url = src_obj_ref_rt_json["access_urls"]["read_url"] + + response = session.get(src_url, timeout=30) + bts = response.content + + image = Image.open(io.BytesIO(bts)) + exif_data = image.getexif() + exif_dict = {} + if exif_data: + for tag, value in exif_data.items(): + tag_name = ExifTags.TAGS.get(tag, tag) + exif_dict[tag_name] = value + + return json.dumps(exif_dict) + + +exif_func_def = FunctionDef(exif_func, ["pillow", "requests"]) + + # Blur images. Takes ObjectRefRuntime as JSON string. Outputs ObjectRefRuntime JSON string. 
def image_blur_func( src_obj_ref_rt: str, dst_obj_ref_rt: str, ksize_x: int, ksize_y: int, ext: str diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index d211c2b918..8da88d1ff8 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -300,6 +300,46 @@ def get_runtime_json_str( runtime = self._get_runtime(mode=mode, with_metadata=with_metadata) return runtime._apply_unary_op(ops.ToJSONString()) + def exif( + self, + *, + connection: Optional[str] = None, + max_batching_rows: int = 8192, + container_cpu: Union[float, int] = 0.33, + container_memory: str = "512Mi", + ) -> bigframes.series.Series: + """Extract EXIF data. Currently only image types are supported. + + Args: + connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. + max_batching_rows (int, default 8,192): Max number of rows per batch sent to Cloud Run to execute the function. + container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers. + container_memory (str, default "512Mi"): container memory size. String in the format of a number followed by a unit, e.g. "512Mi". Possible values are from 512Mi to 32Gi. + + Returns: + bigframes.series.Series: JSON series of key-value pairs. 
+ """ + + import bigframes.bigquery as bbq + import bigframes.blob._functions as blob_func + + connection = self._resolve_connection(connection) + df = self.get_runtime_json_str(mode="R").to_frame() + + exif_udf = blob_func.TransformFunction( + blob_func.exif_func_def, + session=self._block.session, + connection=connection, + max_batching_rows=max_batching_rows, + container_cpu=container_cpu, + container_memory=container_memory, + ).udf() + + res = self._df_apply_udf(df, exif_udf) + res = bbq.parse_json(res) + + return res + def image_blur( self, ksize: tuple[int, int], diff --git a/notebooks/experimental/multimodal_dataframe.ipynb b/notebooks/experimental/multimodal_dataframe.ipynb index 4a0cd57a45..05b133a345 100644 --- a/notebooks/experimental/multimodal_dataframe.ipynb +++ b/notebooks/experimental/multimodal_dataframe.ipynb @@ -106,7 +106,7 @@ }, "source": [ "### 1. Create Multimodal DataFrame\n", - "There are several ways to create Multimodal DataFrame. The easiest way is from the wiledcard paths." + "There are several ways to create Multimodal DataFrame. The easiest way is from the wildcard paths." 
] }, { diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 5913df8add..c189d249a7 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -51,6 +51,30 @@ def images_output_uris(images_output_folder: str) -> list[str]: ] +def test_blob_exif( + bq_connection: str, + test_session: bigframes.Session, +): + exif_image_df = test_session.from_glob_path( + "gs://bigframes_blob_test/images_exif/*", + name="blob_col", + connection=bq_connection, + ) + + actual = exif_image_df["blob_col"].blob.exif(connection=bq_connection) + expected = bpd.Series( + ['{"ExifOffset": 47, "Make": "MyCamera"}'], + session=test_session, + dtype=dtypes.JSON_DTYPE, + ) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + check_dtype=False, + check_index_type=False, + ) + + def test_blob_image_blur_to_series( images_mm_df: bpd.DataFrame, bq_connection: str,