From f3b0f542ab516c768de6dfc230b9f9e9eed567ba Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Tue, 9 Jul 2024 13:14:46 -0500 Subject: [PATCH 01/36] chore(python): use python 3.10 for docs build (#828) Source-Link: https://github.com/googleapis/synthtool/commit/9ae07858520bf035a3d5be569b5a65d960ee4392 Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:52210e0e0559f5ea8c52be148b33504022e1faef4e95fbe4b32d68022af2fa7e Co-authored-by: Owl Bot Co-authored-by: Anthonios Partheniou --- .github/.OwlBot.lock.yaml | 4 +-- .kokoro/docker/docs/Dockerfile | 21 +++++++------ .kokoro/docker/docs/requirements.txt | 40 +++++++++++++----------- .kokoro/requirements.txt | 46 ++++++++++++++-------------- 4 files changed, 59 insertions(+), 52 deletions(-) diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 91d742b5b9..f30cb3775a 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:d3de8a02819f65001effcbd3ea76ce97e9bcff035c7a89457f40f892c87c5b32 -# created: 2024-07-03T17:43:00.77142528Z + digest: sha256:52210e0e0559f5ea8c52be148b33504022e1faef4e95fbe4b32d68022af2fa7e +# created: 2024-07-08T19:25:35.862283192Z diff --git a/.kokoro/docker/docs/Dockerfile b/.kokoro/docker/docs/Dockerfile index a26ce61930..5205308b33 100644 --- a/.kokoro/docker/docs/Dockerfile +++ b/.kokoro/docker/docs/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ubuntu:22.04 +from ubuntu:24.04 ENV DEBIAN_FRONTEND noninteractive @@ -40,7 +40,6 @@ RUN apt-get update \ libssl-dev \ libsqlite3-dev \ portaudio19-dev \ - python3-distutils \ redis-server \ software-properties-common \ ssh \ @@ -60,18 +59,22 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* \ && rm -f /var/cache/apt/archives/*.deb -###################### Install python 3.9.13 -# Download python 3.9.13 -RUN wget https://www.python.org/ftp/python/3.9.13/Python-3.9.13.tgz +###################### Install python 3.10.14 for docs/docfx session + +# Download python 3.10.14 +RUN wget https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz # Extract files -RUN tar -xvf Python-3.9.13.tgz +RUN tar -xvf Python-3.10.14.tgz -# Install python 3.9.13 -RUN ./Python-3.9.13/configure --enable-optimizations +# Install python 3.10.14 +RUN ./Python-3.10.14/configure --enable-optimizations RUN make altinstall +RUN python3.10 -m venv /venv +ENV PATH /venv/bin:$PATH + ###################### Install pip RUN wget -O /tmp/get-pip.py 'https://bootstrap.pypa.io/get-pip.py' \ && python3 /tmp/get-pip.py \ @@ -84,4 +87,4 @@ RUN python3 -m pip COPY requirements.txt /requirements.txt RUN python3 -m pip install --require-hashes -r requirements.txt -CMD ["python3.8"] +CMD ["python3.10"] diff --git a/.kokoro/docker/docs/requirements.txt b/.kokoro/docker/docs/requirements.txt index 0e5d70f20f..7129c77155 100644 --- a/.kokoro/docker/docs/requirements.txt +++ b/.kokoro/docker/docs/requirements.txt @@ -4,9 +4,9 @@ # # pip-compile --allow-unsafe --generate-hashes requirements.in # -argcomplete==3.2.3 \ - --hash=sha256:bf7900329262e481be5a15f56f19736b376df6f82ed27576fa893652c5de6c23 \ - --hash=sha256:c12355e0494c76a2a7b73e3a59b09024ca0ba1e279fb9ed6c1b82d5b74b6a70c +argcomplete==3.4.0 \ + --hash=sha256:69a79e083a716173e5532e0fa3bef45f793f4e61096cf52b5a42c0211c8b8aa5 
\ + --hash=sha256:c2abcdfe1be8ace47ba777d4fce319eb13bf8ad9dace8d085dcad6eded88057f # via nox colorlog==6.8.2 \ --hash=sha256:3e3e079a41feb5a1b64f978b5ea4f46040a94f11f0e8bbb8261e3dbbeca64d44 \ @@ -16,23 +16,27 @@ distlib==0.3.8 \ --hash=sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784 \ --hash=sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64 # via virtualenv -filelock==3.13.1 \ - --hash=sha256:521f5f56c50f8426f5e03ad3b281b490a87ef15bc6c526f168290f0c7148d44e \ - --hash=sha256:57dbda9b35157b05fb3e58ee91448612eb674172fab98ee235ccb0b5bee19a1c +filelock==3.15.4 \ + --hash=sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb \ + --hash=sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7 # via virtualenv -nox==2024.3.2 \ - --hash=sha256:e53514173ac0b98dd47585096a55572fe504fecede58ced708979184d05440be \ - --hash=sha256:f521ae08a15adbf5e11f16cb34e8d0e6ea521e0b92868f684e91677deb974553 +nox==2024.4.15 \ + --hash=sha256:6492236efa15a460ecb98e7b67562a28b70da006ab0be164e8821177577c0565 \ + --hash=sha256:ecf6700199cdfa9e5ea0a41ff5e6ef4641d09508eda6edb89d9987864115817f # via -r requirements.in -packaging==24.0 \ - --hash=sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5 \ - --hash=sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9 +packaging==24.1 \ + --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ + --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 # via nox -platformdirs==4.2.0 \ - --hash=sha256:0614df2a2f37e1a662acbd8e2b25b92ccf8632929bc6d43467e17fe89c75e068 \ - --hash=sha256:ef0cc731df711022c174543cb70a9b5bd22e5a9337c8624ef2c2ceb8ddad8768 +platformdirs==4.2.2 \ + --hash=sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee \ + --hash=sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3 # via virtualenv -virtualenv==20.25.1 \ - --hash=sha256:961c026ac520bac5f69acb8ea063e8a4f071bcc9457b9c1f28f6b085c511583a \ - --hash=sha256:e08e13ecdca7a0bd53798f356d5831434afa5b07b93f0abdf0797b7a06ffe197 +tomli==2.0.1 \ + --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ + --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f + # via nox +virtualenv==20.26.3 \ + --hash=sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a \ + --hash=sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589 # via nox diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index 35ece0e4d2..9622baf0ba 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -20,9 +20,9 @@ cachetools==5.3.3 \ --hash=sha256:0abad1021d3f8325b2fc1d2e9c8b9c9d57b04c3932657a72465447332c24d945 \ --hash=sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105 # via google-auth -certifi==2024.6.2 \ - --hash=sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516 \ - --hash=sha256:ddc6c8ce995e6987e7faf5e3f1b02b302836a0e5d98ece18392cb1a36c72ad56 +certifi==2024.7.4 \ + --hash=sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b \ + --hash=sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90 # via requests cffi==1.16.0 \ --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ @@ -371,23 +371,23 @@ more-itertools==10.3.0 \ # via # jaraco-classes # jaraco-functools -nh3==0.2.17 \ - 
--hash=sha256:0316c25b76289cf23be6b66c77d3608a4fdf537b35426280032f432f14291b9a \ - --hash=sha256:1a814dd7bba1cb0aba5bcb9bebcc88fd801b63e21e2450ae6c52d3b3336bc911 \ - --hash=sha256:1aa52a7def528297f256de0844e8dd680ee279e79583c76d6fa73a978186ddfb \ - --hash=sha256:22c26e20acbb253a5bdd33d432a326d18508a910e4dcf9a3316179860d53345a \ - --hash=sha256:40015514022af31975c0b3bca4014634fa13cb5dc4dbcbc00570acc781316dcc \ - --hash=sha256:40d0741a19c3d645e54efba71cb0d8c475b59135c1e3c580f879ad5514cbf028 \ - --hash=sha256:551672fd71d06cd828e282abdb810d1be24e1abb7ae2543a8fa36a71c1006fe9 \ - --hash=sha256:66f17d78826096291bd264f260213d2b3905e3c7fae6dfc5337d49429f1dc9f3 \ - --hash=sha256:85cdbcca8ef10733bd31f931956f7fbb85145a4d11ab9e6742bbf44d88b7e351 \ - --hash=sha256:a3f55fabe29164ba6026b5ad5c3151c314d136fd67415a17660b4aaddacf1b10 \ - --hash=sha256:b4427ef0d2dfdec10b641ed0bdaf17957eb625b2ec0ea9329b3d28806c153d71 \ - --hash=sha256:ba73a2f8d3a1b966e9cdba7b211779ad8a2561d2dba9674b8a19ed817923f65f \ - --hash=sha256:c21bac1a7245cbd88c0b0e4a420221b7bfa838a2814ee5bb924e9c2f10a1120b \ - --hash=sha256:c551eb2a3876e8ff2ac63dff1585236ed5dfec5ffd82216a7a174f7c5082a78a \ - --hash=sha256:c790769152308421283679a142dbdb3d1c46c79c823008ecea8e8141db1a2062 \ - --hash=sha256:d7a25fd8c86657f5d9d576268e3b3767c5cd4f42867c9383618be8517f0f022a +nh3==0.2.18 \ + --hash=sha256:0411beb0589eacb6734f28d5497ca2ed379eafab8ad8c84b31bb5c34072b7164 \ + --hash=sha256:14c5a72e9fe82aea5fe3072116ad4661af5cf8e8ff8fc5ad3450f123e4925e86 \ + --hash=sha256:19aaba96e0f795bd0a6c56291495ff59364f4300d4a39b29a0abc9cb3774a84b \ + --hash=sha256:34c03fa78e328c691f982b7c03d4423bdfd7da69cd707fe572f544cf74ac23ad \ + --hash=sha256:36c95d4b70530b320b365659bb5034341316e6a9b30f0b25fa9c9eff4c27a204 \ + --hash=sha256:3a157ab149e591bb638a55c8c6bcb8cdb559c8b12c13a8affaba6cedfe51713a \ + --hash=sha256:42c64511469005058cd17cc1537578eac40ae9f7200bedcfd1fc1a05f4f8c200 \ + --hash=sha256:5f36b271dae35c465ef5e9090e1fdaba4a60a56f0bb0ba03e0932a66f28b9189 \ + --hash=sha256:6955369e4d9f48f41e3f238a9e60f9410645db7e07435e62c6a9ea6135a4907f \ + --hash=sha256:7b7c2a3c9eb1a827d42539aa64091640bd275b81e097cd1d8d82ef91ffa2e811 \ + --hash=sha256:8ce0f819d2f1933953fca255db2471ad58184a60508f03e6285e5114b6254844 \ + --hash=sha256:94a166927e53972a9698af9542ace4e38b9de50c34352b962f4d9a7d4c927af4 \ + --hash=sha256:a7f1b5b2c15866f2db413a3649a8fe4fd7b428ae58be2c0f6bca5eefd53ca2be \ + --hash=sha256:c8b3a1cebcba9b3669ed1a84cc65bf005728d2f0bc1ed2a6594a992e817f3a50 \ + --hash=sha256:de3ceed6e661954871d6cd78b410213bdcb136f79aafe22aa7182e028b8c7307 \ + --hash=sha256:f0eca9ca8628dbb4e916ae2491d72957fdd35f7a5d326b7032a345f111ac07fe # via readme-renderer nox==2024.4.15 \ --hash=sha256:6492236efa15a460ecb98e7b67562a28b70da006ab0be164e8821177577c0565 \ @@ -460,9 +460,9 @@ python-dateutil==2.9.0.post0 \ --hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \ --hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 # via gcp-releasetool -readme-renderer==43.0 \ - --hash=sha256:1818dd28140813509eeed8d62687f7cd4f7bad90d4db586001c5dc09d4fde311 \ - --hash=sha256:19db308d86ecd60e5affa3b2a98f017af384678c63c88e5d4556a380e674f3f9 +readme-renderer==44.0 \ + --hash=sha256:2fbca89b81a08526aadf1357a8c2ae889ec05fb03f5da67f9769c9a592166151 \ + --hash=sha256:8712034eabbfa6805cacf1402b4eeb2a73028f72d1166d6f5cb7f9c047c5d1e1 # via twine requests==2.32.3 \ --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ From cdfd979596f8dcb5f6266f56d37689203a4d8e62 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 9 Jul 2024 16:57:03 -0500 Subject: [PATCH 02/36] chore: remove references to conda (#830) --- noxfile.py | 2 -- third_party/bigframes_vendored/ibis/README.md | 15 --------------- third_party/bigframes_vendored/pandas/README.md | 10 ++-------- 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/noxfile.py b/noxfile.py index 177e0e2ab8..b38bcacfdb 100644 --- a/noxfile.py +++ b/noxfile.py @@ -552,8 +552,6 @@ def prerelease(session: nox.sessions.Session, tests_path): already_installed.add("pyarrow") session.install( - "--extra-index-url", - "https://pypi.anaconda.org/scipy-wheels-nightly/simple", "--prefer-binary", "--pre", "--upgrade", diff --git a/third_party/bigframes_vendored/ibis/README.md b/third_party/bigframes_vendored/ibis/README.md index 8a00750e92..fa8224214f 100644 --- a/third_party/bigframes_vendored/ibis/README.md +++ b/third_party/bigframes_vendored/ibis/README.md @@ -1,7 +1,6 @@ # Ibis [![Documentation Status](https://img.shields.io/badge/docs-docs.ibis--project.org-blue.svg)](http://ibis-project.org) -[![Anaconda-Server Badge](https://anaconda.org/conda-forge/ibis-framework/badges/version.svg)](https://anaconda.org/conda-forge/ibis-framework) [![PyPI](https://img.shields.io/pypi/v/ibis-framework.svg)](https://pypi.org/project/ibis-framework) [![Build status](https://github.com/ibis-project/ibis/actions/workflows/ibis-main.yml/badge.svg)](https://github.com/ibis-project/ibis/actions/workflows/ibis-main.yml?query=branch%3Amaster) [![Build status](https://github.com/ibis-project/ibis/actions/workflows/ibis-backends.yml/badge.svg)](https://github.com/ibis-project/ibis/actions/workflows/ibis-backends.yml?query=branch%3Amaster) @@ -83,28 +82,14 @@ Install Ibis from PyPI with: pip install 'ibis-framework[duckdb]' ``` -Or from conda-forge with: - -```bash -conda install ibis-framework -c conda-forge -``` - (It’s a common mistake to `pip install ibis`. If you try to use Ibis and get errors early on try uninstalling `ibis` and installing `ibis-framework`) -To discover ibis, we suggest starting with the DuckDB backend (which is included by default in the conda-forge package). The DuckDB backend is performant and fully featured. 
- To use ibis with other backends, include the backend name in brackets for PyPI: ```bash pip install 'ibis-framework[postgres]' ``` -Or use `ibis-$BACKEND` where `$BACKEND` is the specific backend you want to use when installing from conda-forge: - -```bash -conda install ibis-postgres -c conda-forge -``` - ## Getting Started with Ibis We provide a number of tutorial and example notebooks in the diff --git a/third_party/bigframes_vendored/pandas/README.md b/third_party/bigframes_vendored/pandas/README.md index 9f2bc800e8..1aa5068d5e 100644 --- a/third_party/bigframes_vendored/pandas/README.md +++ b/third_party/bigframes_vendored/pandas/README.md @@ -6,7 +6,6 @@ # pandas: powerful Python data analysis toolkit [![PyPI Latest Release](https://img.shields.io/pypi/v/pandas.svg)](https://pypi.org/project/pandas/) -[![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/anaconda/pandas/) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134) [![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/) [![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/main/LICENSE) @@ -86,15 +85,10 @@ The source code is currently hosted on GitHub at: https://github.com/pandas-dev/pandas Binary installers for the latest released version are available at the [Python -Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/). +Package Index (PyPI)](https://pypi.org/project/pandas). ```sh -# conda -conda install -c conda-forge pandas -``` - -```sh -# or PyPI +# PyPI pip install pandas ``` From eaa1db0e5527606a84b49e58aa58b8a36e70e10c Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 9 Jul 2024 18:41:35 -0700 Subject: [PATCH 03/36] refactor: read transformer output columns from model entity (#817) * refactor: read transformer output columns from model entity * fix tests and docs * remove dup code * fix comment --- bigframes/ml/base.py | 18 +++ bigframes/ml/compose.py | 20 +--- bigframes/ml/impute.py | 32 ++--- bigframes/ml/pipeline.py | 4 +- bigframes/ml/preprocessing.py | 220 +++++++++++++--------------------- 5 files changed, 124 insertions(+), 170 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 70854a36e9..6ae06c9d9f 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -188,6 +188,24 @@ def __init__(self): def _keys(self): pass + def _extract_output_names(self): + """Extract transform output column names. 
Save the results to self._output_names.""" + assert self._bqml_model is not None + + output_names = [] + for transform_col in self._bqml_model._model._properties["transformColumns"]: + transform_col_dict = cast(dict, transform_col) + # pass the columns that are not transformed + if "transformSql" not in transform_col_dict: + continue + transform_sql: str = transform_col_dict["transformSql"] + if not transform_sql.startswith("ML."): + continue + + output_names.append(transform_col_dict["name"]) + + self._output_names = output_names + def __eq__(self, other) -> bool: return type(self) is type(other) and self._keys() == other._keys() diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 7f1bfe8d55..4ea63d2e81 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -201,25 +201,20 @@ def _merge( def _compile_to_sql( self, - columns: List[str], X: bpd.DataFrame, - ) -> List[Tuple[str, str]]: + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns (List[str]): - a list of column names to transform - X (bpd.DataFrame): - The Dataframe with training data. + X: DataFrame to transform. - Returns: - a list of tuples of (sql_expression, output_name)""" + Returns: a list of sql_expr.""" result = [] for _, transformer, target_columns in self.transformers: if isinstance(target_columns, str): target_columns = [target_columns] - result += transformer._compile_to_sql(target_columns, X=X) + result += transformer._compile_to_sql(X, target_columns) return result def fit( @@ -229,17 +224,14 @@ def fit( ) -> ColumnTransformer: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist(), X) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: diff --git a/bigframes/ml/impute.py b/bigframes/ml/impute.py index ae71637aa5..4955eb5de5 100644 --- a/bigframes/ml/impute.py +++ b/bigframes/ml/impute.py @@ -18,7 +18,7 @@ from __future__ import annotations import typing -from typing import Iterable, List, Literal, Optional, Tuple, Union +from typing import Iterable, List, Literal, Optional, Union import bigframes_vendored.sklearn.impute._base @@ -49,25 +49,22 @@ def _keys(self): def _compile_to_sql( self, - columns: Iterable[str], - X=None, - ) -> List[Tuple[str, str]]: + X: bpd.DataFrame, + columns: Optional[Iterable[str]] = None, + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - A list of column names to transform. - X: - The Dataframe with training data. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. 
- Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns return [ - ( - self._base_sql_generator.ml_imputer( - column, self.strategy, f"imputer_{column}" - ), - f"imputer_{column}", + self._base_sql_generator.ml_imputer( + column, self.strategy, f"imputer_{column}" ) for column in columns ] @@ -92,17 +89,14 @@ def fit( ) -> SimpleImputer: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist(), X) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 04b8d73cf5..4cd60c5836 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -106,9 +106,7 @@ def fit( ) -> Pipeline: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._transform._compile_to_sql(X.columns.tolist(), X=X) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._transform._compile_to_sql(X) if y is not None: # If labels columns are present, they should pass through un-transformed (y,) = utils.convert_to_dataframe(y) diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 07fdc171cf..13d2041ef3 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -18,7 +18,7 @@ from __future__ import annotations import typing -from typing import cast, Iterable, List, Literal, Optional, Tuple, Union +from typing import cast, Iterable, List, Literal, Optional, Union import bigframes_vendored.sklearn.preprocessing._data import bigframes_vendored.sklearn.preprocessing._discretization @@ -46,23 +46,22 @@ def __init__(self): def _keys(self): return (self._bqml_model,) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. 
- Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns return [ - ( - self._base_sql_generator.ml_standard_scaler( - column, f"standard_scaled_{column}" - ), - f"standard_scaled_{column}", + self._base_sql_generator.ml_standard_scaler( + column, f"standard_scaled_{column}" ) for column in columns ] @@ -86,17 +85,14 @@ def fit( ) -> StandardScaler: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -127,23 +123,22 @@ def __init__(self): def _keys(self): return (self._bqml_model,) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. - Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns return [ - ( - self._base_sql_generator.ml_max_abs_scaler( - column, f"max_abs_scaled_{column}" - ), - f"max_abs_scaled_{column}", + self._base_sql_generator.ml_max_abs_scaler( + column, f"max_abs_scaled_{column}" ) for column in columns ] @@ -167,17 +162,14 @@ def fit( ) -> MaxAbsScaler: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -208,23 +200,22 @@ def __init__(self): def _keys(self): return (self._bqml_model,) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. 
- Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns return [ - ( - self._base_sql_generator.ml_min_max_scaler( - column, f"min_max_scaled_{column}" - ), - f"min_max_scaled_{column}", + self._base_sql_generator.ml_min_max_scaler( + column, f"min_max_scaled_{column}" ) for column in columns ] @@ -248,17 +239,14 @@ def fit( ) -> MinMaxScaler: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -302,20 +290,18 @@ def _keys(self): return (self._bqml_model, self.n_bins, self.strategy) def _compile_to_sql( - self, - columns: Iterable[str], - X: bpd.DataFrame, - ) -> List[Tuple[str, str]]: + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform - X: - The Dataframe with training data. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. - Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns array_split_points = {} if self.strategy == "uniform": for column in columns: @@ -327,11 +313,8 @@ def _compile_to_sql( ] return [ - ( - self._base_sql_generator.ml_bucketize( - column, array_split_points[column], f"kbinsdiscretizer_{column}" - ), - f"kbinsdiscretizer_{column}", + self._base_sql_generator.ml_bucketize( + column, array_split_points[column], f"kbinsdiscretizer_{column}" ) for column in columns ] @@ -339,11 +322,8 @@ def _compile_to_sql( elif self.strategy == "quantile": return [ - ( - self._base_sql_generator.ml_quantile_bucketize( - column, self.n_bins, f"kbinsdiscretizer_{column}" - ), - f"kbinsdiscretizer_{column}", + self._base_sql_generator.ml_quantile_bucketize( + column, self.n_bins, f"kbinsdiscretizer_{column}" ) for column in columns ] @@ -381,17 +361,14 @@ def fit( ) -> KBinsDiscretizer: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist(), X) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -440,18 +417,19 @@ def __init__( def _keys(self): return (self._bqml_model, self.drop, self.min_frequency, self.max_categories) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: 
Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. - - Returns: a list of tuples of (sql_expression, output_name)""" + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns drop = self.drop if self.drop is not None else "none" # minus one here since BQML's inplimentation always includes index 0, and top_k is on top of that. top_k = ( @@ -465,11 +443,8 @@ def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str else OneHotEncoder.FREQUENCY_THRESHOLD_DEFAULT ) return [ - ( - self._base_sql_generator.ml_one_hot_encoder( - column, drop, top_k, frequency_threshold, f"onehotencoded_{column}" - ), - f"onehotencoded_{column}", + self._base_sql_generator.ml_one_hot_encoder( + column, drop, top_k, frequency_threshold, f"onehotencoded_{column}" ) for column in columns ] @@ -502,17 +477,14 @@ def fit( ) -> OneHotEncoder: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -559,17 +531,19 @@ def __init__( def _keys(self): return (self._bqml_model, self.min_frequency, self.max_categories) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. - Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns # minus one here since BQML's inplimentation always includes index 0, and top_k is on top of that. 
top_k = ( @@ -583,11 +557,8 @@ def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str else LabelEncoder.FREQUENCY_THRESHOLD_DEFAULT ) return [ - ( - self._base_sql_generator.ml_label_encoder( - column, top_k, frequency_threshold, f"labelencoded_{column}" - ), - f"labelencoded_{column}", + self._base_sql_generator.ml_label_encoder( + column, top_k, frequency_threshold, f"labelencoded_{column}" ) for column in columns ] @@ -614,17 +585,14 @@ def fit( ) -> LabelEncoder: (y,) = utils.convert_to_dataframe(y) - compiled_transforms = self._compile_to_sql(y.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(y) self._bqml_model = self._bqml_model_factory.create_model( y, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # The schema of TRANSFORM output is not available in the model API, so save it during fitting - self._output_names = [name for _, name in compiled_transforms] + self._extract_output_names() return self def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @@ -660,24 +628,23 @@ def __init__(self, degree: int = 2): def _keys(self): return (self._bqml_model, self.degree) - def _compile_to_sql(self, columns: Iterable[str], X=None) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None + ) -> List[str]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: - a list of column names to transform. - X (default None): - Ignored. + X: DataFrame to transform. + columns: transform columns. If None, transform all columns in X. - Returns: a list of tuples of (sql_expression, output_name)""" + Returns: a list of tuples sql_expr.""" + if columns is None: + columns = X.columns output_name = "poly_feat" return [ - ( - self._base_sql_generator.ml_polynomial_expand( - columns, self.degree, output_name - ), - output_name, + self._base_sql_generator.ml_polynomial_expand( + columns, self.degree, output_name ) ] @@ -702,29 +669,14 @@ def fit( ) -> PolynomialFeatures: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) - transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - + transform_sqls = self._compile_to_sql(X) self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, ) - # TODO(garrettwu): generalize the approach to other transformers - output_names = [] - for transform_col in self._bqml_model._model._properties["transformColumns"]: - transform_col_dict = cast(dict, transform_col) - # pass the columns that are not transformed - if "transformSql" not in transform_col_dict: - continue - transform_sql: str = transform_col_dict["transformSql"] - if not transform_sql.startswith("ML."): - continue - - output_names.append(transform_col_dict["name"]) - - self._output_names = output_names + self._extract_output_names() return self From 27f8631be81a3e136cfeb8904558bb4f3f5caa05 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 10 Jul 2024 12:53:59 -0700 Subject: [PATCH 04/36] feat: add stratify param support to ml.model_selection.train_test_split method (#815) * feat: add stratify param to ml.model_selection.train_test_split * fix mypy * add notes for limit --- bigframes/ml/model_selection.py | 40 +++++++++++- 
tests/system/small/ml/test_model_selection.py | 62 +++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 48eb5a93a7..6220e899ae 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -18,7 +18,7 @@ import typing -from typing import List, Union +from typing import cast, List, Union from bigframes.ml import utils import bigframes.pandas as bpd @@ -29,6 +29,7 @@ def train_test_split( test_size: Union[float, None] = None, train_size: Union[float, None] = None, random_state: Union[int, None] = None, + stratify: Union[bpd.Series, None] = None, ) -> List[Union[bpd.DataFrame, bpd.Series]]: """Splits dataframes or series into random train and test subsets. @@ -46,6 +47,10 @@ def train_test_split( random_state (default None): A seed to use for randomly choosing the rows of the split. If not set, a random split will be generated each time. + stratify: (bigframes.series.Series or None, default None): + If not None, data is split in a stratified fashion, using this as the class labels. Each split has the same distribution of the class labels with the original dataset. + Default to None. + Note: By setting the stratify parameter, the memory consumption and generated SQL will be linear to the unique values in the Series. May return errors if the unique values size is too large. Returns: List[Union[bigframes.dataframe.DataFrame, bigframes.series.Series]]: A list of BigQuery DataFrames or Series. @@ -76,7 +81,38 @@ def train_test_split( dfs = list(utils.convert_to_dataframe(*arrays)) - split_dfs = dfs[0]._split(fracs=(train_size, test_size), random_state=random_state) + def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFrame]: + """Split a single DF accoding to the stratify Series.""" + stratify = stratify.rename("bigframes_stratify_col") # avoid name conflicts + merged_df = df.join(stratify.to_frame(), how="outer") + + train_dfs, test_dfs = [], [] + uniq = stratify.unique() + for value in uniq: + cur = merged_df[merged_df["bigframes_stratify_col"] == value] + train, test = train_test_split( + cur, + test_size=test_size, + train_size=train_size, + random_state=random_state, + ) + train_dfs.append(train) + test_dfs.append(test) + + train_df = cast( + bpd.DataFrame, bpd.concat(train_dfs).drop(columns="bigframes_stratify_col") + ) + test_df = cast( + bpd.DataFrame, bpd.concat(test_dfs).drop(columns="bigframes_stratify_col") + ) + return [train_df, test_df] + + if stratify is None: + split_dfs = dfs[0]._split( + fracs=(train_size, test_size), random_state=random_state + ) + else: + split_dfs = _stratify_split(dfs[0], stratify) train_index = split_dfs[0].index test_index = split_dfs[1].index diff --git a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py index 63d0840d29..ea9220feb4 100644 --- a/tests/system/small/ml/test_model_selection.py +++ b/tests/system/small/ml/test_model_selection.py @@ -234,3 +234,65 @@ def test_train_test_split_value_error(penguins_df_default_index, train_size, tes model_selection.train_test_split( X, y, train_size=train_size, test_size=test_size ) + + +def test_train_test_split_stratify(penguins_df_default_index): + X = penguins_df_default_index[ + [ + "species", + "island", + "culmen_length_mm", + ] + ] + y = penguins_df_default_index[["species"]] + X_train, X_test, y_train, y_test = model_selection.train_test_split( + X, y, stratify=penguins_df_default_index["species"] + ) + + # 
Original distribution is [152, 124, 68]. All the categories follow 75/25 split + train_counts = pd.Series( + [114, 93, 51], + index=pd.Index( + [ + "Adelie Penguin (Pygoscelis adeliae)", + "Gentoo penguin (Pygoscelis papua)", + "Chinstrap penguin (Pygoscelis antarctica)", + ], + name="species", + ), + dtype="Int64", + name="count", + ) + test_counts = pd.Series( + [38, 31, 17], + index=pd.Index( + [ + "Adelie Penguin (Pygoscelis adeliae)", + "Gentoo penguin (Pygoscelis papua)", + "Chinstrap penguin (Pygoscelis antarctica)", + ], + name="species", + ), + dtype="Int64", + name="count", + ) + pd.testing.assert_series_equal( + X_train["species"].value_counts().to_pandas(), + train_counts, + check_index_type=False, + ) + pd.testing.assert_series_equal( + X_test["species"].value_counts().to_pandas(), + test_counts, + check_index_type=False, + ) + pd.testing.assert_series_equal( + y_train["species"].value_counts().to_pandas(), + train_counts, + check_index_type=False, + ) + pd.testing.assert_series_equal( + y_test["species"].value_counts().to_pandas(), + test_counts, + check_index_type=False, + ) From 0d24f737041c7dd70253ebb4baa8d8ef67bd4f1d Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 10 Jul 2024 18:14:49 -0700 Subject: [PATCH 05/36] fix: Fewer relation joins from df self-operations (#823) --- bigframes/core/__init__.py | 15 +++--- bigframes/core/blocks.py | 79 ++++++++++++++++------------ bigframes/core/join_def.py | 9 ++++ bigframes/core/nodes.py | 1 - bigframes/core/rewrite.py | 74 +++++++++++++------------- tests/system/small/test_dataframe.py | 23 ++++++-- 6 files changed, 118 insertions(+), 83 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 9b858046bc..cfe8f29327 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -460,9 +460,9 @@ def _cross_join_w_labels( conditions=(), mappings=(*labels_mappings, *table_mappings), type="cross" ) if join_side == "left": - joined_array = self.join(labels_array, join_def=join) + joined_array = self.relational_join(labels_array, join_def=join) else: - joined_array = labels_array.join(self, join_def=join) + joined_array = labels_array.relational_join(self, join_def=join) return joined_array def _create_unpivot_labels_array( @@ -485,30 +485,27 @@ def _create_unpivot_labels_array( return ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=self.session) - def join( + def relational_join( self, other: ArrayValue, join_def: join_def.JoinDefinition, - allow_row_identity_join: bool = False, - ): + ) -> ArrayValue: join_node = nodes.JoinNode( left_child=self.node, right_child=other.node, join=join_def, - allow_row_identity_join=allow_row_identity_join, ) - if allow_row_identity_join: - return ArrayValue(bigframes.core.rewrite.maybe_rewrite_join(join_node)) return ArrayValue(join_node) def try_align_as_projection( self, other: ArrayValue, join_type: join_def.JoinType, + join_keys: typing.Tuple[join_def.CoalescedColumnMapping, ...], mappings: typing.Tuple[join_def.JoinColumnMapping, ...], ) -> typing.Optional[ArrayValue]: result = bigframes.core.rewrite.join_as_projection( - self.node, other.node, mappings, join_type + self.node, other.node, join_keys, mappings, join_type ) if result is not None: return ArrayValue(result) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index fef91f88dc..fab0035e1a 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2008,7 +2008,7 @@ def merge( mappings=(*left_mappings, *right_mappings), type=how, ) - joined_expr = 
self.expr.join(other.expr, join_def=join_def) + joined_expr = self.expr.relational_join(other.expr, join_def=join_def) result_columns = [] matching_join_labels = [] @@ -2267,25 +2267,33 @@ def join( raise NotImplementedError( f"Only how='outer','left','right','inner' currently supported. {constants.FEEDBACK_LINK}" ) - # Special case for null index, + # Handle null index, which only supports row join + if (self.index.nlevels == other.index.nlevels == 0) and not block_identity_join: + if not block_identity_join: + result = try_row_join(self, other, how=how) + if result is not None: + return result + raise bigframes.exceptions.NullIndexError( + "Cannot implicitly align objects. Set an explicit index using set_index." + ) + + # Oddly, pandas row-wise join ignores right index names if ( - (self.index.nlevels == other.index.nlevels == 0) - and not sort - and not block_identity_join + not block_identity_join + and (self.index.nlevels == other.index.nlevels) + and (self.index.dtypes == other.index.dtypes) ): - return join_indexless(self, other, how=how) + result = try_row_join(self, other, how=how) + if result is not None: + return result self._throw_if_null_index("join") other._throw_if_null_index("join") if self.index.nlevels == other.index.nlevels == 1: - return join_mono_indexed( - self, other, how=how, sort=sort, block_identity_join=block_identity_join - ) - else: + return join_mono_indexed(self, other, how=how, sort=sort) + else: # Handles cases where one or both sides are multi-indexed # Always sort mult-index join - return join_multi_indexed( - self, other, how=how, sort=sort, block_identity_join=block_identity_join - ) + return join_multi_indexed(self, other, how=how, sort=sort) def _force_reproject(self) -> Block: """Forces a reprojection of the underlying tables expression. Used to force predicate/order application before subsequent operations.""" @@ -2623,22 +2631,31 @@ def is_uniquely_named(self: BlockIndexProperties): return len(set(self.names)) == len(self.names) -def join_indexless( +def try_row_join( left: Block, right: Block, *, how="left", -) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: - """Joins two blocks""" +) -> Optional[Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]]: + """Joins two blocks that have a common root expression by merging the projections.""" left_expr = left.expr right_expr = right.expr + # Create a new array value, mapping from both, then left, and then right + join_keys = tuple( + join_defs.CoalescedColumnMapping( + left_source_id=left_id, + right_source_id=right_id, + destination_id=guid.generate_guid(), + ) + for left_id, right_id in zip(left.index_columns, right.index_columns) + ) left_mappings = [ join_defs.JoinColumnMapping( source_table=join_defs.JoinSide.LEFT, source_id=id, destination_id=guid.generate_guid(), ) - for id in left_expr.column_ids + for id in left.value_columns ] right_mappings = [ join_defs.JoinColumnMapping( @@ -2646,23 +2663,23 @@ def join_indexless( source_id=id, destination_id=guid.generate_guid(), ) - for id in right_expr.column_ids + for id in right.value_columns ] combined_expr = left_expr.try_align_as_projection( right_expr, join_type=how, + join_keys=join_keys, mappings=(*left_mappings, *right_mappings), ) if combined_expr is None: - raise bigframes.exceptions.NullIndexError( - "Cannot implicitly align objects. Set an explicit index using set_index." 
- ) + return None get_column_left = {m.source_id: m.destination_id for m in left_mappings} get_column_right = {m.source_id: m.destination_id for m in right_mappings} block = Block( combined_expr, column_labels=[*left.column_labels, *right.column_labels], - index_columns=(), + index_columns=(key.destination_id for key in join_keys), + index_labels=left.index.names, ) return ( block, @@ -2704,7 +2721,7 @@ def join_with_single_row( mappings=(*left_mappings, *right_mappings), type="cross", ) - combined_expr = left_expr.join( + combined_expr = left_expr.relational_join( right_expr, join_def=join_def, ) @@ -2731,7 +2748,6 @@ def join_mono_indexed( *, how="left", sort=False, - block_identity_join: bool = False, ) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: left_expr = left.expr right_expr = right.expr @@ -2759,14 +2775,14 @@ def join_mono_indexed( mappings=(*left_mappings, *right_mappings), type=how, ) - combined_expr = left_expr.join( + + combined_expr = left_expr.relational_join( right_expr, join_def=join_def, - allow_row_identity_join=(not block_identity_join), ) + get_column_left = join_def.get_left_mapping() get_column_right = join_def.get_right_mapping() - # Drop original indices from each side. and used the coalesced combination generated by the join. left_index = get_column_left[left.index_columns[0]] right_index = get_column_right[right.index_columns[0]] # Drop original indices from each side. and used the coalesced combination generated by the join. @@ -2800,7 +2816,6 @@ def join_multi_indexed( *, how="left", sort=False, - block_identity_join: bool = False, ) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]: if not (left.index.is_uniquely_named() and right.index.is_uniquely_named()): raise ValueError("Joins not supported on indices with non-unique level names") @@ -2819,8 +2834,6 @@ def join_multi_indexed( left_join_ids = [left.index.resolve_level_exact(name) for name in common_names] right_join_ids = [right.index.resolve_level_exact(name) for name in common_names] - names_fully_match = len(left_only_names) == 0 and len(right_only_names) == 0 - left_expr = left.expr right_expr = right.expr @@ -2850,13 +2863,11 @@ def join_multi_indexed( type=how, ) - combined_expr = left_expr.join( + combined_expr = left_expr.relational_join( right_expr, join_def=join_def, - # If we're only joining on a subset of the index columns, we need to - # perform a true join. - allow_row_identity_join=(names_fully_match and not block_identity_join), ) + get_column_left = join_def.get_left_mapping() get_column_right = join_def.get_right_mapping() left_ids_post_join = [get_column_left[id] for id in left_join_ids] diff --git a/bigframes/core/join_def.py b/bigframes/core/join_def.py index 632a1864da..4079abc8fa 100644 --- a/bigframes/core/join_def.py +++ b/bigframes/core/join_def.py @@ -43,6 +43,15 @@ class JoinColumnMapping: destination_id: str +@dataclasses.dataclass(frozen=True) +class CoalescedColumnMapping: + """Special column mapping used only by implicit joiner only""" + + left_source_id: str + right_source_id: str + destination_id: str + + @dataclasses.dataclass(frozen=True) class JoinDefinition: conditions: Tuple[JoinCondition, ...] 
diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index a703cf1969..dbcfc282e4 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -183,7 +183,6 @@ class JoinNode(BigFrameNode): left_child: BigFrameNode right_child: BigFrameNode join: JoinDefinition - allow_row_identity_join: bool = False @property def row_preserving(self) -> bool: diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index 101d5cc882..60ed4069a9 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -106,21 +106,25 @@ def order_with(self, by: Tuple[order.OrderingExpression, ...]): ) def can_merge( - self, right: SquashedSelect, join_def: join_defs.JoinDefinition + self, + right: SquashedSelect, + join_keys: Tuple[join_defs.CoalescedColumnMapping, ...], ) -> bool: """Determines whether the two selections can be merged into a single selection.""" - if join_def.type == "cross": - # Cannot convert cross join to projection - return False - r_exprs_by_id = {id: expr for expr, id in right.columns} l_exprs_by_id = {id: expr for expr, id in self.columns} - l_join_exprs = [l_exprs_by_id[cond.left_id] for cond in join_def.conditions] - r_join_exprs = [r_exprs_by_id[cond.right_id] for cond in join_def.conditions] + l_join_exprs = [ + l_exprs_by_id[join_key.left_source_id] for join_key in join_keys + ] + r_join_exprs = [ + r_exprs_by_id[join_key.right_source_id] for join_key in join_keys + ] - if (self.root != right.root) or any( - l_expr != r_expr for l_expr, r_expr in zip(l_join_exprs, r_join_exprs) - ): + if self.root != right.root: + return False + if len(l_join_exprs) != len(r_join_exprs): + return False + if any(l_expr != r_expr for l_expr, r_expr in zip(l_join_exprs, r_join_exprs)): return False return True @@ -128,6 +132,7 @@ def merge( self, right: SquashedSelect, join_type: join_defs.JoinType, + join_keys: Tuple[join_defs.CoalescedColumnMapping, ...], mappings: Tuple[join_defs.JoinColumnMapping, ...], ) -> SquashedSelect: if self.root != right.root: @@ -147,11 +152,9 @@ def merge( l_relative, r_relative = relative_predicates(self.predicate, right.predicate) lmask = l_relative if join_type in {"right", "outer"} else None rmask = r_relative if join_type in {"left", "outer"} else None - if lmask is not None: - lselection = tuple((apply_mask(expr, lmask), id) for expr, id in lselection) - if rmask is not None: - rselection = tuple((apply_mask(expr, rmask), id) for expr, id in rselection) - new_columns = remap_names(mappings, lselection, rselection) + new_columns = merge_expressions( + join_keys, mappings, lselection, rselection, lmask, rmask + ) # Reconstruct ordering reverse_root = self.reverse_root @@ -204,26 +207,10 @@ def expand(self) -> nodes.BigFrameNode: return nodes.ProjectionNode(child=root, assignments=self.columns) -def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode: - rewrite_common_node = common_selection_root( - join_node.left_child, join_node.right_child - ) - if rewrite_common_node is None: - return join_node - left_side = SquashedSelect.from_node_span(join_node.left_child, rewrite_common_node) - right_side = SquashedSelect.from_node_span( - join_node.right_child, rewrite_common_node - ) - if left_side.can_merge(right_side, join_node.join): - return left_side.merge( - right_side, join_node.join.type, join_node.join.mappings - ).expand() - return join_node - - def join_as_projection( l_node: nodes.BigFrameNode, r_node: nodes.BigFrameNode, + join_keys: Tuple[join_defs.CoalescedColumnMapping, ...], mappings: 
Tuple[join_defs.JoinColumnMapping, ...], how: join_defs.JoinType, ) -> Optional[nodes.BigFrameNode]: @@ -231,7 +218,10 @@ def join_as_projection( if rewrite_common_node is not None: left_side = SquashedSelect.from_node_span(l_node, rewrite_common_node) right_side = SquashedSelect.from_node_span(r_node, rewrite_common_node) - merged = left_side.merge(right_side, how, mappings) + if not left_side.can_merge(right_side, join_keys): + # Most likely because join keys didn't match + return None + merged = left_side.merge(right_side, how, join_keys, mappings) assert ( merged is not None ), "Couldn't merge nodes. This shouldn't happen. Please share full stacktrace with the BigQuery DataFrames team at bigframes-feedback@google.com." @@ -240,21 +230,33 @@ def join_as_projection( return None -def remap_names( +def merge_expressions( + join_keys: Tuple[join_defs.CoalescedColumnMapping, ...], mappings: Tuple[join_defs.JoinColumnMapping, ...], lselection: Selection, rselection: Selection, + lmask: Optional[scalar_exprs.Expression], + rmask: Optional[scalar_exprs.Expression], ) -> Selection: new_selection: Selection = tuple() l_exprs_by_id = {id: expr for expr, id in lselection} r_exprs_by_id = {id: expr for expr, id in rselection} + for key in join_keys: + # Join keys expressions are equivalent on both sides, so can choose either left or right key + assert l_exprs_by_id[key.left_source_id] == r_exprs_by_id[key.right_source_id] + expr = l_exprs_by_id[key.left_source_id] + id = key.destination_id + new_selection = (*new_selection, (expr, id)) for mapping in mappings: if mapping.source_table == join_defs.JoinSide.LEFT: expr = l_exprs_by_id[mapping.source_id] + if lmask is not None: + expr = apply_mask(expr, lmask) else: # Right expr = r_exprs_by_id[mapping.source_id] - id = mapping.destination_id - new_selection = (*new_selection, (expr, id)) + if rmask is not None: + expr = apply_mask(expr, rmask) + new_selection = (*new_selection, (expr, mapping.destination_id)) return new_selection diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 625b920763..7273e2079f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -140,8 +140,6 @@ def test_df_construct_from_dict(): def test_df_construct_inline_respects_location(): - import bigframes.pandas as bpd - # Note: This starts a thread-local session. 
with bpd.option_context("bigquery.location", "europe-west1"): df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) @@ -4336,6 +4334,25 @@ def test_df_cached(scalars_df_index): pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) +def test_assign_after_binop_row_joins(): + pd_df = pd.DataFrame( + { + "idx1": [1, 1, 1, 1, 2, 2, 2, 2], + "idx2": [10, 10, 20, 20, 10, 10, 20, 20], + "metric1": [10, 14, 2, 13, 6, 2, 9, 5], + "metric2": [25, -3, 8, 2, -1, 0, 0, -4], + }, + dtype=pd.Int64Dtype(), + ).set_index(["idx1", "idx2"]) + bf_df = dataframe.DataFrame(pd_df) + + # Expect implicit joiner to be used, preserving input cardinality rather than getting relational join + bf_df["metric_diff"] = bf_df.metric1 - bf_df.metric2 + pd_df["metric_diff"] = pd_df.metric1 - pd_df.metric2 + + assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + + def test_df_cache_with_implicit_join(scalars_df_index): """expectation is that cache will be used, but no explicit join will be performed""" df = scalars_df_index[["int64_col", "int64_too"]].sort_index().reset_index() + 3 @@ -4510,7 +4527,7 @@ def test_query_complexity_repeated_subtrees( bf_df = scalars_df_index for _ in range(5): pd_df = pd.concat(10 * [pd_df]).head(5) - bf_df = bigframes.pandas.concat(10 * [bf_df]).head(5) + bf_df = bpd.concat(10 * [bf_df]).head(5) bf_result = bf_df.to_pandas() pd_result = pd_df assert_pandas_df_equal(bf_result, pd_result) From d7b333fa26acddaeb5ccca4f81b1d624dff03ba2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 11 Jul 2024 10:59:02 -0500 Subject: [PATCH 06/36] docs: add partner attribution steps to integrations sample notebook (#835) --- notebooks/dataframes/integrations.ipynb | 138 ++++++++++++++++++------ 1 file changed, 104 insertions(+), 34 deletions(-) diff --git a/notebooks/dataframes/integrations.ipynb b/notebooks/dataframes/integrations.ipynb index 735e18d94e..9edb174f18 100644 --- a/notebooks/dataframes/integrations.ipynb +++ b/notebooks/dataframes/integrations.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 35, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -30,11 +30,47 @@ "This notebook demonstrates operations for building applications that integrate with BigQuery DataFrames. Follow these samples to build an integration that accepts a BigQuery DataFrames object or returns one." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attributing requests initiated by BigQuery DataFrames\n", + "\n", + "Partners are required to attribute API calls to BigQuery and other Google APIs. Where possible, this should be done via the User-Agent string, but can also be done via job labels if your integration doesn't initialize the BigQuery DataFrames session.\n", + "\n", + "### Setting the User-Agent\n", + "\n", + "Set [`bpd.options.bigquery.application_name`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes._config.bigquery_options.BigQueryOptions#bigframes__config_bigquery_options_BigQueryOptions_application_name) to a compliant string. Reach out to your Google Partner Engineering team contact for further instructions." 
+ ] + }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "# Set this to the string informed by your Google Partner Engineering team contact.\n", + "# Note: This can only be set once per session, so is most appropriate for partners\n", + "# who provide a Python + BigQuery DataFrames environment to their customers.\n", + "bpd.options.bigquery.application_name = \"notebook-samples/1.0.0 (GPN:notebook-samples)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/swast/src/bigframes-2/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " return func(get_global_session(), *args, **kwargs)\n" + ] + } + ], "source": [ "import bigframes.pandas as bpd\n", "\n", @@ -47,6 +83,40 @@ "}).set_index(\"index\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting the job label\n", + "\n", + "If your application works with customer-created BigQuery DataFrames objects, you might not be able to set the user-agent header because the session has already started (watch https://github.com/googleapis/python-bigquery-dataframes/issues/833 for updates on this limitation). Instead, attach a label to the jobs your application initiates, such as if you are performing `to_gbq()`on an existing DataFrame, as described below.\n", + "\n", + "Use `bpd.option_context()` so that the labels are only set during the operations your application performs." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job eb7f3bbe-dda9-4d2f-b195-21de862d7055 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with bpd.option_context(\"compute.extra_query_labels\", {\"application-name\": \"notebook-samples\"}):\n", + " table_id = df.to_gbq()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -58,13 +128,13 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 00b5c727-f2bf-4265-be22-d7d505619db7 is DONE. 0 Bytes processed. Open Job" + "Query job 4ad50c3c-91d0-4fef-91f6-0a2c5a30c38f is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -76,10 +146,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_43bbc4c64fb947f7b69db570a5641506'" + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_9a045ff143db4f8ab2018994287020f3'" ] }, - "execution_count": 37, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -102,13 +172,13 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job f9c39ac2-a428-45c9-bb3a-643fc62a1c5b is DONE. 0 Bytes processed. Open Job" + "Query job 9e7d4b1a-d7fc-4599-bab4-40062c83288e is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -122,11 +192,11 @@ "output_type": "stream", "text": [ " index int_col float_col string_col\n", - "0 2 3 0.2500 c\n", - "1 4 5 0.0625 e\n", + "0 3 4 -0.1250 d\n", + "1 1 2 -0.5000 b\n", "2 0 1 1.0000 a\n", - "3 1 2 -0.5000 b\n", - "4 3 4 -0.1250 d\n" + "3 4 5 0.0625 e\n", + "4 2 3 0.2500 c\n" ] } ], @@ -168,13 +238,13 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job ad53c7f2-e3bd-4667-b60b-b700c24b7a81 is DONE. 0 Bytes processed. Open Job" + "Query job 62db313e-7632-4dbb-8eff-5035d0e6c27e is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -188,11 +258,11 @@ "output_type": "stream", "text": [ " index int_col float_col string_col\n", - "0 4 5 0.0625 e\n", - "1 0 1 1.0000 a\n", - "2 2 3 0.2500 c\n", - "3 3 4 -0.1250 d\n", - "4 1 2 -0.5000 b\n" + "0 1 2 -0.5000 b\n", + "1 3 4 -0.1250 d\n", + "2 0 1 1.0000 a\n", + "3 4 5 0.0625 e\n", + "4 2 3 0.2500 c\n" ] } ], @@ -265,13 +335,13 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 2aa7033c-c547-4ae2-a9aa-33272be82b9c is DONE. 0 Bytes processed. Open Job" + "Query job 1cbd8898-97c7-419e-87af-b72a9432afb6 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -283,10 +353,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_b484a3967fba4a41850f4eb21b4b3bd8'" + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_58b9b6fc0c3349bf8d3dd6fb29ab5322'" ] }, - "execution_count": 40, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -308,13 +378,13 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 1d489f94-2840-405e-9114-d439dcfcf7aa is DONE. 0 Bytes processed. Open Job" + "Query job 40e54aa9-fad7-47c3-9bec-144f6c7106d8 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -326,10 +396,10 @@ { "data": { "text/plain": [ - "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_d00699eeeed743b487c870dca5bcf23b'" + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240710_sessionf75568_cdb4f54063b0417a8309c462b70239fa'" ] }, - "execution_count": 41, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -357,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -366,7 +436,7 @@ "Dataset(DatasetReference('swast-scratch', 'my_dataset'))" ] }, - "execution_count": 42, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -381,13 +451,13 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 40977e60-97c3-4c93-89e2-d7334e5af71d is DONE. 0 Bytes processed. Open Job" + "Query job 73cf9e04-d5fa-4765-827c-665f0e6b9e00 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -399,7 +469,7 @@ { "data": { "text/html": [ - "Query job 81e35bb8-2e27-4a18-b596-15a7805331f0 is DONE. 270 Bytes processed. Open Job" + "Query job b177eb37-197f-4732-8978-c74cccb36e01 is DONE. 270 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -523,7 +593,7 @@ "[10 rows x 3 columns]" ] }, - "execution_count": 43, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -627,7 +697,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.10.9" } }, "nbformat": 4, From 8d1a03ab8916873fe977cf8ba02cd00de3ffed78 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 11 Jul 2024 13:33:01 -0700 Subject: [PATCH 07/36] test: fix llm load tests (#836) --- tests/system/load/test_llm.py | 85 +++++++++---------------------- tests/system/small/ml/test_llm.py | 45 ++++++++++++++++ 2 files changed, 70 insertions(+), 60 deletions(-) diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index fd047b3ba6..9b8868bb27 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -15,7 +15,8 @@ import pandas as pd import pytest -import bigframes.ml.llm +from bigframes.ml import llm +from tests.system import utils @pytest.fixture(scope="session") @@ -39,9 +40,7 @@ def llm_remote_text_df(session, llm_remote_text_pandas_df): @pytest.mark.flaky(retries=2) def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_df): - model = bigframes.ml.llm.PaLM2TextGenerator( - model_name="text-bison", max_iterations=1 - ) + model = llm.PaLM2TextGenerator(model_name="text-bison", max_iterations=1) X_train = llm_fine_tune_df_default_index[["prompt"]] y_train = llm_fine_tune_df_default_index[["label"]] @@ -50,62 +49,23 @@ def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_ assert model is not None df = model.predict(llm_remote_text_df["prompt"]).to_pandas() - assert df.shape == (3, 4) - assert "ml_generate_text_llm_result" in df.columns - series = df["ml_generate_text_llm_result"] - assert all(series.str.len() == 1) - - # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept - - -@pytest.mark.flaky(retries=2) -def test_llm_palm_score(llm_fine_tune_df_default_index): - model = bigframes.ml.llm.PaLM2TextGenerator(model_name="text-bison") - - # Check score to ensure the model was fitted - score_result = model.score( - X=llm_fine_tune_df_default_index[["prompt"]], - y=llm_fine_tune_df_default_index[["label"]], - ).to_pandas() - score_result_col = score_result.columns.to_list() - expected_col = [ - "bleu4_score", - "rouge-l_precision", - "rouge-l_recall", - "rouge-l_f1_score", - "evaluation_status", - ] - assert all(col in score_result_col for col in expected_col) - - -@pytest.mark.flaky(retries=2) -def test_llm_palm_score_params(llm_fine_tune_df_default_index): - model = bigframes.ml.llm.PaLM2TextGenerator( - model_name="text-bison", max_iterations=1 + utils.check_pandas_df_schema_and_index( + df, + columns=[ + "ml_generate_text_llm_result", + "ml_generate_text_rai_result", + "ml_generate_text_status", + "prompt", + ], + index=3, ) - - # Check score to ensure the model was fitted - score_result = model.score( - X=llm_fine_tune_df_default_index["prompt"], - y=llm_fine_tune_df_default_index["label"], - task_type="classification", - ).to_pandas() - score_result_col = score_result.columns.to_list() - expected_col = [ - "precision", - "recall", - "f1_score", - "label", - "evaluation_status", - ] - assert all(col in score_result_col for col in expected_col) + # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check 
parameters to ensure configuration was kept +@pytest.mark.skip(reason="b/351905648. Credential error to be fixed.") @pytest.mark.flaky(retries=2) def test_llm_gemini_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_df): - model = bigframes.ml.llm.GeminiTextGenerator( - model_name="gemini-pro", max_iterations=1 - ) + model = llm.GeminiTextGenerator(model_name="gemini-pro", max_iterations=1) X_train = llm_fine_tune_df_default_index[["prompt"]] y_train = llm_fine_tune_df_default_index[["label"]] @@ -120,9 +80,14 @@ def test_llm_gemini_configure_fit(llm_fine_tune_df_default_index, llm_remote_tex top_k=20, top_p=0.5, ).to_pandas() - assert df.shape == (3, 4) - assert "ml_generate_text_llm_result" in df.columns - series = df["ml_generate_text_llm_result"] - assert all(series.str.len() == 1) - + utils.check_pandas_df_schema_and_index( + df, + columns=[ + "ml_generate_text_llm_result", + "ml_generate_text_rai_result", + "ml_generate_text_status", + "prompt", + ], + index=3, + ) # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 36d01e126f..ee9d654d93 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -369,6 +369,51 @@ def test_gemini_text_generator_predict_with_params_success( assert all(series.str.len() > 20) +@pytest.mark.flaky(retries=2) +def test_llm_palm_score(llm_fine_tune_df_default_index): + model = llm.PaLM2TextGenerator(model_name="text-bison") + + # Check score to ensure the model was fitted + score_result = model.score( + X=llm_fine_tune_df_default_index[["prompt"]], + y=llm_fine_tune_df_default_index[["label"]], + ).to_pandas() + utils.check_pandas_df_schema_and_index( + score_result, + columns=[ + "bleu4_score", + "rouge-l_precision", + "rouge-l_recall", + "rouge-l_f1_score", + "evaluation_status", + ], + index=1, + ) + + +@pytest.mark.flaky(retries=2) +def test_llm_palm_score_params(llm_fine_tune_df_default_index): + model = llm.PaLM2TextGenerator(model_name="text-bison", max_iterations=1) + + # Check score to ensure the model was fitted + score_result = model.score( + X=llm_fine_tune_df_default_index["prompt"], + y=llm_fine_tune_df_default_index["label"], + task_type="classification", + ).to_pandas() + utils.check_pandas_df_schema_and_index( + score_result, + columns=[ + "precision", + "recall", + "f1_score", + "label", + "evaluation_status", + ], + index=6, + ) + + @pytest.mark.flaky(retries=2) def test_llm_gemini_pro_score(llm_fine_tune_df_default_index): model = llm.GeminiTextGenerator(model_name="gemini-pro") From 4e7e67bfe07844b1faa9763c6ab1ad7ac89ed043 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 11 Jul 2024 14:24:48 -0700 Subject: [PATCH 08/36] refactor: SQL builder supports partial ordering (#838) --- bigframes/core/compile/api.py | 3 +- bigframes/core/compile/compiled.py | 46 ++++++++++------- bigframes/core/compile/compiler.py | 4 +- bigframes/core/ordering.py | 80 +++++++++++++++++++++++++----- 4 files changed, 101 insertions(+), 32 deletions(-) diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index 1f7d0a4507..a918cbd324 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -51,7 +51,8 @@ def compile_raw( ir = compiler.compile_ordered_ir(node) sql = ir.raw_sql() ordering_info = ir._ordering - return sql, ordering_info + assert ir.has_total_order + return sql, ordering_info # type: ignore 
def test_only_try_evaluate(node: bigframes.core.nodes.BigFrameNode): diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index cc601744c1..c822dd331c 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -37,7 +37,9 @@ ascending_over, encode_order_string, IntegerEncoding, + join_orderings, OrderingExpression, + RowOrdering, TotalOrdering, ) import bigframes.core.schema as schemata @@ -519,7 +521,7 @@ def __init__( table: ibis_types.Table, columns: Sequence[ibis_types.Value], hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, - ordering: TotalOrdering = TotalOrdering(), + ordering: RowOrdering = RowOrdering(), predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): super().__init__(table, columns, predicates) @@ -566,6 +568,10 @@ def __init__( def is_ordered_ir(self) -> bool: return True + @property + def has_total_order(self) -> bool: + return isinstance(self._ordering, TotalOrdering) + @classmethod def from_pandas( cls, @@ -757,16 +763,13 @@ def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR: ], table_w_unnest[unnest_offset_id], ] - ordering = TotalOrdering( - ordering_value_columns=tuple( - [ - *self._ordering.ordering_value_columns, - ascending_over(unnest_offset_id), - ] - ), - total_ordering_columns=frozenset( - [*self._ordering.total_ordering_columns, unnest_offset_id] - ), + l_mappings = {id: id for id in self._ordering.referenced_columns} + r_mappings = {unnest_offset_id: unnest_offset_id} + ordering = join_orderings( + self._ordering, + TotalOrdering.from_offset_col(unnest_offset_id), + l_mappings, + r_mappings, ) return OrderedIR( @@ -1150,12 +1153,19 @@ def _bake_ordering(self) -> OrderedIR: self._ibis_bindings[expr.scalar_expression.id] ) - new_ordering = TotalOrdering( - tuple(new_exprs), - self._ordering.integer_encoding, - self._ordering.string_encoding, - self._ordering.total_ordering_columns, - ) + if isinstance(self._ordering, TotalOrdering): + new_ordering: RowOrdering = TotalOrdering( + tuple(new_exprs), + self._ordering.integer_encoding, + self._ordering.string_encoding, + self._ordering.total_ordering_columns, + ) + else: + new_ordering = RowOrdering( + tuple(new_exprs), + self._ordering.integer_encoding, + self._ordering.string_encoding, + ) return OrderedIR( self._table, columns=self.columns, @@ -1297,7 +1307,7 @@ class Builder: def __init__( self, table: ibis_types.Table, - ordering: TotalOrdering, + ordering: RowOrdering, columns: Collection[ibis_types.Value] = (), hidden_ordering_columns: Collection[ibis_types.Value] = (), predicates: Optional[Collection[ibis_types.BooleanValue]] = None, diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index ca9c479fff..5bea88739d 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -38,7 +38,9 @@ def compile_ordered_ir(node: nodes.BigFrameNode) -> compiled.OrderedIR: - return typing.cast(compiled.OrderedIR, compile_node(node, True)) + ir = typing.cast(compiled.OrderedIR, compile_node(node, True)) + assert ir.has_total_order + return ir def compile_unordered_ir(node: nodes.BigFrameNode) -> compiled.UnorderedIR: diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py index 406ca52731..bff7e2ce44 100644 --- a/bigframes/core/ordering.py +++ b/bigframes/core/ordering.py @@ -98,6 +98,8 @@ class RowOrdering: """Immutable object that holds information about the ordering of rows in a ArrayValue object. 
May not be unambiguous.""" ordering_value_columns: typing.Tuple[OrderingExpression, ...] = () + integer_encoding: IntegerEncoding = IntegerEncoding(False) + string_encoding: StringEncoding = StringEncoding(False) @property def all_ordering_columns(self) -> Sequence[OrderingExpression]: @@ -111,6 +113,20 @@ def referenced_columns(self) -> Set[str]: for col in part.scalar_expression.unbound_variables ) + @property + def is_string_encoded(self) -> bool: + """True if ordering is fully defined by a fixed length string column.""" + return self.string_encoding.is_encoded + + @property + def is_sequential(self) -> bool: + return self.integer_encoding.is_encoded and self.integer_encoding.is_sequential + + @property + def total_order_col(self) -> Optional[OrderingExpression]: + """Returns column id of columns that defines total ordering, if such as column exists""" + return None + def with_reverse(self) -> RowOrdering: """Reverses the ordering.""" return RowOrdering( @@ -121,17 +137,66 @@ def with_column_remap(self, mapping: typing.Mapping[str, str]) -> RowOrdering: new_value_columns = [ col.remap_names(mapping) for col in self.all_ordering_columns ] - return TotalOrdering( + return RowOrdering( tuple(new_value_columns), ) + def with_non_sequential(self): + """Create a copy that is marked as non-sequential. + + This is useful when filtering, but not sorting, an expression. + """ + if self.integer_encoding.is_sequential: + return RowOrdering( + self.ordering_value_columns, + integer_encoding=IntegerEncoding( + self.integer_encoding.is_encoded, is_sequential=False + ), + ) + + return self + + def with_ordering_columns( + self, + ordering_value_columns: Sequence[OrderingExpression] = (), + ) -> RowOrdering: + """Creates a new ordering that reorders by the given columns. + + Args: + ordering_value_columns: + In decreasing precedence order, the values used to sort the ordering + + Returns: + Modified ExpressionOrdering + """ + + # Truncate to remove any unneded col references after all total order cols included + new_ordering = self._truncate_ordering( + (*ordering_value_columns, *self.ordering_value_columns) + ) + return RowOrdering( + new_ordering, + ) + + def _truncate_ordering( + self, order_refs: tuple[OrderingExpression, ...] + ) -> tuple[OrderingExpression, ...]: + # Truncate once we refer to a full key in bijective operations + columns_seen: Set[str] = set() + truncated_refs = [] + for order_part in order_refs: + expr = order_part.scalar_expression + if not set(expr.unbound_variables).issubset(columns_seen): + if expr.is_bijective: + columns_seen.update(expr.unbound_variables) + truncated_refs.append(order_part) + return tuple(truncated_refs) + @dataclass(frozen=True) class TotalOrdering(RowOrdering): """Immutable object that holds information about the ordering of rows in a ArrayValue object. Guaranteed to be unambiguous.""" - integer_encoding: IntegerEncoding = IntegerEncoding(False) - string_encoding: StringEncoding = StringEncoding(False) # A table has a total ordering defined by the identities of a set of 1 or more columns. # These columns must always be part of the ordering, in order to guarantee that the ordering is total. # Therefore, any modifications(or drops) done to these columns must result in hidden copies being made. 
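The hunks above split ordering metadata into a base RowOrdering, which may leave row order ambiguous, and TotalOrdering, which is anchored by total_ordering_columns and guaranteed unambiguous. A minimal sketch of the distinction, assuming only the constructors and helpers visible in this diff (ascending_over, total_order_col, with_reverse) and using made-up column names:

from bigframes.core.ordering import RowOrdering, TotalOrdering, ascending_over

# A partial ordering: rows sort by "a", but ties on "a" remain ambiguous.
partial = RowOrdering((ascending_over("a"),))
assert partial.total_order_col is None  # the base class never promises a total order

# A total ordering: "row_id" uniquely identifies rows, so the order is unambiguous.
total = TotalOrdering(
    ordering_value_columns=(ascending_over("a"), ascending_over("row_id")),
    total_ordering_columns=frozenset(["row_id"]),
)
reversed_total = total.with_reverse()  # orderings are immutable; this returns a copy

This split is also why compile_raw in api.py above now asserts has_total_order before exposing the ordering information to callers.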
@@ -234,15 +299,6 @@ def total_order_col(self) -> Optional[OrderingExpression]: return None return order_ref - @property - def is_string_encoded(self) -> bool: - """True if ordering is fully defined by a fixed length string column.""" - return self.string_encoding.is_encoded - - @property - def is_sequential(self) -> bool: - return self.integer_encoding.is_encoded and self.integer_encoding.is_sequential - def encode_order_string( order_id: ibis_types.IntegerColumn, length: int = DEFAULT_ORDERING_ID_LENGTH From 93785cb48be4a2eb8770129148bd0b897fed4ee7 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 11 Jul 2024 17:34:27 -0700 Subject: [PATCH 09/36] fix: Fix unordered mode using ordered path to print frame (#839) --- bigframes/core/blocks.py | 7 +++++-- tests/system/small/test_unordered.py | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index fab0035e1a..c2bf20076a 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -498,7 +498,7 @@ def to_pandas( sampling_method: Optional[str] = None, random_state: Optional[int] = None, *, - ordered: bool = True, + ordered: Optional[bool] = None, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame.""" if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS): @@ -517,7 +517,10 @@ def to_pandas( df, query_job = self._materialize_local( materialize_options=MaterializationOptions( - downsampling=sampling, ordered=ordered + downsampling=sampling, + ordered=ordered + if ordered is not None + else self.session._strictly_ordered, ) ) df.set_axis(self.column_labels, axis=1, copy=False) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 36bf2a2585..4448ddc838 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -31,6 +31,12 @@ def test_unordered_mode_cache_aggregate(unordered_session): assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) +def test_unordered_mode_print(unordered_session): + pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) + df = bpd.DataFrame(pd_df, session=unordered_session).cache() + print(df) + + @skip_legacy_pandas def test_unordered_mode_read_gbq(unordered_session): df = unordered_session.read_gbq( From c6d1c7c9e0608e55bc7121ea8f97146a2581b9b7 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Fri, 12 Jul 2024 12:26:14 -0700 Subject: [PATCH 10/36] chore: fix notebook test. 
(#841) --- notebooks/regression/sklearn_linear_regression.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/regression/sklearn_linear_regression.ipynb b/notebooks/regression/sklearn_linear_regression.ipynb index 2873527449..95aa314bb0 100644 --- a/notebooks/regression/sklearn_linear_regression.ipynb +++ b/notebooks/regression/sklearn_linear_regression.ipynb @@ -857,7 +857,7 @@ "from bigframes.ml.preprocessing import StandardScaler, OneHotEncoder\n", "\n", "preprocessing = ColumnTransformer([\n", - " (\"onehot\", OneHotEncoder(), [\"island\", \"species\", \"sex\"]),\n", + " (\"onehot\", OneHotEncoder(), [\"island\", \"sex\"]),\n", " (\"scaler\", StandardScaler(), [\"culmen_depth_mm\", \"culmen_length_mm\", \"flipper_length_mm\"]),\n", "])\n", "\n", From 877bcc052b27a59485bb155cafed29c640642c96 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 12 Jul 2024 16:01:34 -0700 Subject: [PATCH 11/36] refactor: Turn compiler into object with toggleable order strictness (#840) --- bigframes/core/compile/api.py | 18 +- bigframes/core/compile/compiler.py | 488 +++++++++++++++-------------- bigframes/core/nodes.py | 2 +- 3 files changed, 258 insertions(+), 250 deletions(-) diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index a918cbd324..4e76d42bef 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -22,24 +22,28 @@ import bigframes.core.ordering import bigframes.core.schema +_STRICT_COMPILER = compiler.Compiler(strict=True) + def compile_peek(node: bigframes.core.nodes.BigFrameNode, n_rows: int) -> str: """Compile node into sql that selects N arbitrary rows, may not execute deterministically.""" - return compiler.compile_unordered_ir(node).peek_sql(n_rows) + return _STRICT_COMPILER.compile_unordered_ir(node).peek_sql(n_rows) def compile_unordered( node: bigframes.core.nodes.BigFrameNode, *, col_id_overrides: Mapping[str, str] = {} ) -> str: """Compile node into sql where rows are unsorted, and no ordering information is preserved.""" - return compiler.compile_unordered_ir(node).to_sql(col_id_overrides=col_id_overrides) + return _STRICT_COMPILER.compile_unordered_ir(node).to_sql( + col_id_overrides=col_id_overrides + ) def compile_ordered( node: bigframes.core.nodes.BigFrameNode, *, col_id_overrides: Mapping[str, str] = {} ) -> str: """Compile node into sql where rows are sorted with ORDER BY.""" - return compiler.compile_ordered_ir(node).to_sql( + return _STRICT_COMPILER.compile_ordered_ir(node).to_sql( col_id_overrides=col_id_overrides, ordered=True ) @@ -48,7 +52,7 @@ def compile_raw( node: bigframes.core.nodes.BigFrameNode, ) -> Tuple[str, bigframes.core.ordering.TotalOrdering]: """Compile node into sql that exposes all columns, including hidden ordering-only columns.""" - ir = compiler.compile_ordered_ir(node) + ir = _STRICT_COMPILER.compile_ordered_ir(node) sql = ir.raw_sql() ordering_info = ir._ordering assert ir.has_total_order @@ -57,7 +61,9 @@ def compile_raw( def test_only_try_evaluate(node: bigframes.core.nodes.BigFrameNode): """Use only for unit testing paths - not fully featured. 
Will throw exception if fails.""" - ibis = compiler.compile_ordered_ir(node)._to_ibis_expr(ordering_mode="unordered") + ibis = _STRICT_COMPILER.compile_ordered_ir(node)._to_ibis_expr( + ordering_mode="unordered" + ) return ibis.pandas.connect({}).execute(ibis) @@ -65,7 +71,7 @@ def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode): """Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema.""" import bigframes.core.schema - compiled = compiler.compile_unordered_ir(node) + compiled = _STRICT_COMPILER.compile_unordered_ir(node) items = tuple( bigframes.core.schema.SchemaItem(id, compiled.get_column_type(id)) for id in compiled.column_ids diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 5bea88739d..da74ffeb8f 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import annotations +import dataclasses import functools import io import typing @@ -37,99 +38,191 @@ import bigframes.session -def compile_ordered_ir(node: nodes.BigFrameNode) -> compiled.OrderedIR: - ir = typing.cast(compiled.OrderedIR, compile_node(node, True)) - assert ir.has_total_order - return ir - - -def compile_unordered_ir(node: nodes.BigFrameNode) -> compiled.UnorderedIR: - return typing.cast(compiled.UnorderedIR, compile_node(node, False)) - - -def compile_peak_sql(node: nodes.BigFrameNode, n_rows: int) -> typing.Optional[str]: - return compile_unordered_ir(node).peek_sql(n_rows) - - -# TODO: Remove cache when schema no longer requires compilation to derive schema (and therefor only compiles for execution) -@functools.lru_cache(maxsize=5000) -def compile_node( - node: nodes.BigFrameNode, ordered: bool = True -) -> compiled.UnorderedIR | compiled.OrderedIR: - """Compile node into CompileArrayValue. Caches result.""" - return _compile_node(node, ordered) - - -@functools.singledispatch -def _compile_node( - node: nodes.BigFrameNode, ordered: bool = True -) -> compiled.UnorderedIR: - """Defines transformation but isn't cached, always use compile_node instead""" - raise ValueError(f"Can't compile unrecognized node: {node}") - +@dataclasses.dataclass(frozen=True) +class Compiler: + # In strict mode, ordering will always be deterministic + # In unstrict mode, ordering from ReadTable or after joins may be ambiguous to improve query performance. + strict: bool = True + + def compile_ordered_ir(self, node: nodes.BigFrameNode) -> compiled.OrderedIR: + ir = typing.cast(compiled.OrderedIR, self.compile_node(node, True)) + assert ir.has_total_order + return ir + + def compile_unordered_ir(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR: + return typing.cast(compiled.UnorderedIR, self.compile_node(node, False)) + + def compile_peak_sql( + self, node: nodes.BigFrameNode, n_rows: int + ) -> typing.Optional[str]: + return self.compile_unordered_ir(node).peek_sql(n_rows) + + # TODO: Remove cache when schema no longer requires compilation to derive schema (and therefor only compiles for execution) + @functools.lru_cache(maxsize=5000) + def compile_node( + self, node: nodes.BigFrameNode, ordered: bool = True + ) -> compiled.UnorderedIR | compiled.OrderedIR: + """Compile node into CompileArrayValue. 
Caches result.""" + return self._compile_node(node, ordered) + + @functools.singledispatchmethod + def _compile_node( + self, node: nodes.BigFrameNode, ordered: bool = True + ) -> compiled.UnorderedIR: + """Defines transformation but isn't cached, always use compile_node instead""" + raise ValueError(f"Can't compile unrecognized node: {node}") + + @_compile_node.register + def compile_join(self, node: nodes.JoinNode, ordered: bool = True): + if ordered: + left_ordered = self.compile_ordered_ir(node.left_child) + right_ordered = self.compile_ordered_ir(node.right_child) + return bigframes.core.compile.single_column.join_by_column_ordered( + left=left_ordered, + right=right_ordered, + join=node.join, + ) + else: + left_unordered = self.compile_unordered_ir(node.left_child) + right_unordered = self.compile_unordered_ir(node.right_child) + return bigframes.core.compile.single_column.join_by_column_unordered( + left=left_unordered, + right=right_unordered, + join=node.join, + ) -@_compile_node.register -def compile_join(node: nodes.JoinNode, ordered: bool = True): - if ordered: - left_ordered = compile_ordered_ir(node.left_child) - right_ordered = compile_ordered_ir(node.right_child) - return bigframes.core.compile.single_column.join_by_column_ordered( - left=left_ordered, - right=right_ordered, - join=node.join, + @_compile_node.register + def compile_readlocal(self, node: nodes.ReadLocalNode, ordered: bool = True): + array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) + ordered_ir = compiled.OrderedIR.from_pandas(array_as_pd, node.schema) + if ordered: + return ordered_ir + else: + return ordered_ir.to_unordered() + + @_compile_node.register + def compile_cached_table(self, node: nodes.CachedTableNode, ordered: bool = True): + full_table_name = f"{node.project_id}.{node.dataset_id}.{node.table_id}" + used_columns = ( + *node.schema.names, + *node.hidden_columns, ) - else: - left_unordered = compile_unordered_ir(node.left_child) - right_unordered = compile_unordered_ir(node.right_child) - return bigframes.core.compile.single_column.join_by_column_unordered( - left=left_unordered, - right=right_unordered, - join=node.join, + # Physical schema might include unused columns, unsupported datatypes like JSON + physical_schema = ibis.backends.bigquery.BigQuerySchema.to_ibis( + list(i for i in node.physical_schema if i.name in used_columns) ) + ibis_table = ibis.table(physical_schema, full_table_name) + if ordered: + if node.ordering is None: + # If this happens, session malfunctioned while applying cached results. + raise ValueError( + "Cannot use unordered cached value. Result requires ordering information." + ) + if self.strict and not isinstance(node.ordering, bf_ordering.TotalOrdering): + raise ValueError( + "Cannot use partially ordered cached value. Result requires total ordering information." 
+ ) + return compiled.OrderedIR( + ibis_table, + columns=tuple( + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( + ibis_table[col] + ) + for col in node.schema.names + ), + ordering=node.ordering, + hidden_ordering_columns=[ibis_table[c] for c in node.hidden_columns], + ) + else: + return compiled.UnorderedIR( + ibis_table, + columns=tuple( + bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( + ibis_table[col] + ) + for col in node.schema.names + ), + ) -@_compile_node.register -def compile_readlocal(node: nodes.ReadLocalNode, ordered: bool = True): - array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes)) - ordered_ir = compiled.OrderedIR.from_pandas(array_as_pd, node.schema) - if ordered: - return ordered_ir - else: - return ordered_ir.to_unordered() - - -@_compile_node.register -def compile_cached_table(node: nodes.CachedTableNode, ordered: bool = True): - full_table_name = f"{node.project_id}.{node.dataset_id}.{node.table_id}" - used_columns = ( - *node.schema.names, - *node.hidden_columns, - ) - # Physical schema might include unused columns, unsupported datatypes like JSON - physical_schema = ibis.backends.bigquery.BigQuerySchema.to_ibis( - list(i for i in node.physical_schema if i.name in used_columns) - ) - ibis_table = ibis.table(physical_schema, full_table_name) - if ordered: - if node.ordering is None: - # If this happens, session malfunctioned while applying cached results. - raise ValueError( - "Cannot use unordered cached value. Result requires ordering information." + @_compile_node.register + def compile_readtable(self, node: nodes.ReadTableNode, ordered: bool = True): + if ordered: + return self.compile_read_table_ordered(node) + else: + return self.compile_read_table_unordered(node) + + def read_table_as_unordered_ibis( + self, node: nodes.ReadTableNode + ) -> ibis.expr.types.Table: + full_table_name = f"{node.project_id}.{node.dataset_id}.{node.table_id}" + used_columns = ( + *node.schema.names, + *[i for i in node.total_order_cols if i not in node.schema.names], + ) + # Physical schema might include unused columns, unsupported datatypes like JSON + physical_schema = ibis.backends.bigquery.BigQuerySchema.to_ibis( + list(i for i in node.physical_schema if i.name in used_columns) + ) + if node.at_time is not None or node.sql_predicate is not None: + import bigframes.session._io.bigquery + + sql = bigframes.session._io.bigquery.to_query( + full_table_name, + columns=used_columns, + sql_predicate=node.sql_predicate, + time_travel_timestamp=node.at_time, ) - return compiled.OrderedIR( + return ibis.backends.bigquery.Backend().sql( + schema=physical_schema, query=sql + ) + else: + return ibis.table(physical_schema, full_table_name) + + def compile_read_table_unordered(self, node: nodes.ReadTableNode): + ibis_table = self.read_table_as_unordered_ibis(node) + return compiled.UnorderedIR( ibis_table, - columns=tuple( + tuple( bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( ibis_table[col] ) for col in node.schema.names ), - ordering=node.ordering, - hidden_ordering_columns=[ibis_table[c] for c in node.hidden_columns], ) - else: - return compiled.UnorderedIR( + def compile_read_table_ordered(self, node: nodes.ReadTableNode): + ibis_table = self.read_table_as_unordered_ibis(node) + if node.total_order_cols: + ordering_value_columns = tuple( + bf_ordering.ascending_over(col) for col in node.total_order_cols + ) + if node.order_col_is_sequential: + integer_encoding = bf_ordering.IntegerEncoding( + is_encoded=True, is_sequential=True 
+ ) + else: + integer_encoding = bf_ordering.IntegerEncoding() + ordering: bf_ordering.RowOrdering = bf_ordering.TotalOrdering( + ordering_value_columns, + integer_encoding=integer_encoding, + total_ordering_columns=frozenset(node.total_order_cols), + ) + hidden_columns = () + elif self.strict: + ibis_table, ordering = default_ordering.gen_default_ordering( + ibis_table, use_double_hash=True + ) + hidden_columns = tuple( + ibis_table[col] + for col in ibis_table.columns + if col not in node.schema.names + ) + else: + # In unstrict mode, don't generate total ordering from hashing as this is + # expensive (prevent removing any columns from table scan) + ordering, hidden_columns = bf_ordering.RowOrdering(), () + return compiled.OrderedIR( ibis_table, columns=tuple( bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( @@ -137,182 +230,91 @@ def compile_cached_table(node: nodes.CachedTableNode, ordered: bool = True): ) for col in node.schema.names ), + ordering=ordering, + hidden_ordering_columns=hidden_columns, ) + @_compile_node.register + def compile_promote_offsets( + self, node: nodes.PromoteOffsetsNode, ordered: bool = True + ): + result = self.compile_ordered_ir(node.child).promote_offsets(node.col_id) + return result if ordered else result.to_unordered() -@_compile_node.register -def compile_readtable(node: nodes.ReadTableNode, ordered: bool = True): - if ordered: - return compile_read_table_ordered(node) - else: - return compile_read_table_unordered(node) - - -def read_table_as_unordered_ibis(node: nodes.ReadTableNode) -> ibis.expr.types.Table: - full_table_name = f"{node.project_id}.{node.dataset_id}.{node.table_id}" - used_columns = ( - *node.schema.names, - *[i for i in node.total_order_cols if i not in node.schema.names], - ) - # Physical schema might include unused columns, unsupported datatypes like JSON - physical_schema = ibis.backends.bigquery.BigQuerySchema.to_ibis( - list(i for i in node.physical_schema if i.name in used_columns) - ) - if node.at_time is not None or node.sql_predicate is not None: - import bigframes.session._io.bigquery - - sql = bigframes.session._io.bigquery.to_query( - full_table_name, - columns=used_columns, - sql_predicate=node.sql_predicate, - time_travel_timestamp=node.at_time, - ) - return ibis.backends.bigquery.Backend().sql(schema=physical_schema, query=sql) - else: - return ibis.table(physical_schema, full_table_name) - + @_compile_node.register + def compile_filter(self, node: nodes.FilterNode, ordered: bool = True): + return self.compile_node(node.child, ordered).filter(node.predicate) -def compile_read_table_unordered(node: nodes.ReadTableNode): - ibis_table = read_table_as_unordered_ibis(node) - return compiled.UnorderedIR( - ibis_table, - tuple( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - ibis_table[col] - ) - for col in node.schema.names - ), - ) + @_compile_node.register + def compile_orderby(self, node: nodes.OrderByNode, ordered: bool = True): + if ordered: + return self.compile_ordered_ir(node.child).order_by(node.by) + else: + return self.compile_unordered_ir(node.child) + @_compile_node.register + def compile_reversed(self, node: nodes.ReversedNode, ordered: bool = True): + if ordered: + return self.compile_ordered_ir(node.child).reversed() + else: + return self.compile_unordered_ir(node.child) + + @_compile_node.register + def compile_projection(self, node: nodes.ProjectionNode, ordered: bool = True): + result = self.compile_node(node.child, ordered) + return result.projection(node.assignments) + + 
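The compiler is now a frozen dataclass whose strict flag decides whether a deterministic total ordering is required, and per-node compilation dispatches through functools.singledispatchmethod (the cached compile_node wrapper delegates to _compile_node, whose overloads are registered throughout this class). A minimal sketch of that dispatch pattern, assuming nothing beyond the standard library and a toy TinyCompiler that is not part of bigframes:

import dataclasses
import functools


@dataclasses.dataclass(frozen=True)
class TinyCompiler:
    # Mirrors the Compiler.strict flag: when True, emit a deterministic ORDER BY.
    strict: bool = True

    @functools.singledispatchmethod
    def compile_node(self, node) -> str:
        raise ValueError(f"Can't compile unrecognized node: {node}")

    @compile_node.register
    def _(self, node: int) -> str:
        # Literal nodes compile to a constant projection.
        return f"SELECT {node} AS value"

    @compile_node.register
    def _(self, node: str) -> str:
        # Table nodes compile to a scan, ordered only in strict mode.
        suffix = " ORDER BY 1" if self.strict else ""
        return f"SELECT * FROM `{node}`{suffix}"


print(TinyCompiler(strict=False).compile_node("my_table"))  # SELECT * FROM `my_table`

Keeping dispatch on an instance method, rather than on module-level functions, is what lets the strict flag change behaviour per Compiler instance without threading it through every compile_* call.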
@_compile_node.register + def compile_concat(self, node: nodes.ConcatNode, ordered: bool = True): + if ordered: + compiled_ordered = [self.compile_ordered_ir(node) for node in node.children] + return concat_impl.concat_ordered(compiled_ordered) + else: + compiled_unordered = [ + self.compile_unordered_ir(node) for node in node.children + ] + return concat_impl.concat_unordered(compiled_unordered) + + @_compile_node.register + def compile_rowcount(self, node: nodes.RowCountNode, ordered: bool = True): + result = self.compile_unordered_ir(node.child).row_count() + return result if ordered else result.to_unordered() -def compile_read_table_ordered(node: nodes.ReadTableNode): - ibis_table = read_table_as_unordered_ibis(node) - if node.total_order_cols: - ordering_value_columns = tuple( - bf_ordering.ascending_over(col) for col in node.total_order_cols + @_compile_node.register + def compile_aggregate(self, node: nodes.AggregateNode, ordered: bool = True): + has_ordered_aggregation_ops = any( + aggregate.op.can_order_by for aggregate, _ in node.aggregations ) - if node.order_col_is_sequential: - integer_encoding = bf_ordering.IntegerEncoding( - is_encoded=True, is_sequential=True + if ordered and has_ordered_aggregation_ops: + return self.compile_ordered_ir(node.child).aggregate( + node.aggregations, node.by_column_ids, node.dropna ) else: - integer_encoding = bf_ordering.IntegerEncoding() - ordering = bf_ordering.TotalOrdering( - ordering_value_columns, - integer_encoding=integer_encoding, - total_ordering_columns=frozenset(node.total_order_cols), - ) - hidden_columns = () - else: - ibis_table, ordering = default_ordering.gen_default_ordering( - ibis_table, use_double_hash=True - ) - hidden_columns = tuple( - ibis_table[col] - for col in ibis_table.columns - if col not in node.schema.names - ) - return compiled.OrderedIR( - ibis_table, - columns=tuple( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - ibis_table[col] + result = self.compile_unordered_ir(node.child).aggregate( + node.aggregations, node.by_column_ids, node.dropna ) - for col in node.schema.names - ), - ordering=ordering, - hidden_ordering_columns=hidden_columns, - ) - - -@_compile_node.register -def compile_promote_offsets(node: nodes.PromoteOffsetsNode, ordered: bool = True): - result = compile_ordered_ir(node.child).promote_offsets(node.col_id) - return result if ordered else result.to_unordered() - - -@_compile_node.register -def compile_filter(node: nodes.FilterNode, ordered: bool = True): - return compile_node(node.child, ordered).filter(node.predicate) - - -@_compile_node.register -def compile_orderby(node: nodes.OrderByNode, ordered: bool = True): - if ordered: - return compile_ordered_ir(node.child).order_by(node.by) - else: - return compile_unordered_ir(node.child) - - -@_compile_node.register -def compile_reversed(node: nodes.ReversedNode, ordered: bool = True): - if ordered: - return compile_ordered_ir(node.child).reversed() - else: - return compile_unordered_ir(node.child) - - -@_compile_node.register -def compile_projection(node: nodes.ProjectionNode, ordered: bool = True): - result = compile_node(node.child, ordered) - return result.projection(node.assignments) - - -@_compile_node.register -def compile_concat(node: nodes.ConcatNode, ordered: bool = True): - if ordered: - compiled_ordered = [compile_ordered_ir(node) for node in node.children] - return concat_impl.concat_ordered(compiled_ordered) - else: - compiled_unordered = [compile_unordered_ir(node) for node in node.children] - return 
concat_impl.concat_unordered(compiled_unordered) - - -@_compile_node.register -def compile_rowcount(node: nodes.RowCountNode, ordered: bool = True): - result = compile_unordered_ir(node.child).row_count() - return result if ordered else result.to_unordered() - - -@_compile_node.register -def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): - has_ordered_aggregation_ops = any( - aggregate.op.can_order_by for aggregate, _ in node.aggregations - ) - if ordered and has_ordered_aggregation_ops: - return compile_ordered_ir(node.child).aggregate( - node.aggregations, node.by_column_ids, node.dropna - ) - else: - result = compile_unordered_ir(node.child).aggregate( - node.aggregations, node.by_column_ids, node.dropna + return result if ordered else result.to_unordered() + + @_compile_node.register + def compile_window(self, node: nodes.WindowOpNode, ordered: bool = True): + result = self.compile_ordered_ir(node.child).project_window_op( + node.column_name, + node.op, + node.window_spec, + node.output_name, + never_skip_nulls=node.never_skip_nulls, + skip_reproject_unsafe=node.skip_reproject_unsafe, ) return result if ordered else result.to_unordered() + @_compile_node.register + def compile_reproject(self, node: nodes.ReprojectOpNode, ordered: bool = True): + return self.compile_node(node.child, ordered)._reproject_to_table() -@_compile_node.register -def compile_window(node: nodes.WindowOpNode, ordered: bool = True): - result = compile_ordered_ir(node.child).project_window_op( - node.column_name, - node.op, - node.window_spec, - node.output_name, - never_skip_nulls=node.never_skip_nulls, - skip_reproject_unsafe=node.skip_reproject_unsafe, - ) - return result if ordered else result.to_unordered() - - -@_compile_node.register -def compile_reproject(node: nodes.ReprojectOpNode, ordered: bool = True): - return compile_node(node.child, ordered)._reproject_to_table() - - -@_compile_node.register -def compile_explode(node: nodes.ExplodeNode, ordered: bool = True): - return compile_node(node.child, ordered).explode(node.column_ids) - + @_compile_node.register + def compile_explode(self, node: nodes.ExplodeNode, ordered: bool = True): + return self.compile_node(node.child, ordered).explode(node.column_ids) -@_compile_node.register -def compile_random_sample(node: nodes.RandomSampleNode, ordered: bool = True): - return compile_node(node.child, ordered)._uniform_sampling(node.fraction) + @_compile_node.register + def compile_random_sample(self, node: nodes.RandomSampleNode, ordered: bool = True): + return self.compile_node(node.child, ordered)._uniform_sampling(node.fraction) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index dbcfc282e4..aadab9f5cc 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -372,7 +372,7 @@ class CachedTableNode(BigFrameNode): table_id: str = field() physical_schema: Tuple[bq.SchemaField, ...] 
= field() - ordering: typing.Optional[orderings.TotalOrdering] = field() + ordering: typing.Optional[orderings.RowOrdering] = field() def __post_init__(self): # enforce invariants From ed06436612c0d46f190f79721416d473bde7e2f4 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 15 Jul 2024 16:30:30 -0700 Subject: [PATCH 12/36] feat: support remote function cleanup with `session.close` (#818) * feat: support remote function cleanup with `session.close` * accept the possibility that the artifact may have already been deleted * add cleanup by previous session id * add more documentation * hold session artifacts in a remote function session class * fix the missing return keyword --- bigframes/functions/remote_function.py | 987 ++++++++++++--------- bigframes/pandas/__init__.py | 17 +- bigframes/session/__init__.py | 28 +- tests/system/large/test_remote_function.py | 150 +++- tests/system/large/test_session.py | 10 +- 5 files changed, 756 insertions(+), 436 deletions(-) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index c1878b6c31..f24ba1b5fb 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -24,14 +24,17 @@ import string import sys import tempfile +import threading from typing import ( Any, cast, + Dict, List, Mapping, NamedTuple, Optional, Sequence, + Set, Tuple, TYPE_CHECKING, Union, @@ -67,11 +70,80 @@ logger = logging.getLogger(__name__) +# Naming convention for the remote function artifacts +_BIGFRAMES_REMOTE_FUNCTION_PREFIX = "bigframes" +_BQ_FUNCTION_NAME_SEPERATOR = "_" +_GCF_FUNCTION_NAME_SEPERATOR = "-" + # Protocol version 4 is available in python version 3.4 and above # https://docs.python.org/3/library/pickle.html#data-stream-format _pickle_protocol_version = 4 +def _clean_up_by_session_id( + bqclient: bigquery.Client, + gcfclient: functions_v2.FunctionServiceClient, + dataset: bigquery.DatasetReference, + session_id: str, +): + """Delete remote function artifacts for a session id, where the session id + was not necessarily created in the current runtime. This is useful if the + user worked with a BigQuery DataFrames session previously and remembered the + session id, and now wants to clean up its temporary resources at a later + point in time. 
+ """ + + # First clean up the BQ remote functions and then the underlying + # cloud functions, so that at no point we are left with a remote function + # that is pointing to a cloud function that does not exist + + endpoints_to_be_deleted: Set[str] = set() + match_prefix = "".join( + [ + _BIGFRAMES_REMOTE_FUNCTION_PREFIX, + _BQ_FUNCTION_NAME_SEPERATOR, + session_id, + _BQ_FUNCTION_NAME_SEPERATOR, + ] + ) + for routine in bqclient.list_routines(dataset): + routine = cast(bigquery.Routine, routine) + + # skip past the routines not belonging to the given session id, or + # non-remote-function routines + if ( + routine.type_ != bigquery.RoutineType.SCALAR_FUNCTION + or not cast(str, routine.routine_id).startswith(match_prefix) + or not routine.remote_function_options + or not routine.remote_function_options.endpoint + ): + continue + + # Let's forgive the edge case possibility that the BQ remote function + # may have been deleted at the same time directly by the user + bqclient.delete_routine(routine, not_found_ok=True) + endpoints_to_be_deleted.add(routine.remote_function_options.endpoint) + + # Now clean up the cloud functions + bq_location = bqclient.get_dataset(dataset).location + bq_location, gcf_location = get_remote_function_locations(bq_location) + parent_path = gcfclient.common_location_path( + project=dataset.project, location=gcf_location + ) + for gcf in gcfclient.list_functions(parent=parent_path): + # skip past the cloud functions not attached to any BQ remote function + # belonging to the given session id + if gcf.service_config.uri not in endpoints_to_be_deleted: + continue + + # Let's forgive the edge case possibility that the cloud function + # may have been deleted at the same time directly by the user + try: + gcfclient.delete_function(name=gcf.name) + except google.api_core.exceptions.NotFound: + pass + + def get_remote_function_locations(bq_location): """Get BQ location and cloud functions region given a BQ client.""" # TODO(shobs, b/274647164): Find the best way to determine default location. @@ -102,7 +174,9 @@ def _get_hash(def_, package_requirements=None): return hashlib.md5(def_repr).hexdigest() -def _get_updated_package_requirements(package_requirements, is_row_processor): +def _get_updated_package_requirements( + package_requirements=None, is_row_processor=False +): requirements = [f"cloudpickle=={cloudpickle.__version__}"] if is_row_processor: # bigframes remote function will send an entire row of data as json, @@ -130,31 +204,20 @@ class IbisSignature(NamedTuple): output_type: IbisDataType -def get_cloud_function_name( - def_, uniq_suffix=None, package_requirements=None, is_row_processor=False -): +def get_cloud_function_name(function_hash, session_id, uniq_suffix=None): "Get a name for the cloud function for the given user defined function." 
- - # Augment user package requirements with any internal package - # requirements - package_requirements = _get_updated_package_requirements( - package_requirements, is_row_processor - ) - - cf_name = _get_hash(def_, package_requirements) - cf_name = f"bigframes-{cf_name}" # for identification + parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX, session_id, function_hash] if uniq_suffix: - cf_name = f"{cf_name}-{uniq_suffix}" - return cf_name, package_requirements + parts.append(uniq_suffix) + return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) -def get_remote_function_name(def_, uniq_suffix=None, package_requirements=None): +def get_remote_function_name(function_hash, session_id, uniq_suffix=None): "Get a name for the BQ remote function for the given user defined function." - bq_rf_name = _get_hash(def_, package_requirements) - bq_rf_name = f"bigframes_{bq_rf_name}" # for identification + parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX, session_id, function_hash] if uniq_suffix: - bq_rf_name = f"{bq_rf_name}_{uniq_suffix}" - return bq_rf_name + parts.append(uniq_suffix) + return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) class RemoteFunctionClient: @@ -272,6 +335,10 @@ def get_cloud_function_fully_qualified_name(self, name): self._gcp_project_id, self._cloud_function_region, name ) + def get_remote_function_fully_qualilfied_name(self, name): + "Get the fully qualilfied name for a BQ remote function." + return f"{self._gcp_project_id}.{self._bq_dataset}.{name}" + def get_cloud_function_endpoint(self, name): """Get the http endpoint of a cloud function if it exists.""" fully_qualified_name = self.get_cloud_function_fully_qualified_name(name) @@ -478,20 +545,31 @@ def provision_bq_remote_function( cloud_function_memory_mib, ): """Provision a BigQuery remote function.""" + # Augment user package requirements with any internal package + # requirements + package_requirements = _get_updated_package_requirements( + package_requirements, is_row_processor + ) + + # Compute a unique hash representing the user code + function_hash = _get_hash(def_, package_requirements) + # If reuse of any existing function with the same name (indicated by the # same hash of its source code) is not intended, then attach a unique # suffix to the intended function name to make it unique. uniq_suffix = None if not reuse: + # use 4 digits as a unique suffix which should suffice for + # uniqueness per session uniq_suffix = "".join( - random.choices(string.ascii_lowercase + string.digits, k=8) + random.choices(string.ascii_lowercase + string.digits, k=4) ) # Derive the name of the cloud function underlying the intended BQ # remote function, also collect updated package requirements as # determined in the name resolution - cloud_function_name, package_requirements = get_cloud_function_name( - def_, uniq_suffix, package_requirements, is_row_processor + cloud_function_name = get_cloud_function_name( + function_hash, self._session.session_id, uniq_suffix ) cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) @@ -516,7 +594,7 @@ def provision_bq_remote_function( remote_function_name = name if not remote_function_name: remote_function_name = get_remote_function_name( - def_, uniq_suffix, package_requirements + function_hash, self._session.session_id, uniq_suffix ) rf_endpoint, rf_conn = self.get_remote_function_specs(remote_function_name) @@ -524,6 +602,7 @@ def provision_bq_remote_function( # 1. It does not exist # 2. 
It exists but the existing remote function has different # configuration than intended + created_new = False if not rf_endpoint or ( rf_endpoint != cf_endpoint or rf_conn != self._bq_connection_id ): @@ -540,10 +619,12 @@ def provision_bq_remote_function( remote_function_name, max_batching_rows, ) + + created_new = True else: logger.info(f"Remote function {remote_function_name} already exists.") - return remote_function_name, cloud_function_name + return remote_function_name, cloud_function_name, created_new def get_remote_function_specs(self, remote_function_name): """Check whether a remote function already exists for the udf.""" @@ -645,426 +726,500 @@ def get_routine_reference( return dataset_ref.routine(routine_ref_str) -# Inspired by @udf decorator implemented in ibis-bigquery package -# https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py -# which has moved as @js to the ibis package -# https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/udf/__init__.py -def remote_function( - input_types: Union[None, type, Sequence[type]] = None, - output_type: Optional[type] = None, - session: Optional[Session] = None, - bigquery_client: Optional[bigquery.Client] = None, - bigquery_connection_client: Optional[ - bigquery_connection_v1.ConnectionServiceClient - ] = None, - cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, - resource_manager_client: Optional[resourcemanager_v3.ProjectsClient] = None, - dataset: Optional[str] = None, - bigquery_connection: Optional[str] = None, - reuse: bool = True, - name: Optional[str] = None, - packages: Optional[Sequence[str]] = None, - cloud_function_service_account: Optional[str] = None, - cloud_function_kms_key_name: Optional[str] = None, - cloud_function_docker_repository: Optional[str] = None, - max_batching_rows: Optional[int] = 1000, - cloud_function_timeout: Optional[int] = 600, - cloud_function_max_instances: Optional[int] = None, - cloud_function_vpc_connector: Optional[str] = None, - cloud_function_memory_mib: Optional[int] = 1024, -): - """Decorator to turn a user defined function into a BigQuery remote function. - - .. deprecated:: 0.0.1 - This is an internal method. Please use :func:`bigframes.pandas.remote_function` instead. - - .. note:: - Please make sure following is setup before using this API: - - 1. Have the below APIs enabled for your project: - - * BigQuery Connection API - * Cloud Functions API - * Cloud Run API - * Cloud Build API - * Artifact Registry API - * Cloud Resource Manager API - - This can be done from the cloud console (change `PROJECT_ID` to yours): - https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID - - Or from the gcloud CLI: - - `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com` - - 2. 
Have following IAM roles enabled for you: - - * BigQuery Data Editor (roles/bigquery.dataEditor) - * BigQuery Connection Admin (roles/bigquery.connectionAdmin) - * Cloud Functions Developer (roles/cloudfunctions.developer) - * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com` - * Storage Object Viewer (roles/storage.objectViewer) - * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) - - 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: - - 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection - 2. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function - - Alternatively, the IAM could also be setup via the gcloud CLI: - - `$ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"`. - - Args: - input_types (None, type, or sequence(type)): - For scalar user defined function it should be the input type or - sequence of input types. For row processing user defined function, - type `Series` should be specified. - output_type (Optional[type]): - Data type of the output in the user defined function. - session (bigframes.Session, Optional): - BigQuery DataFrames session to use for getting default project, - dataset and BigQuery connection. - bigquery_client (google.cloud.bigquery.Client, Optional): - Client to use for BigQuery operations. If this param is not provided - then bigquery client from the session would be used. - bigquery_connection_client (google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional): - Client to use for BigQuery connection operations. If this param is - not provided then bigquery connection client from the session would - be used. - cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): - Client to use for cloud functions operations. If this param is not - provided then the functions client from the session would be used. - resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): - Client to use for cloud resource management operations, e.g. for - getting and setting IAM roles on cloud resources. If this param is - not provided then resource manager client from the session would be - used. - dataset (str, Optional.): - Dataset in which to create a BigQuery remote function. It should be in - `.` or `` format. If this - parameter is not provided then session dataset id is used. - bigquery_connection (str, Optional): - Name of the BigQuery connection in the form of `CONNECTION_ID` or - `LOCATION.CONNECTION_ID` or `PROJECT_ID.LOCATION.CONNECTION_ID`. - If this param is not provided then the bigquery connection from the session - would be used. If it is pre created in the same location as the - `bigquery_client.location` then it would be used, otherwise it is created - dynamically using the `bigquery_connection_client` assuming the user has necessary - priviliges. The PROJECT_ID should be the same as the BigQuery connection project. - reuse (bool, Optional): - Reuse the remote function if is already exists. 
- `True` by default, which results in reusing an existing remote - function and corresponding cloud function (if any) that was - previously created for the same udf. - Setting it to `False` forces the creation of a unique remote function. - If the required remote function does not exist then it would be - created irrespective of this param. - name (str, Optional): - Explicit name of the persisted BigQuery remote function. Use it with - caution, because two users working in the same project and dataset - could overwrite each other's remote functions if they use the same - persistent name. - packages (str[], Optional): - Explicit name of the external package dependencies. Each dependency - is added to the `requirements.txt` as is, and can be of the form - supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. - cloud_function_service_account (str, Optional): - Service account to use for the cloud functions. If not provided then - the default service account would be used. See - https://cloud.google.com/functions/docs/securing/function-identity - for more details. Please make sure the service account has the - necessary IAM permissions configured as described in - https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. - cloud_function_kms_key_name (str, Optional): - Customer managed encryption key to protect cloud functions and - related data at rest. This is of the format - projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY. - Read https://cloud.google.com/functions/docs/securing/cmek for - more details including granting necessary service accounts - access to the key. - cloud_function_docker_repository (str, Optional): - Docker repository created with the same encryption key as - `cloud_function_kms_key_name` to store encrypted artifacts - created to support the cloud function. This is of the format - projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. - For more details see - https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. - max_batching_rows (int, Optional): - The maximum number of rows to be batched for processing in the - BQ remote function. Default value is 1000. A lower number can be - passed to avoid timeouts in case the user code is too complex to - process large number of rows fast enough. A higher number can be - used to increase throughput in case the user code is fast enough. - `None` can be passed to let BQ remote functions service apply - default batching. See for more details - https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. - cloud_function_timeout (int, Optional): - The maximum amount of time (in seconds) BigQuery should wait for - the cloud function to return a response. See for more details - https://cloud.google.com/functions/docs/configuring/timeout. - Please note that even though the cloud function (2nd gen) itself - allows seeting up to 60 minutes of timeout, BigQuery remote - function can wait only up to 20 minutes, see for more details - https://cloud.google.com/bigquery/quotas#remote_function_limits. - By default BigQuery DataFrames uses a 10 minute timeout. `None` - can be passed to let the cloud functions default timeout take effect. - cloud_function_max_instances (int, Optional): - The maximumm instance count for the cloud function created. This - can be used to control how many cloud function instances can be - active at max at any given point of time. 
Lower setting can help - control the spike in the billing. Higher setting can help - support processing larger scale data. When not specified, cloud - function's default setting applies. For more details see - https://cloud.google.com/functions/docs/configuring/max-instances. - cloud_function_vpc_connector (str, Optional): - The VPC connector you would like to configure for your cloud - function. This is useful if your code needs access to data or - service(s) that are on a VPC network. See for more details - https://cloud.google.com/functions/docs/networking/connecting-vpc. - cloud_function_memory_mib (int, Optional): - The amounts of memory (in mebibytes) to allocate for the cloud - function (2nd gen) created. This also dictates a corresponding - amount of allocated CPU for the function. By default a memory of - 1024 MiB is set for the cloud functions created to support - BigQuery DataFrames remote function. If you want to let the - default memory of cloud functions be allocated, pass `None`. See - for more details - https://cloud.google.com/functions/docs/configuring/memory. - """ - # Some defaults may be used from the session if not provided otherwise - import bigframes.exceptions as bf_exceptions - import bigframes.pandas as bpd - import bigframes.series as bf_series - import bigframes.session - - session = cast(bigframes.session.Session, session or bpd.get_global_session()) - - # A BigQuery client is required to perform BQ operations - if not bigquery_client: - bigquery_client = session.bqclient - if not bigquery_client: - raise ValueError( - "A bigquery client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) +class _RemoteFunctionSession: + """Session to manage remote functions.""" - # A BigQuery connection client is required to perform BQ connection operations - if not bigquery_connection_client: - bigquery_connection_client = session.bqconnectionclient - if not bigquery_connection_client: - raise ValueError( - "A bigquery connection client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) + def __init__(self): + # Session level mapping of remote function artifacts + self._temp_session_artifacts: Dict[str, str] = dict() - # A cloud functions client is required to perform cloud functions operations - if not cloud_functions_client: - cloud_functions_client = session.cloudfunctionsclient - if not cloud_functions_client: - raise ValueError( - "A cloud functions client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) + # Lock to synchronize the update of the session level mapping + self._session_artifacts_lock = threading.Lock() - # A resource manager client is required to get/set IAM operations - if not resource_manager_client: - resource_manager_client = session.resourcemanagerclient - if not resource_manager_client: - raise ValueError( - "A resource manager client must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" - ) + def _update_artifacts(self, bqrf_routine: str, gcf_path: str): + """Update remote function artifacts in the current session.""" + with self._session_artifacts_lock: + self._temp_session_artifacts[bqrf_routine] = gcf_path - # BQ remote function must be persisted, for which we need a dataset - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#:~:text=You%20cannot%20create%20temporary%20remote%20functions. 
- if dataset: - dataset_ref = bigquery.DatasetReference.from_string( - dataset, default_project=bigquery_client.project - ) - else: - dataset_ref = session._anonymous_dataset + def clean_up( + self, + bqclient: bigquery.Client, + gcfclient: functions_v2.FunctionServiceClient, + session_id: str, + ): + """Delete remote function artifacts in the current session.""" + with self._session_artifacts_lock: + for bqrf_routine, gcf_path in self._temp_session_artifacts.items(): + # Let's accept the possibility that the remote function may have + # been deleted directly by the user + bqclient.delete_routine(bqrf_routine, not_found_ok=True) + + # Let's accept the possibility that the cloud function may have + # been deleted directly by the user + try: + gcfclient.delete_function(name=gcf_path) + except google.api_core.exceptions.NotFound: + pass + + self._temp_session_artifacts.clear() + + # Inspired by @udf decorator implemented in ibis-bigquery package + # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py + # which has moved as @js to the ibis package + # https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/udf/__init__.py + def remote_function( + self, + input_types: Union[None, type, Sequence[type]] = None, + output_type: Optional[type] = None, + session: Optional[Session] = None, + bigquery_client: Optional[bigquery.Client] = None, + bigquery_connection_client: Optional[ + bigquery_connection_v1.ConnectionServiceClient + ] = None, + cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, + resource_manager_client: Optional[resourcemanager_v3.ProjectsClient] = None, + dataset: Optional[str] = None, + bigquery_connection: Optional[str] = None, + reuse: bool = True, + name: Optional[str] = None, + packages: Optional[Sequence[str]] = None, + cloud_function_service_account: Optional[str] = None, + cloud_function_kms_key_name: Optional[str] = None, + cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, + cloud_function_timeout: Optional[int] = 600, + cloud_function_max_instances: Optional[int] = None, + cloud_function_vpc_connector: Optional[str] = None, + cloud_function_memory_mib: Optional[int] = 1024, + ): + """Decorator to turn a user defined function into a BigQuery remote function. - bq_location, cloud_function_region = get_remote_function_locations( - bigquery_client.location - ) + .. deprecated:: 0.0.1 + This is an internal method. Please use :func:`bigframes.pandas.remote_function` instead. - # A connection is required for BQ remote function - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function - if not bigquery_connection: - bigquery_connection = session._bq_connection # type: ignore + .. note:: + Please make sure following is setup before using this API: - bigquery_connection = clients.resolve_full_bq_connection_name( - bigquery_connection, - default_project=dataset_ref.project, - default_location=bq_location, - ) - # Guaranteed to be the form of .. - ( - gcp_project_id, - bq_connection_location, - bq_connection_id, - ) = bigquery_connection.split(".") - if gcp_project_id.casefold() != dataset_ref.project.casefold(): - raise ValueError( - "The project_id does not match BigQuery connection gcp_project_id: " - f"{dataset_ref.project}." - ) - if bq_connection_location.casefold() != bq_location.casefold(): - raise ValueError( - "The location does not match BigQuery connection location: " - f"{bq_location}." - ) + 1. 
Have the below APIs enabled for your project: - # If any CMEK is intended then check that a docker repository is also specified - if ( - cloud_function_kms_key_name is not None - and cloud_function_docker_repository is None - ): - raise ValueError( - "cloud_function_docker_repository must be specified with cloud_function_kms_key_name." - " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin" - ) + * BigQuery Connection API + * Cloud Functions API + * Cloud Run API + * Cloud Build API + * Artifact Registry API + * Cloud Resource Manager API - bq_connection_manager = None if session is None else session.bqconnectionmanager + This can be done from the cloud console (change `PROJECT_ID` to yours): + https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID - def wrapper(func): - nonlocal input_types, output_type + Or from the gcloud CLI: - if not callable(func): - raise TypeError("f must be callable, got {}".format(func)) + `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com` - if sys.version_info >= (3, 10): - # Add `eval_str = True` so that deferred annotations are turned into their - # corresponding type objects. Need Python 3.10 for eval_str parameter. - # https://docs.python.org/3/library/inspect.html#inspect.signature - signature_kwargs: Mapping[str, Any] = {"eval_str": True} + 2. Have following IAM roles enabled for you: + + * BigQuery Data Editor (roles/bigquery.dataEditor) + * BigQuery Connection Admin (roles/bigquery.connectionAdmin) + * Cloud Functions Developer (roles/cloudfunctions.developer) + * Service Account User (roles/iam.serviceAccountUser) on the service account `PROJECT_NUMBER-compute@developer.gserviceaccount.com` + * Storage Object Viewer (roles/storage.objectViewer) + * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.) + + 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set: + + 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection + 2. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function + + Alternatively, the IAM could also be setup via the gcloud CLI: + + `$ gcloud projects add-iam-policy-binding PROJECT_ID --member="serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID" --role="roles/run.invoker"`. + + Args: + input_types (None, type, or sequence(type)): + For scalar user defined function it should be the input type or + sequence of input types. For row processing user defined function, + type `Series` should be specified. + output_type (Optional[type]): + Data type of the output in the user defined function. + session (bigframes.Session, Optional): + BigQuery DataFrames session to use for getting default project, + dataset and BigQuery connection. + bigquery_client (google.cloud.bigquery.Client, Optional): + Client to use for BigQuery operations. If this param is not provided + then bigquery client from the session would be used. 
+ bigquery_connection_client (google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional): + Client to use for BigQuery connection operations. If this param is + not provided then bigquery connection client from the session would + be used. + cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): + Client to use for cloud functions operations. If this param is not + provided then the functions client from the session would be used. + resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): + Client to use for cloud resource management operations, e.g. for + getting and setting IAM roles on cloud resources. If this param is + not provided then resource manager client from the session would be + used. + dataset (str, Optional.): + Dataset in which to create a BigQuery remote function. It should be in + `.` or `` format. If this + parameter is not provided then session dataset id is used. + bigquery_connection (str, Optional): + Name of the BigQuery connection in the form of `CONNECTION_ID` or + `LOCATION.CONNECTION_ID` or `PROJECT_ID.LOCATION.CONNECTION_ID`. + If this param is not provided then the bigquery connection from the session + would be used. If it is pre created in the same location as the + `bigquery_client.location` then it would be used, otherwise it is created + dynamically using the `bigquery_connection_client` assuming the user has necessary + priviliges. The PROJECT_ID should be the same as the BigQuery connection project. + reuse (bool, Optional): + Reuse the remote function if is already exists. + `True` by default, which results in reusing an existing remote + function and corresponding cloud function (if any) that was + previously created for the same udf. + Setting it to `False` forces the creation of a unique remote function. + If the required remote function does not exist then it would be + created irrespective of this param. + name (str, Optional): + Explicit name of the persisted BigQuery remote function. Use it with + caution, because two users working in the same project and dataset + could overwrite each other's remote functions if they use the same + persistent name. When an explicit name is provided, any session + specific clean up (``bigframes.session.Session.close``/ + ``bigframes.pandas.close_session``/ + ``bigframes.pandas.reset_session``/ + ``bigframes.pandas.clean_up_by_session_id``) does not clean up + the function, and leaves it for the user to manage the function + and the associated cloud function directly. + packages (str[], Optional): + Explicit name of the external package dependencies. Each dependency + is added to the `requirements.txt` as is, and can be of the form + supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. + cloud_function_service_account (str, Optional): + Service account to use for the cloud functions. If not provided then + the default service account would be used. See + https://cloud.google.com/functions/docs/securing/function-identity + for more details. Please make sure the service account has the + necessary IAM permissions configured as described in + https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. + cloud_function_kms_key_name (str, Optional): + Customer managed encryption key to protect cloud functions and + related data at rest. This is of the format + projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY. 
+ Read https://cloud.google.com/functions/docs/securing/cmek for + more details including granting necessary service accounts + access to the key. + cloud_function_docker_repository (str, Optional): + Docker repository created with the same encryption key as + `cloud_function_kms_key_name` to store encrypted artifacts + created to support the cloud function. This is of the format + projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. + For more details see + https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. + max_batching_rows (int, Optional): + The maximum number of rows to be batched for processing in the + BQ remote function. Default value is 1000. A lower number can be + passed to avoid timeouts in case the user code is too complex to + process large number of rows fast enough. A higher number can be + used to increase throughput in case the user code is fast enough. + `None` can be passed to let BQ remote functions service apply + default batching. See for more details + https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. + cloud_function_timeout (int, Optional): + The maximum amount of time (in seconds) BigQuery should wait for + the cloud function to return a response. See for more details + https://cloud.google.com/functions/docs/configuring/timeout. + Please note that even though the cloud function (2nd gen) itself + allows seeting up to 60 minutes of timeout, BigQuery remote + function can wait only up to 20 minutes, see for more details + https://cloud.google.com/bigquery/quotas#remote_function_limits. + By default BigQuery DataFrames uses a 10 minute timeout. `None` + can be passed to let the cloud functions default timeout take effect. + cloud_function_max_instances (int, Optional): + The maximumm instance count for the cloud function created. This + can be used to control how many cloud function instances can be + active at max at any given point of time. Lower setting can help + control the spike in the billing. Higher setting can help + support processing larger scale data. When not specified, cloud + function's default setting applies. For more details see + https://cloud.google.com/functions/docs/configuring/max-instances. + cloud_function_vpc_connector (str, Optional): + The VPC connector you would like to configure for your cloud + function. This is useful if your code needs access to data or + service(s) that are on a VPC network. See for more details + https://cloud.google.com/functions/docs/networking/connecting-vpc. + cloud_function_memory_mib (int, Optional): + The amounts of memory (in mebibytes) to allocate for the cloud + function (2nd gen) created. This also dictates a corresponding + amount of allocated CPU for the function. By default a memory of + 1024 MiB is set for the cloud functions created to support + BigQuery DataFrames remote function. If you want to let the + default memory of cloud functions be allocated, pass `None`. See + for more details + https://cloud.google.com/functions/docs/configuring/memory. 
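
A minimal usage sketch of the decorator documented above — illustrative only, not part of this patch. It assumes the prerequisite APIs, IAM roles and BigQuery connection are already configured, and the function body, column names and the `humanize` package are placeholders chosen for the example.

    import bigframes.pandas as bpd

    # With type annotations present, input_types/output_type can be omitted.
    # reuse=False deploys fresh artifacts whose names embed the session id,
    # a hash of the user code and requirements, and a short random suffix;
    # leaving `name` unset keeps them session-managed, so Session.close()
    # (or clean_up_by_session_id) can later delete both the BQ routine and
    # the backing cloud function.
    @bpd.remote_function(reuse=False, packages=["humanize"])
    def readable_size(num_bytes: int) -> str:
        import humanize

        return humanize.naturalsize(num_bytes)

    df = bpd.DataFrame({"size_bytes": [1024, 1048576]})
    df["size"] = df["size_bytes"].apply(readable_size)
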
+ """ + # Some defaults may be used from the session if not provided otherwise + import bigframes.exceptions as bf_exceptions + import bigframes.pandas as bpd + import bigframes.series as bf_series + import bigframes.session + + session = cast(bigframes.session.Session, session or bpd.get_global_session()) + + # A BigQuery client is required to perform BQ operations + if not bigquery_client: + bigquery_client = session.bqclient + if not bigquery_client: + raise ValueError( + "A bigquery client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A BigQuery connection client is required to perform BQ connection operations + if not bigquery_connection_client: + bigquery_connection_client = session.bqconnectionclient + if not bigquery_connection_client: + raise ValueError( + "A bigquery connection client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A cloud functions client is required to perform cloud functions operations + if not cloud_functions_client: + cloud_functions_client = session.cloudfunctionsclient + if not cloud_functions_client: + raise ValueError( + "A cloud functions client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A resource manager client is required to get/set IAM operations + if not resource_manager_client: + resource_manager_client = session.resourcemanagerclient + if not resource_manager_client: + raise ValueError( + "A resource manager client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # BQ remote function must be persisted, for which we need a dataset + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#:~:text=You%20cannot%20create%20temporary%20remote%20functions. + if dataset: + dataset_ref = bigquery.DatasetReference.from_string( + dataset, default_project=bigquery_client.project + ) else: - signature_kwargs = {} + dataset_ref = session._anonymous_dataset - signature = inspect.signature( - func, - **signature_kwargs, + bq_location, cloud_function_region = get_remote_function_locations( + bigquery_client.location ) - # Try to get input types via type annotations. - if input_types is None: - input_types = [] - for parameter in signature.parameters.values(): - if (param_type := parameter.annotation) is inspect.Signature.empty: + # A connection is required for BQ remote function + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function + if not bigquery_connection: + bigquery_connection = session._bq_connection # type: ignore + + bigquery_connection = clients.resolve_full_bq_connection_name( + bigquery_connection, + default_project=dataset_ref.project, + default_location=bq_location, + ) + # Guaranteed to be the form of .. + ( + gcp_project_id, + bq_connection_location, + bq_connection_id, + ) = bigquery_connection.split(".") + if gcp_project_id.casefold() != dataset_ref.project.casefold(): + raise ValueError( + "The project_id does not match BigQuery connection gcp_project_id: " + f"{dataset_ref.project}." + ) + if bq_connection_location.casefold() != bq_location.casefold(): + raise ValueError( + "The location does not match BigQuery connection location: " + f"{bq_location}." 
+ ) + + # If any CMEK is intended then check that a docker repository is also specified + if ( + cloud_function_kms_key_name is not None + and cloud_function_docker_repository is None + ): + raise ValueError( + "cloud_function_docker_repository must be specified with cloud_function_kms_key_name." + " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin" + ) + + bq_connection_manager = session.bqconnectionmanager + + def wrapper(func): + nonlocal input_types, output_type + + if not callable(func): + raise TypeError("f must be callable, got {}".format(func)) + + if sys.version_info >= (3, 10): + # Add `eval_str = True` so that deferred annotations are turned into their + # corresponding type objects. Need Python 3.10 for eval_str parameter. + # https://docs.python.org/3/library/inspect.html#inspect.signature + signature_kwargs: Mapping[str, Any] = {"eval_str": True} + else: + signature_kwargs = {} + + signature = inspect.signature( + func, + **signature_kwargs, + ) + + # Try to get input types via type annotations. + if input_types is None: + input_types = [] + for parameter in signature.parameters.values(): + if (param_type := parameter.annotation) is inspect.Signature.empty: + raise ValueError( + "'input_types' was not set and parameter " + f"'{parameter.name}' is missing a type annotation. " + "Types are required to use @remote_function." + ) + input_types.append(param_type) + elif not isinstance(input_types, collections.abc.Sequence): + input_types = [input_types] + + if output_type is None: + if ( + output_type := signature.return_annotation + ) is inspect.Signature.empty: raise ValueError( - "'input_types' was not set and parameter " - f"'{parameter.name}' is missing a type annotation. " - "Types are required to use @remote_function." + "'output_type' was not set and function is missing a " + "return type annotation. Types are required to use " + "@remote_function." ) - input_types.append(param_type) - elif not isinstance(input_types, collections.abc.Sequence): - input_types = [input_types] - if output_type is None: - if (output_type := signature.return_annotation) is inspect.Signature.empty: - raise ValueError( - "'output_type' was not set and function is missing a " - "return type annotation. Types are required to use " - "@remote_function." + # The function will actually be receiving a pandas Series, but allow both + # BigQuery DataFrames and pandas object types for compatibility. + is_row_processor = False + if len(input_types) == 1 and ( + (input_type := input_types[0]) == bf_series.Series + or input_type == pandas.Series + ): + warnings.warn( + "input_types=Series is in preview.", + stacklevel=1, + category=bf_exceptions.PreviewWarning, ) - # The function will actually be receiving a pandas Series, but allow both - # BigQuery DataFrames and pandas object types for compatibility. 
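
A sketch of the Series (row processor) input path handled above — again illustrative only, with hypothetical column names. Each row is shipped to the cloud function as a serialized payload and rebuilt there as a pandas Series.

    import pandas as pd

    import bigframes.pandas as bpd

    # Annotating the single parameter as a Series selects the row-processor
    # path (in preview); the function then receives one whole row per call.
    @bpd.remote_function(reuse=False)
    def describe_row(row: pd.Series) -> str:
        return f"{row['name']} is {row['age']} years old"

    df = bpd.DataFrame({"name": ["alice", "bob"], "age": [30, 40]})
    df["summary"] = df.apply(describe_row, axis=1)
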
- is_row_processor = False - if len(input_types) == 1 and ( - (input_type := input_types[0]) == bf_series.Series - or input_type == pandas.Series - ): - warnings.warn( - "input_types=Series is in preview.", - stacklevel=1, - category=bf_exceptions.PreviewWarning, + # we will model the row as a json serialized string containing the data + # and the metadata representing the row + input_types = [str] + is_row_processor = True + elif isinstance(input_types, type): + input_types = [input_types] + + # TODO(b/340898611): fix type error + ibis_signature = ibis_signature_from_python_signature( + signature, input_types, output_type # type: ignore ) - # we will model the row as a json serialized string containing the data - # and the metadata representing the row - input_types = [str] - is_row_processor = True - elif isinstance(input_types, type): - input_types = [input_types] + remote_function_client = RemoteFunctionClient( + dataset_ref.project, + cloud_function_region, + cloud_functions_client, + bq_location, + dataset_ref.dataset_id, + bigquery_client, + bq_connection_id, + bq_connection_manager, + cloud_function_service_account, + cloud_function_kms_key_name, + cloud_function_docker_repository, + session=session, # type: ignore + ) - # TODO(b/340898611): fix type error - ibis_signature = ibis_signature_from_python_signature( - signature, input_types, output_type # type: ignore - ) + # In the unlikely case where the user is trying to re-deploy the same + # function, cleanup the attributes we add below, first. This prevents + # the pickle from having dependencies that might not otherwise be + # present such as ibis or pandas. + def try_delattr(attr): + try: + delattr(func, attr) + except AttributeError: + pass + + try_delattr("bigframes_cloud_function") + try_delattr("bigframes_remote_function") + try_delattr("output_dtype") + try_delattr("ibis_node") + + ( + rf_name, + cf_name, + created_new, + ) = remote_function_client.provision_bq_remote_function( + func, + input_types=tuple( + third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) + for type_ in ibis_signature.input_types + ), + output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( + ibis_signature.output_type + ), + reuse=reuse, + name=name, + package_requirements=packages, + max_batching_rows=max_batching_rows, + cloud_function_timeout=cloud_function_timeout, + cloud_function_max_instance_count=cloud_function_max_instances, + is_row_processor=is_row_processor, + cloud_function_vpc_connector=cloud_function_vpc_connector, + cloud_function_memory_mib=cloud_function_memory_mib, + ) - remote_function_client = RemoteFunctionClient( - dataset_ref.project, - cloud_function_region, - cloud_functions_client, - bq_location, - dataset_ref.dataset_id, - bigquery_client, - bq_connection_id, - bq_connection_manager, - cloud_function_service_account, - cloud_function_kms_key_name, - cloud_function_docker_repository, - session=session, # type: ignore - ) + # TODO: Move ibis logic to compiler step + node = ibis.udf.scalar.builtin( + func, + name=rf_name, + schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", + signature=(ibis_signature.input_types, ibis_signature.output_type), + ) + func.bigframes_cloud_function = ( + remote_function_client.get_cloud_function_fully_qualified_name(cf_name) + ) + func.bigframes_remote_function = ( + remote_function_client.get_remote_function_fully_qualilfied_name( + rf_name + ) + ) - # In the unlikely case where the user is trying to re-deploy the same - # function, cleanup the attributes we add below, first. 
This prevents - # the pickle from having dependencies that might not otherwise be - # present such as ibis or pandas. - def try_delattr(attr): - try: - delattr(func, attr) - except AttributeError: - pass - - try_delattr("bigframes_cloud_function") - try_delattr("bigframes_remote_function") - try_delattr("output_dtype") - try_delattr("ibis_node") - - rf_name, cf_name = remote_function_client.provision_bq_remote_function( - func, - input_types=tuple( - third_party_ibis_bqtypes.BigQueryType.from_ibis(type_) - for type_ in ibis_signature.input_types - ), - output_type=third_party_ibis_bqtypes.BigQueryType.from_ibis( - ibis_signature.output_type - ), - reuse=reuse, - name=name, - package_requirements=packages, - max_batching_rows=max_batching_rows, - cloud_function_timeout=cloud_function_timeout, - cloud_function_max_instance_count=cloud_function_max_instances, - is_row_processor=is_row_processor, - cloud_function_vpc_connector=cloud_function_vpc_connector, - cloud_function_memory_mib=cloud_function_memory_mib, - ) + func.output_dtype = ( + bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( + ibis_signature.output_type + ) + ) + func.ibis_node = node + + # If a new remote function was created, update the cloud artifacts + # created in the session. This would be used to clean up any + # resources in the session. Note that we need to do this only for + # the case where an explicit name was not provided by the user and + # we used an internal name. For the cases where the user provided an + # explicit name, we are assuming that the user wants to persist them + # with that name and would directly manage their lifecycle. + if created_new and (not name): + self._update_artifacts( + func.bigframes_remote_function, func.bigframes_cloud_function + ) + return func - # TODO: Move ibis logic to compiler step - node = ibis.udf.scalar.builtin( - func, - name=rf_name, - schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", - signature=(ibis_signature.input_types, ibis_signature.output_type), - ) - func.bigframes_cloud_function = ( - remote_function_client.get_cloud_function_fully_qualified_name(cf_name) - ) - func.bigframes_remote_function = str(dataset_ref.routine(rf_name)) # type: ignore + return wrapper + + +def remote_function(*args, **kwargs): + remote_function_session = _RemoteFunctionSession() + return remote_function_session.remote_function(*args, **kwargs) - func.output_dtype = ( - bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype( - ibis_signature.output_type - ) - ) - func.ibis_node = node - return func - return wrapper +remote_function.__doc__ = _RemoteFunctionSession.remote_function.__doc__ def read_gbq_function( diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index faba0f3aa3..eb990d2393 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -63,6 +63,7 @@ import bigframes.core.tools import bigframes.dataframe import bigframes.enums +import bigframes.functions.remote_function as bigframes_rf import bigframes.operations as ops import bigframes.series import bigframes.session @@ -768,8 +769,11 @@ def clean_up_by_session_id( location: Optional[str] = None, project: Optional[str] = None, ) -> None: - """Searches through table names in BigQuery and deletes tables - found matching the expected format. + """Searches through BigQuery tables and routines and deletes the ones + created during the session with the given session id. The match is + determined by having the session id present in the resource name or + metadata. 
The cloud functions serving the cleaned up routines are also + cleaned up. This could be useful if the session object has been lost. Calling `session.close()` or `bigframes.pandas.close_session()` @@ -794,7 +798,6 @@ def clean_up_by_session_id( None """ session = get_global_session() - client = session.bqclient if (location is None) != (project is None): raise ValueError( @@ -804,14 +807,18 @@ def clean_up_by_session_id( dataset = session._anonymous_dataset else: dataset = bigframes.session._io.bigquery.create_bq_dataset_reference( - client, + session.bqclient, location=location, project=project, api_name="clean_up_by_session_id", ) bigframes.session._io.bigquery.delete_tables_matching_session_id( - client, dataset, session_id + session.bqclient, dataset, session_id + ) + + bigframes_rf._clean_up_by_session_id( + session.bqclient, session.cloudfunctionsclient, dataset, session_id ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 0f7953d3d4..10c0797873 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -95,8 +95,7 @@ import bigframes.dtypes import bigframes.exceptions import bigframes.formatting_helpers as formatting_helpers -from bigframes.functions.remote_function import read_gbq_function as bigframes_rgf -from bigframes.functions.remote_function import remote_function as bigframes_rf +import bigframes.functions.remote_function as bigframes_rf import bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table import bigframes.session.clients @@ -306,6 +305,8 @@ def __init__( else bigframes.enums.DefaultIndexKind.NULL ) + self._remote_function_session = bigframes_rf._RemoteFunctionSession() + @property def bqclient(self): return self._clients_provider.bqclient @@ -383,7 +384,7 @@ def __hash__(self): # Stable hash needed to use in expression tree return hash(str(self._anonymous_dataset)) - def close(self): + def _clean_up_tables(self): """Delete tables that were created with this session's session_id.""" client = self.bqclient project_id = self._anonymous_dataset.project @@ -393,6 +394,15 @@ def close(self): full_id = ".".join([project_id, dataset_id, table_id]) client.delete_table(full_id, not_found_ok=True) + def close(self): + """Delete resources that were created with this session's session_id. + This includes BigQuery tables, remote functions and cloud functions + serving the remote functions""" + self._clean_up_tables() + self._remote_function_session.clean_up( + self.bqclient, self.cloudfunctionsclient, self.session_id + ) + def read_gbq( self, query_or_table: str, @@ -1613,7 +1623,13 @@ def remote_function( Explicit name of the persisted BigQuery remote function. Use it with caution, because two users working in the same project and dataset could overwrite each other's remote functions if they use the same - persistent name. + persistent name. When an explicit name is provided, any session + specific clean up (``bigframes.session.Session.close``/ + ``bigframes.pandas.close_session``/ + ``bigframes.pandas.reset_session``/ + ``bigframes.pandas.clean_up_by_session_id``) does not clean up + the function, and leaves it for the user to manage the function + and the associated cloud function directly. packages (str[], Optional): Explicit name of the external package dependencies. 
Each dependency is added to the `requirements.txt` as is, and can be of the form @@ -1689,7 +1705,7 @@ def remote_function( `bigframes_remote_function` - The bigquery remote function capable of calling into `bigframes_cloud_function`. """ - return bigframes_rf( + return self._remote_function_session.remote_function( input_types, output_type, session=self, @@ -1769,7 +1785,7 @@ def read_gbq_function( not including the `bigframes_cloud_function` property. """ - return bigframes_rgf( + return bigframes_rf.read_gbq_function( function_name=function_name, session=self, ) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index ef8b9811df..303c74f1fd 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -22,13 +22,14 @@ import textwrap import google.api_core.exceptions -from google.cloud import bigquery, storage +from google.cloud import bigquery, functions_v2, storage import pandas import pytest import test_utils.prefixer import bigframes -from bigframes.functions.remote_function import get_cloud_function_name +import bigframes.functions.remote_function as bigframes_rf +import bigframes.pandas as bpd import bigframes.series from tests.system.utils import ( assert_pandas_df_equal, @@ -590,7 +591,11 @@ def add_one(x): add_one_uniq, add_one_uniq_dir = make_uniq_udf(add_one) # Expected cloud function name for the unique udf - add_one_uniq_cf_name, _ = get_cloud_function_name(add_one_uniq) + package_requirements = bigframes_rf._get_updated_package_requirements() + add_one_uniq_hash = bigframes_rf._get_hash(add_one_uniq, package_requirements) + add_one_uniq_cf_name = bigframes_rf.get_cloud_function_name( + add_one_uniq_hash, session.session_id + ) # There should be no cloud function yet for the unique udf cloud_functions = list( @@ -1860,3 +1865,142 @@ def test_remote_function_gcf_memory_unsupported(session, memory_mib): @session.remote_function(reuse=False, cloud_function_memory_mib=memory_mib) def square(x: int) -> int: return x * x + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_unnamed_removed_w_session_cleanup(): + # create a clean session + session = bigframes.connect() + + # create an unnamed remote function in the session + @session.remote_function(reuse=False) + def foo(x: int) -> int: + return x + 1 + + # ensure that remote function artifacts are created + assert foo.bigframes_remote_function is not None + session.bqclient.get_routine(foo.bigframes_remote_function) is not None + assert foo.bigframes_cloud_function is not None + session.cloudfunctionsclient.get_function( + name=foo.bigframes_cloud_function + ) is not None + + # explicitly close the session + session.close() + + # ensure that the bq remote function is deleted + with pytest.raises(google.cloud.exceptions.NotFound): + session.bqclient.get_routine(foo.bigframes_remote_function) + + # the deletion of cloud function happens in a non-blocking way, ensure that + # it either exists in a being-deleted state, or is already deleted + try: + gcf = session.cloudfunctionsclient.get_function( + name=foo.bigframes_cloud_function + ) + assert gcf.state is functions_v2.Function.State.DELETING + except google.cloud.exceptions.NotFound: + pass + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_named_perists_w_session_cleanup(): + try: + # create a clean session + session = bigframes.connect() + + # create a name for the remote function + name = test_utils.prefixer.Prefixer("bigframes", 
"").create_prefix() + + # create an unnamed remote function in the session + @session.remote_function(name=name) + def foo(x: int) -> int: + return x + 1 + + # ensure that remote function artifacts are created + assert foo.bigframes_remote_function is not None + session.bqclient.get_routine(foo.bigframes_remote_function) is not None + assert foo.bigframes_cloud_function is not None + session.cloudfunctionsclient.get_function( + name=foo.bigframes_cloud_function + ) is not None + + # explicitly close the session + session.close() + + # ensure that the bq remote function still exists + session.bqclient.get_routine(foo.bigframes_remote_function) is not None + + # the deletion of cloud function happens in a non-blocking way, ensure + # that it was not deleted and still exists in active state + gcf = session.cloudfunctionsclient.get_function( + name=foo.bigframes_cloud_function + ) + assert gcf.state is functions_v2.Function.State.ACTIVE + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, foo + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_clean_up_by_session_id(): + # Use a brand new session to avoid conflict with other tests + session = bigframes.Session() + session_id = session.session_id + try: + # we will create remote functions, one with explicit name and another + # without it, and later confirm that the former is deleted when the session + # is cleaned up by session id, but the latter remains + ## unnamed + @session.remote_function(reuse=False) + def foo_unnamed(x: int) -> int: + return x + 1 + + ## named + rf_name = test_utils.prefixer.Prefixer("bigframes", "").create_prefix() + + @session.remote_function(reuse=False, name=rf_name) + def foo_named(x: int) -> int: + return x + 2 + + # check that BQ remote functiosn were created with corresponding cloud + # functions + for foo in [foo_unnamed, foo_named]: + assert foo.bigframes_remote_function is not None + session.bqclient.get_routine(foo.bigframes_remote_function) is not None + assert foo.bigframes_cloud_function is not None + session.cloudfunctionsclient.get_function( + name=foo.bigframes_cloud_function + ) is not None + + # clean up using explicit session id + bpd.clean_up_by_session_id( + session_id, location=session._location, project=session._project + ) + + # ensure that the unnamed bq remote function is deleted along with its + # corresponding cloud function + with pytest.raises(google.cloud.exceptions.NotFound): + session.bqclient.get_routine(foo_unnamed.bigframes_remote_function) + try: + gcf = session.cloudfunctionsclient.get_function( + name=foo_unnamed.bigframes_cloud_function + ) + assert gcf.state is functions_v2.Function.State.DELETING + except google.cloud.exceptions.NotFound: + pass + + # ensure that the named bq remote function still exists along with its + # corresponding cloud function + session.bqclient.get_routine(foo_named.bigframes_remote_function) is not None + gcf = session.cloudfunctionsclient.get_function( + name=foo_named.bigframes_cloud_function + ) + assert gcf.state is functions_v2.Function.State.ACTIVE + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, foo_named + ) diff --git a/tests/system/large/test_session.py b/tests/system/large/test_session.py index c7a19dc26e..2b82d0133b 100644 --- a/tests/system/large/test_session.py +++ b/tests/system/large/test_session.py @@ 
-19,6 +19,7 @@ import bigframes import bigframes.pandas as bpd +import bigframes.session._io.bigquery @pytest.mark.parametrize( @@ -93,8 +94,7 @@ def test_clean_up_by_session_id(): session_id = session.session_id # we will create two tables and confirm that they are deleted - # when the session is closed by id - + # when the session is cleaned up by id bqclient = session.bqclient dataset = session._anonymous_dataset expiration = ( @@ -110,9 +110,7 @@ def test_clean_up_by_session_id(): max_results=bigframes.session._io.bigquery._LIST_TABLES_LIMIT, page_size=bigframes.session._io.bigquery._LIST_TABLES_LIMIT, ) - assert any( - [(session.session_id in table.full_table_id) for table in list(tables_before)] - ) + assert any([(session.session_id in table.full_table_id) for table in tables_before]) bpd.clean_up_by_session_id( session_id, location=session._location, project=session._project @@ -125,5 +123,5 @@ def test_clean_up_by_session_id(): page_size=bigframes.session._io.bigquery._LIST_TABLES_LIMIT, ) assert not any( - [(session.session_id in table.full_table_id) for table in list(tables_after)] + [(session.session_id in table.full_table_id) for table in tables_after] ) From ab0dabcf83a1b25de3afc0e950d12d7c55ca0af4 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 16 Jul 2024 10:33:48 -0700 Subject: [PATCH 13/36] test: Fix json_set error tests to create resources in test body (#845) --- tests/system/small/bigquery/test_json.py | 41 ++++++++++-------------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index ff759b8fda..9e0c06e0bd 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -85,18 +85,18 @@ def test_json_set_w_more_pairs(): ) -@pytest.mark.parametrize( - ("series", "json_path_value_pairs"), - [ - pytest.param( - _get_series_from_json([{"a": 10}]), - [("$.a", 1, 100)], - id="invalid_json_path_value_pairs", - marks=pytest.mark.xfail(raises=ValueError), - ), - pytest.param( +def test_json_set_w_invalid_json_path_value_pairs(): + with pytest.raises(ValueError): + bbq.json_set( + _get_series_from_json([{"a": 10}]), json_path_value_pairs=[("$.a", 1, 100)] # type: ignore + ) + + +def test_json_set_w_invalid_value_type(): + with pytest.raises(TypeError): + bbq.json_set( _get_series_from_json([{"a": 10}]), - [ + json_path_value_pairs=[ ( "$.a", bpd.read_pandas( @@ -104,16 +104,9 @@ def test_json_set_w_more_pairs(): ), ) ], - id="invalid_json_value_type", - marks=pytest.mark.xfail(raises=TypeError), - ), - pytest.param( - bpd.Series([1, 2]), - [("$.a", 1)], - id="invalid_series_type", - marks=pytest.mark.xfail(raises=TypeError), - ), - ], -) -def test_json_set_w_invalid(series, json_path_value_pairs): - bbq.json_set(series, json_path_value_pairs=json_path_value_pairs) + ) + + +def test_json_set_w_invalid_series_type(): + with pytest.raises(TypeError): + bbq.json_set(bpd.Series([1, 2]), json_path_value_pairs=[("$.a", 1)]) From 6278e0b8fbdd3e6ff9bd008128dccf24c37d65c6 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 16 Jul 2024 12:25:40 -0700 Subject: [PATCH 14/36] test: Run several tests in unordered mode as well as ordered (#831) --- tests/system/conftest.py | 21 ++++++++ tests/system/small/test_dataframe.py | 77 +++++++++++++--------------- tests/system/utils.py | 18 +++++++ 3 files changed, 76 insertions(+), 40 deletions(-) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index df4ff9aff0..59439c306f 100644 --- 
a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -139,6 +139,16 @@ def session() -> Generator[bigframes.Session, None, None]: session.close() # close generated session at cleanup time +@pytest.fixture(scope="session", params=["ordered", "unordered"]) +def maybe_ordered_session(request) -> Generator[bigframes.Session, None, None]: + context = bigframes.BigQueryOptions( + location="US", _strictly_ordered=request.param == "ordered" + ) + session = bigframes.Session(context=context) + yield session + session.close() # close generated session at cleanup type + + @pytest.fixture(scope="session") def unordered_session() -> Generator[bigframes.Session, None, None]: context = bigframes.BigQueryOptions(location="US", _strictly_ordered=False) @@ -467,6 +477,17 @@ def scalars_dfs( return scalars_df_index, scalars_pandas_df_index +@pytest.fixture(scope="session") +def scalars_dfs_maybe_ordered( + maybe_ordered_session, + scalars_pandas_df_index, +): + return ( + maybe_ordered_session.read_pandas(scalars_pandas_df_index), + scalars_pandas_df_index, + ) + + @pytest.fixture(scope="session") def hockey_df( hockey_table_id: str, session: bigframes.Session diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 7273e2079f..67792b3a1d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -33,8 +33,10 @@ import bigframes.pandas as bpd import bigframes.series as series from tests.system.utils import ( + assert_dfs_equivalent, assert_pandas_df_equal, assert_series_equal, + assert_series_equivalent, skip_legacy_pandas, ) @@ -75,7 +77,7 @@ def test_df_construct_large_strings(): pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) -def test_df_construct_pandas_load_job(scalars_dfs): +def test_df_construct_pandas_load_job(scalars_dfs_maybe_ordered): # This should trigger the inlined codepath columns = [ "int64_too", @@ -91,10 +93,10 @@ def test_df_construct_pandas_load_job(scalars_dfs): "timestamp_col", "geography_col", ] - _, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns).to_pandas() + _, scalars_pandas_df = scalars_dfs_maybe_ordered + bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns) pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) - pandas.testing.assert_frame_equal(bf_result, pd_result) + assert_dfs_equivalent(pd_result, bf_result) def test_df_construct_pandas_set_dtype(scalars_dfs): @@ -112,17 +114,17 @@ def test_df_construct_pandas_set_dtype(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) -def test_df_construct_from_series(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs +def test_df_construct_from_series(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered bf_result = dataframe.DataFrame( {"a": scalars_df["int64_col"], "b": scalars_df["string_col"]}, dtype="string[pyarrow]", - ).to_pandas() + ) pd_result = pd.DataFrame( {"a": scalars_pandas_df["int64_col"], "b": scalars_pandas_df["string_col"]}, dtype="string[pyarrow]", ) - pandas.testing.assert_frame_equal(bf_result, pd_result) + assert_dfs_equivalent(pd_result, bf_result) def test_df_construct_from_dict(): @@ -505,8 +507,8 @@ def test_rename(scalars_dfs): ) -def test_df_peek(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs +def test_df_peek(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered peek_result = scalars_df.peek(n=3, force=False) 
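
The maybe_ordered fixtures added above run each test against both an ordered and an unordered session. A standalone sketch of the same idea, assuming valid credentials and a US location as in the fixtures:

    import pandas as pd

    import bigframes

    # _strictly_ordered=False is the internal flag used by the test fixtures;
    # without strict ordering, result row order is not guaranteed, so
    # comparisons should sort before asserting equality.
    options = bigframes.BigQueryOptions(location="US", _strictly_ordered=False)
    session = bigframes.Session(context=options)
    try:
        pdf = pd.DataFrame({"a": [3, 1, 2]})
        result = session.read_pandas(pdf).to_pandas()
        pd.testing.assert_frame_equal(
            result.sort_values("a").reset_index(drop=True),
            pdf.sort_values("a").reset_index(drop=True),
            check_dtype=False,
        )
    finally:
        session.close()
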
pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) assert len(peek_result) == 3 @@ -1709,14 +1711,14 @@ def test_sort_index(scalars_dfs, ascending, na_position): pandas.testing.assert_frame_equal(bf_result, pd_result) -def test_df_abs(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs +def test_df_abs(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered columns = ["int64_col", "int64_too", "float64_col"] - bf_result = scalars_df[columns].abs().to_pandas() + bf_result = scalars_df[columns].abs() pd_result = scalars_pandas_df[columns].abs() - assert_pandas_df_equal(bf_result, pd_result) + assert_dfs_equivalent(pd_result, bf_result) def test_df_pos(scalars_dfs): @@ -2268,8 +2270,10 @@ def test_series_binop_add_different_table( @all_joins -def test_join_same_table(scalars_dfs, how): - bf_df, pd_df = scalars_dfs +def test_join_same_table(scalars_dfs_maybe_ordered, how): + bf_df, pd_df = scalars_dfs_maybe_ordered + if not bf_df._session._strictly_ordered and how == "cross": + pytest.skip("Cross join not supported in unordered mode.") bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] bf_df_a = bf_df_a.sort_index() @@ -2503,7 +2507,7 @@ def test_dataframe_agg_int_single_string(scalars_dfs, agg): ) -def test_dataframe_agg_multi_string(scalars_dfs): +def test_dataframe_agg_multi_string(scalars_dfs_maybe_ordered): numeric_cols = ["int64_col", "int64_too", "float64_col"] aggregations = [ "sum", @@ -2516,8 +2520,8 @@ def test_dataframe_agg_multi_string(scalars_dfs): "nunique", "count", ] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas() + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + bf_result = scalars_df[numeric_cols].agg(aggregations) pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) # Pandas may produce narrower numeric types, but bigframes always produces Float64 @@ -2528,7 +2532,7 @@ def test_dataframe_agg_multi_string(scalars_dfs): bf_result = bf_result.drop(labels=["median"]) pd_result = pd_result.drop(labels=["median"]) - pd.testing.assert_frame_equal(pd_result, bf_result, check_index_type=False) + assert_dfs_equivalent(pd_result, bf_result, check_index_type=False) # Double-check that median is at least plausible. 
assert ( @@ -3205,13 +3209,6 @@ def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) @pytest.mark.parametrize( ("op", "bf_dtype"), [ @@ -3226,12 +3223,11 @@ def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col ], ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"], ) -def test_dataframe_aggregates( - scalars_df_index, scalars_pandas_df_index, op, bf_dtype, ordered -): +def test_dataframe_aggregates(scalars_dfs_maybe_ordered, op, bf_dtype): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] bf_series = op(scalars_df_index[col_names]) - bf_result = bf_series.to_pandas(ordered=ordered) + bf_result = bf_series pd_result = op(scalars_pandas_df_index[col_names]) # Check dtype separately @@ -3240,12 +3236,11 @@ def test_dataframe_aggregates( # Pandas may produce narrower numeric types, but bigframes always produces Float64 # Pandas has object index type pd_result.index = pd_result.index.astype("string[pyarrow]") - assert_series_equal( + assert_series_equivalent( pd_result, bf_result, check_dtype=False, check_index_type=False, - ignore_order=not ordered, ) @@ -3597,16 +3592,17 @@ def test_df_rows_filter_regex(scalars_df_index, scalars_pandas_df_index): ) -def test_df_reindex_rows_list(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1]).to_pandas() +def test_df_reindex_rows_list(scalars_dfs_maybe_ordered): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered + bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1]) pd_result = scalars_pandas_df_index.reindex(index=[5, 1, 3, 99, 1]) # Pandas uses int64 instead of Int64 (nullable) dtype. 
pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal( - bf_result, + assert_dfs_equivalent( pd_result, + bf_result, ) @@ -3861,7 +3857,8 @@ def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): ) -def test_loc_list_multiindex(scalars_df_index, scalars_pandas_df_index): +def test_loc_list_multiindex(scalars_dfs_maybe_ordered): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( ["string_col", "int64_col"] @@ -3871,9 +3868,9 @@ def test_loc_list_multiindex(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_multiindex.loc[index_list] pd_result = scalars_pandas_df_multiindex.loc[index_list] - pd.testing.assert_frame_equal( - bf_result.to_pandas(), + assert_dfs_equivalent( pd_result, + bf_result, ) diff --git a/tests/system/utils.py b/tests/system/utils.py index ab4c2c119f..9fbf191a3a 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -27,6 +27,7 @@ import pytest from bigframes.functions import remote_function +import bigframes.pandas ML_REGRESSION_METRICS = [ "mean_absolute_error", @@ -56,6 +57,23 @@ def wrapper(*args, **kwds): return wrapper +# Prefer this function for tests that run in both ordered and unordered mode +def assert_dfs_equivalent( + pd_df: pd.DataFrame, bf_df: bigframes.pandas.DataFrame, **kwargs +): + bf_df_local = bf_df.to_pandas() + ignore_order = not bf_df._session._strictly_ordered + assert_pandas_df_equal(bf_df_local, pd_df, ignore_order=ignore_order, **kwargs) + + +def assert_series_equivalent( + pd_series: pd.Series, bf_series: bigframes.pandas.Series, **kwargs +): + bf_df_local = bf_series.to_pandas() + ignore_order = not bf_series._session._strictly_ordered + assert_series_equal(bf_df_local, pd_series, ignore_order=ignore_order, **kwargs) + + def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): if ignore_order: # Sort by a column to get consistent results. From c9eaff0a1a0731b28f4c67bca5606db12a47c8c0 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 16 Jul 2024 12:26:41 -0700 Subject: [PATCH 15/36] feat: Add bigframes-mode label to query jobs (#832) --- bigframes/session/__init__.py | 2 ++ tests/system/small/test_unordered.py | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 10c0797873..7cc862b93d 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1849,6 +1849,8 @@ def _start_query( Starts BigQuery query job and waits for results. 
""" job_config = self._prepare_query_job_config(job_config) + if not self._strictly_ordered: + job_config.labels = {"bigframes-mode": "unordered"} try: return bigframes.session._io.bigquery.start_query_with_client( self, diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 4448ddc838..7220d34d55 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -20,6 +20,15 @@ from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +def test_unordered_mode_job_label(unordered_session): + pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) + df = bpd.DataFrame(pd_df, session=unordered_session) + df.to_pandas() + job_labels = df.query_job.labels # type:ignore + assert "bigframes-mode" in job_labels + assert job_labels["bigframes-mode"] == "unordered" + + def test_unordered_mode_cache_aggregate(unordered_session): pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) df = bpd.DataFrame(pd_df, session=unordered_session) From ff2faedb133ce5f1059accbfd3351089b19895d1 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 17 Jul 2024 18:46:55 -0700 Subject: [PATCH 16/36] test: disable unavailable BQML remote vertex model tests (#848) --- noxfile.py | 3 ++- tests/system/small/ml/test_core.py | 1 + tests/system/small/ml/test_remote.py | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index b38bcacfdb..d69c16e69c 100644 --- a/noxfile.py +++ b/noxfile.py @@ -429,7 +429,8 @@ def cover(session): "--show-missing", "--include=tests/unit/*", "--include=tests/system/small/*", - "--fail-under=100", + # TODO(b/353775058) resume coverage to 100 when the issue is fixed. + "--fail-under=99", ) session.run("coverage", "erase") diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index c505057d7b..95719ea0db 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -312,6 +312,7 @@ def test_model_detect_anomalies( ) +@pytest.mark.skip("b/353775058 BQML internal error") def test_remote_model_predict( bqml_linear_remote_model: core.BqmlModel, new_penguins_df ): diff --git a/tests/system/small/ml/test_remote.py b/tests/system/small/ml/test_remote.py index 5036cdadfc..c52c452244 100644 --- a/tests/system/small/ml/test_remote.py +++ b/tests/system/small/ml/test_remote.py @@ -13,10 +13,12 @@ # limitations under the License. 
import pandas as pd +import pytest from bigframes.ml import remote +@pytest.mark.skip("b/353775058 BQML internal error") def test_remote_linear_vertex_model_predict( linear_remote_vertex_model: remote.VertexAIModel, new_penguins_df ): From 83f254ab4a7f089e0a71e4a05a457319ab8b1df1 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 18 Jul 2024 11:04:31 -0700 Subject: [PATCH 17/36] refactor: Allow unambiguous windows even in unstrictly ordered sessions (#849) Co-authored-by: Huan Chen <142538604+Genesis929@users.noreply.github.com> --- bigframes/core/__init__.py | 4 ++-- bigframes/core/nodes.py | 36 ++++++++++++++++++++++++++++ tests/system/small/test_unordered.py | 7 ++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index cfe8f29327..bce0f059eb 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -194,7 +194,7 @@ def promote_offsets(self, col_id: str) -> ArrayValue: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. """ - if not self.session._strictly_ordered: + if self.node.order_ambiguous and not self.session._strictly_ordered: raise ValueError("Generating offsets not supported in unordered mode") return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id)) @@ -346,7 +346,7 @@ def project_window_op( """ # TODO: Support non-deterministic windowing if window_spec.row_bounded or not op.order_independent: - if not self.session._strictly_ordered: + if self.node.order_ambiguous and not self.session._strictly_ordered: raise ValueError( "Order-dependent windowed ops not supported in unordered mode" ) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index aadab9f5cc..a979e07972 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -127,6 +127,14 @@ def joins(self) -> bool: """ return False + @property + @abc.abstractmethod + def order_ambiguous(self) -> bool: + """ + Whether row ordering is potentially ambiguous. For example, ReadTable (without a primary key) could be ordered in different ways. + """ + ... + @functools.cached_property def total_variables(self) -> int: return self.variables_introduced + sum( @@ -177,6 +185,10 @@ def transform_children( ) -> BigFrameNode: return replace(self, child=t(self.child)) + @property + def order_ambiguous(self) -> bool: + return self.child.order_ambiguous + @dataclass(frozen=True) class JoinNode(BigFrameNode): @@ -196,6 +208,10 @@ def non_local(self) -> bool: def child_nodes(self) -> typing.Sequence[BigFrameNode]: return (self.left_child, self.right_child) + @property + def order_ambiguous(self) -> bool: + return True + def __hash__(self): return self._node_hash @@ -247,6 +263,10 @@ def __post_init__(self): def child_nodes(self) -> typing.Sequence[BigFrameNode]: return self.children + @property + def order_ambiguous(self) -> bool: + return any(child.order_ambiguous for child in self.children) + def __hash__(self): return self._node_hash @@ -293,6 +313,10 @@ def variables_introduced(self) -> int: """Defines the number of variables generated by the current node. 
Used to estimate query planning complexity.""" return len(self.schema.items) + 1 + @property + def order_ambiguous(self) -> bool: + return False + def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] ) -> BigFrameNode: @@ -350,6 +374,10 @@ def relation_ops_created(self) -> int: # Assume worst case, where readgbq actually has baked in analytic operation to generate index return 3 + @property + def order_ambiguous(self) -> bool: + return len(self.total_order_cols) == 0 + @functools.cached_property def variables_introduced(self) -> int: return len(self.schema.items) + 1 @@ -417,6 +445,10 @@ def hidden_columns(self) -> typing.Tuple[str, ...]: if col not in self.schema.names ) + @property + def order_ambiguous(self) -> bool: + return not isinstance(self.ordering, orderings.TotalOrdering) + def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] ) -> BigFrameNode: @@ -600,6 +632,10 @@ def schema(self) -> schemata.ArraySchema: def variables_introduced(self) -> int: return len(self.aggregations) + len(self.by_column_ids) + @property + def order_ambiguous(self) -> bool: + return False + @dataclass(frozen=True) class WindowOpNode(UnaryNode): diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 7220d34d55..b1c836e1c0 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -40,6 +40,13 @@ def test_unordered_mode_cache_aggregate(unordered_session): assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) +def test_unordered_mode_single_aggregate(unordered_session): + pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) + bf_df = bpd.DataFrame(pd_df, session=unordered_session) + + assert bf_df.a.mean() == pd_df.a.mean() + + def test_unordered_mode_print(unordered_session): pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) df = bpd.DataFrame(pd_df, session=unordered_session).cache() From 827007c4590158fb83cf0bf04696cf6f4fde8f65 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 18 Jul 2024 12:40:15 -0700 Subject: [PATCH 18/36] test: re-enable gemini tuning load test (#846) * test: re-enable gemini tuning load test * remove commented out --- tests/system/load/test_llm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index 9b8868bb27..6d22963a97 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -62,7 +62,6 @@ def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_ # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept -@pytest.mark.skip(reason="b/351905648. 
Credential error to be fixed.") @pytest.mark.flaky(retries=2) def test_llm_gemini_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_df): model = llm.GeminiTextGenerator(model_name="gemini-pro", max_iterations=1) From 33464947aca81d830183bc0652b15b1e73382e69 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 18 Jul 2024 13:35:10 -0700 Subject: [PATCH 19/36] refactor: Unordered mode supports user partial orders (#842) --- bigframes/core/__init__.py | 2 +- bigframes/core/blocks.py | 40 +++++++++++----- bigframes/core/compile/__init__.py | 10 +--- bigframes/core/compile/api.py | 70 +++++++++++++++------------- bigframes/core/compile/compiler.py | 3 +- bigframes/core/indexes/base.py | 4 +- bigframes/dataframe.py | 24 ++++------ bigframes/series.py | 11 ++--- bigframes/session/__init__.py | 17 +++---- tests/system/small/test_unordered.py | 14 ++++++ 10 files changed, 111 insertions(+), 84 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index bce0f059eb..aa66129572 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -147,7 +147,7 @@ def _compiled_schema(self) -> schemata.ArraySchema: def as_cached( self: ArrayValue, cache_table: google.cloud.bigquery.Table, - ordering: Optional[orderings.TotalOrdering], + ordering: Optional[orderings.RowOrdering], ) -> ArrayValue: """ Replace the node with an equivalent one that references a tabel where the value has been materialized to. diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index c2bf20076a..2d7c543678 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -498,9 +498,33 @@ def to_pandas( sampling_method: Optional[str] = None, random_state: Optional[int] = None, *, - ordered: Optional[bool] = None, + ordered: bool = True, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: - """Run query and download results as a pandas DataFrame.""" + """Run query and download results as a pandas DataFrame. + + Args: + max_download_size (int, default None): + Download size threshold in MB. If max_download_size is exceeded when downloading data + (e.g., to_pandas()), the data will be downsampled if + bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be + raised. If set to a value other than None, this will supersede the global config. + sampling_method (str, default None): + Downsampling algorithms to be chosen from, the choices are: "head": This algorithm + returns a portion of the data from the beginning. It is fast and requires minimal + computations to perform the downsampling; "uniform": This algorithm returns uniform + random samples of the data. If set to a value other than None, this will supersede + the global config. + random_state (int, default None): + The seed for the uniform downsampling algorithm. If provided, the uniform method may + take longer to execute and require more computation. If set to a value other than + None, this will supersede the global config. + ordered (bool, default True): + Determines whether the resulting pandas dataframe will be ordered. + Whether the row ordering is deterministics depends on whether session ordering is strict. 
+ + Returns: + pandas.DataFrame, QueryJob + """ if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS): raise NotImplementedError( f"The downsampling method {sampling_method} is not implemented, " @@ -517,10 +541,7 @@ def to_pandas( df, query_job = self._materialize_local( materialize_options=MaterializationOptions( - downsampling=sampling, - ordered=ordered - if ordered is not None - else self.session._strictly_ordered, + downsampling=sampling, ordered=ordered ) ) df.set_axis(self.column_labels, axis=1, copy=False) @@ -547,7 +568,7 @@ def to_pandas_batches( dtypes = dict(zip(self.index_columns, self.index.dtypes)) dtypes.update(zip(self.value_columns, self.dtypes)) _, query_job = self.session._query_to_destination( - self.session._to_sql(self.expr, ordered=self.session._strictly_ordered), + self.session._to_sql(self.expr, ordered=True), list(self.index_columns), api_name="cached", do_clustering=False, @@ -2593,10 +2614,7 @@ def to_pandas(self, *, ordered: Optional[bool] = None) -> pd.Index: index_columns = list(self._block.index_columns) expr = self._expr.select_columns(index_columns) results, _ = self.session._execute( - expr, - ordered=ordered - if (ordered is not None) - else self.session._strictly_ordered, + expr, ordered=ordered if ordered is not None else True ) df = expr.session._rows_to_dataframe(results) df = df.set_index(index_columns) diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index 4c105ed03b..964113bd7b 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -14,19 +14,13 @@ from __future__ import annotations from bigframes.core.compile.api import ( - compile_ordered, - compile_peek, - compile_raw, - compile_unordered, + SQLCompiler, test_only_ibis_inferred_schema, test_only_try_evaluate, ) __all__ = [ - "compile_peek", - "compile_unordered", - "compile_ordered", - "compile_raw", + "SQLCompiler", "test_only_try_evaluate", "test_only_ibis_inferred_schema", ] diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index 4e76d42bef..468c5522d9 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -25,38 +25,44 @@ _STRICT_COMPILER = compiler.Compiler(strict=True) -def compile_peek(node: bigframes.core.nodes.BigFrameNode, n_rows: int) -> str: - """Compile node into sql that selects N arbitrary rows, may not execute deterministically.""" - return _STRICT_COMPILER.compile_unordered_ir(node).peek_sql(n_rows) - - -def compile_unordered( - node: bigframes.core.nodes.BigFrameNode, *, col_id_overrides: Mapping[str, str] = {} -) -> str: - """Compile node into sql where rows are unsorted, and no ordering information is preserved.""" - return _STRICT_COMPILER.compile_unordered_ir(node).to_sql( - col_id_overrides=col_id_overrides - ) - - -def compile_ordered( - node: bigframes.core.nodes.BigFrameNode, *, col_id_overrides: Mapping[str, str] = {} -) -> str: - """Compile node into sql where rows are sorted with ORDER BY.""" - return _STRICT_COMPILER.compile_ordered_ir(node).to_sql( - col_id_overrides=col_id_overrides, ordered=True - ) - - -def compile_raw( - node: bigframes.core.nodes.BigFrameNode, -) -> Tuple[str, bigframes.core.ordering.TotalOrdering]: - """Compile node into sql that exposes all columns, including hidden ordering-only columns.""" - ir = _STRICT_COMPILER.compile_ordered_ir(node) - sql = ir.raw_sql() - ordering_info = ir._ordering - assert ir.has_total_order - return sql, ordering_info # type: ignore +class SQLCompiler: + def 
__init__(self, strict: bool = True): + self._compiler = compiler.Compiler(strict=strict) + + def compile_peek(self, node: bigframes.core.nodes.BigFrameNode, n_rows: int) -> str: + """Compile node into sql that selects N arbitrary rows, may not execute deterministically.""" + return self._compiler.compile_unordered_ir(node).peek_sql(n_rows) + + def compile_unordered( + self, + node: bigframes.core.nodes.BigFrameNode, + *, + col_id_overrides: Mapping[str, str] = {}, + ) -> str: + """Compile node into sql where rows are unsorted, and no ordering information is preserved.""" + return self._compiler.compile_unordered_ir(node).to_sql( + col_id_overrides=col_id_overrides + ) + + def compile_ordered( + self, + node: bigframes.core.nodes.BigFrameNode, + *, + col_id_overrides: Mapping[str, str] = {}, + ) -> str: + """Compile node into sql where rows are sorted with ORDER BY.""" + return self._compiler.compile_ordered_ir(node).to_sql( + col_id_overrides=col_id_overrides, ordered=True + ) + + def compile_raw( + self, + node: bigframes.core.nodes.BigFrameNode, + ) -> Tuple[str, bigframes.core.ordering.RowOrdering]: + """Compile node into sql that exposes all columns, including hidden ordering-only columns.""" + ir = self._compiler.compile_ordered_ir(node) + sql = ir.raw_sql() + return sql, ir._ordering def test_only_try_evaluate(node: bigframes.core.nodes.BigFrameNode): diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index da74ffeb8f..c7f8c5ab59 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -46,7 +46,8 @@ class Compiler: def compile_ordered_ir(self, node: nodes.BigFrameNode) -> compiled.OrderedIR: ir = typing.cast(compiled.OrderedIR, self.compile_node(node, True)) - assert ir.has_total_order + if self.strict: + assert ir.has_total_order return ir def compile_unordered_ir(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR: diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 696742180b..8b039707c2 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -492,9 +492,7 @@ def to_pandas(self) -> pandas.Index: pandas.Index: A pandas Index with all of the labels from this Index. """ - return self._block.index.to_pandas( - ordered=self._block.session._strictly_ordered - ) + return self._block.index.to_pandas(ordered=True) def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: return self.to_pandas().to_numpy(dtype, **kwargs) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 4dcc4414ed..dcb2fd09cb 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1192,15 +1192,14 @@ def cov(self, *, numeric_only: bool = False) -> DataFrame: def to_arrow( self, *, - ordered: Optional[bool] = None, + ordered: bool = True, ) -> pyarrow.Table: """Write DataFrame to an Arrow table / record batch. Args: - ordered (bool, default None): - Determines whether the resulting Arrow table will be deterministically ordered. - In some cases, unordered may result in a faster-executing query. If set to a value - other than None, will override Session default. + ordered (bool, default True): + Determines whether the resulting Arrow table will be ordered. + In some cases, unordered may result in a faster-executing query. Returns: pyarrow.Table: A pyarrow Table with all rows and columns of this DataFrame. 
@@ -1211,9 +1210,7 @@ def to_arrow( ) self._optimize_query_complexity() - pa_table, query_job = self._block.to_arrow( - ordered=ordered if ordered is not None else self._session._strictly_ordered, - ) + pa_table, query_job = self._block.to_arrow(ordered=ordered) self._set_internal_query_job(query_job) return pa_table @@ -1223,7 +1220,7 @@ def to_pandas( sampling_method: Optional[str] = None, random_state: Optional[int] = None, *, - ordered: Optional[bool] = None, + ordered: bool = True, ) -> pandas.DataFrame: """Write DataFrame to pandas DataFrame. @@ -1243,10 +1240,9 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. - ordered (bool, default None): - Determines whether the resulting pandas dataframe will be deterministically ordered. - In some cases, unordered may result in a faster-executing query. If set to a value - other than None, will override Session default. + ordered (bool, default True): + Determines whether the resulting pandas dataframe will be ordered. + In some cases, unordered may result in a faster-executing query. Returns: pandas.DataFrame: A pandas DataFrame with all rows and columns of this DataFrame if the @@ -1259,7 +1255,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, - ordered=ordered if ordered is not None else self._session._strictly_ordered, + ordered=ordered, ) self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) diff --git a/bigframes/series.py b/bigframes/series.py index c325783e96..8fdafe25e7 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -329,7 +329,7 @@ def to_pandas( sampling_method: Optional[str] = None, random_state: Optional[int] = None, *, - ordered: Optional[bool] = None, + ordered: bool = True, ) -> pandas.Series: """Writes Series to pandas Series. @@ -349,10 +349,9 @@ def to_pandas( The seed for the uniform downsampling algorithm. If provided, the uniform method may take longer to execute and require more computation. If set to a value other than None, this will supersede the global config. - ordered (bool, default None): - Determines whether the resulting pandas series will be deterministically ordered. - In some cases, unordered may result in a faster-executing query. If set to a value - other than None, will override Session default. + ordered (bool, default True): + Determines whether the resulting pandas series will be ordered. + In some cases, unordered may result in a faster-executing query. 
Returns: @@ -364,7 +363,7 @@ def to_pandas( max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, - ordered=ordered if ordered is not None else self._session._strictly_ordered, + ordered=ordered, ) self._set_internal_query_job(query_job) series = df.squeeze(axis=1) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 7cc862b93d..ca242d269c 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -304,6 +304,9 @@ def __init__( if context._strictly_ordered else bigframes.enums.DefaultIndexKind.NULL ) + self._compiler = bigframes.core.compile.SQLCompiler( + strict=context._strictly_ordered + ) self._remote_function_session = bigframes_rf._RemoteFunctionSession() @@ -1893,10 +1896,8 @@ def _cache_with_cluster_cols( """Executes the query and uses the resulting table to rewrite future executions.""" # TODO: Use this for all executions? Problem is that caching materializes extra # ordering columns - # TODO: May want to support some partial ordering info even for non-strict ordering mode - keep_order_info = self._strictly_ordered - sql, ordering_info = bigframes.core.compile.compile_raw( + sql, ordering_info = self._compiler.compile_raw( self._with_cached_executions(array_value.node) ) tmp_table = self._sql_to_temp_table( @@ -1904,7 +1905,7 @@ def _cache_with_cluster_cols( ) cached_replacement = array_value.as_cached( cache_table=self.bqclient.get_table(tmp_table), - ordering=ordering_info if keep_order_info else None, + ordering=ordering_info, ).node self._cached_executions[array_value.node] = cached_replacement @@ -1917,7 +1918,7 @@ def _cache_with_offsets(self, array_value: core.ArrayValue): "Caching with offsets only supported in strictly ordered mode." ) offset_column = bigframes.core.guid.generate_guid("bigframes_offsets") - sql = bigframes.core.compile.compile_unordered( + sql = self._compiler.compile_unordered( self._with_cached_executions( array_value.promote_offsets(offset_column).node ) @@ -2023,7 +2024,7 @@ def _peek( """A 'peek' efficiently accesses a small number of rows in the dataframe.""" if not tree_properties.peekable(self._with_cached_executions(array_value.node)): warnings.warn("Peeking this value cannot be done efficiently.") - sql = bigframes.core.compile.compile_peek( + sql = self._compiler.compile_peek( self._with_cached_executions(array_value.node), n_rows ) @@ -2044,10 +2045,10 @@ def _to_sql( array_value = array_value.promote_offsets(offset_column) node_w_cached = self._with_cached_executions(array_value.node) if ordered: - return bigframes.core.compile.compile_ordered( + return self._compiler.compile_ordered( node_w_cached, col_id_overrides=col_id_overrides ) - return bigframes.core.compile.compile_unordered( + return self._compiler.compile_unordered( node_w_cached, col_id_overrides=col_id_overrides ) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index b1c836e1c0..8dfc54c21d 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -132,3 +132,17 @@ def test_unordered_mode_blocks_windowing(unordered_session, function): match=r"Op.*not supported when strict ordering is disabled", ): function(df) + + +def test_unordered_mode_cache_preserves_order(unordered_session): + pd_df = pd.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [4, 5, 9, 3, 1, 6]}, dtype=pd.Int64Dtype() + ) + pd_df.index = pd_df.index.astype(pd.Int64Dtype()) + df = bpd.DataFrame(pd_df, session=unordered_session) + sorted_df = 
df.sort_values("b").cache() + bf_result = sorted_df.to_pandas() + pd_result = pd_df.sort_values("b") + + # B is unique so unstrict order mode result here should be equivalent to strictly ordered + assert_pandas_df_equal(bf_result, pd_result, ignore_order=False) From 676a41022c76684795acf35349770608b9e3be47 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 18 Jul 2024 15:24:57 -0700 Subject: [PATCH 20/36] test: temporarily disable streaming tests (#850) * test: temporarily disable streaming tests * fix import --- tests/system/large/test_streaming.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/system/large/test_streaming.py b/tests/system/large/test_streaming.py index c125fde15a..2debc09994 100644 --- a/tests/system/large/test_streaming.py +++ b/tests/system/large/test_streaming.py @@ -14,9 +14,12 @@ import time +import pytest + import bigframes.streaming +@pytest.mark.skip(reason="b/354024943. Concurrency error need to be fixed.") def test_streaming_to_bigtable(): # launch a continuous query job_id_prefix = "test_streaming_" @@ -49,6 +52,7 @@ def test_streaming_to_bigtable(): query_job.cancel() +@pytest.mark.skip(reason="b/354024943. Concurrency error need to be fixed.") def test_streaming_to_pubsub(): # launch a continuous query job_id_prefix = "test_streaming_pubsub_" From 01d6bbb7479da706dc62bb5e7d51dc28a4042812 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 19 Jul 2024 16:12:33 -0700 Subject: [PATCH 21/36] test: restore remote function stickiness in small tests (#847) * feat: support remote function cleanup with `session.close` * accept the possibility that the artifact may have already been deleted * add cleanup by previous session id * add more documentation * hold session artifacts in a remote function session class * fix the missing return keyword * test: restore stickiness in small `remote_function` tests docs: make `close_session`/`reset_session` appears in the docs --- bigframes/functions/remote_function.py | 45 +++-- bigframes/pandas/__init__.py | 24 ++- bigframes/session/__init__.py | 2 +- tests/system/small/test_remote_function.py | 202 +++++++++++++-------- 4 files changed, 178 insertions(+), 95 deletions(-) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index f24ba1b5fb..b95067983f 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -204,9 +204,12 @@ class IbisSignature(NamedTuple): output_type: IbisDataType -def get_cloud_function_name(function_hash, session_id, uniq_suffix=None): +def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None): "Get a name for the cloud function for the given user defined function." - parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX, session_id, function_hash] + parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX] + if session_id: + parts.append(session_id) + parts.append(function_hash) if uniq_suffix: parts.append(uniq_suffix) return _GCF_FUNCTION_NAME_SEPERATOR.join(parts) @@ -566,10 +569,13 @@ def provision_bq_remote_function( ) # Derive the name of the cloud function underlying the intended BQ - # remote function, also collect updated package requirements as - # determined in the name resolution + # remote function. Use the session id to identify the GCF for unnamed + # functions. 
The named remote functions are treated as a persistant + # artifacts, so let's keep them independent of session id, which also + # makes their naming more stable for the same udf code + session_id = None if name else self._session.session_id cloud_function_name = get_cloud_function_name( - function_hash, self._session.session_id, uniq_suffix + function_hash, session_id, uniq_suffix ) cf_endpoint = self.get_cloud_function_endpoint(cloud_function_name) @@ -635,13 +641,12 @@ def get_remote_function_specs(self, remote_function_name): ) try: for routine in routines: + routine = cast(bigquery.Routine, routine) if routine.reference.routine_id == remote_function_name: - # TODO(shobs): Use first class properties when they are available - # https://github.com/googleapis/python-bigquery/issues/1552 - rf_options = routine._properties.get("remoteFunctionOptions") + rf_options = routine.remote_function_options if rf_options: - http_endpoint = rf_options.get("endpoint") - bq_connection = rf_options.get("connection") + http_endpoint = rf_options.endpoint + bq_connection = rf_options.connection if bq_connection: bq_connection = os.path.basename(bq_connection) break @@ -731,15 +736,15 @@ class _RemoteFunctionSession: def __init__(self): # Session level mapping of remote function artifacts - self._temp_session_artifacts: Dict[str, str] = dict() + self._temp_artifacts: Dict[str, str] = dict() - # Lock to synchronize the update of the session level mapping - self._session_artifacts_lock = threading.Lock() + # Lock to synchronize the update of the session artifacts + self._artifacts_lock = threading.Lock() - def _update_artifacts(self, bqrf_routine: str, gcf_path: str): + def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str): """Update remote function artifacts in the current session.""" - with self._session_artifacts_lock: - self._temp_session_artifacts[bqrf_routine] = gcf_path + with self._artifacts_lock: + self._temp_artifacts[bqrf_routine] = gcf_path def clean_up( self, @@ -748,8 +753,8 @@ def clean_up( session_id: str, ): """Delete remote function artifacts in the current session.""" - with self._session_artifacts_lock: - for bqrf_routine, gcf_path in self._temp_session_artifacts.items(): + with self._artifacts_lock: + for bqrf_routine, gcf_path in self._temp_artifacts.items(): # Let's accept the possibility that the remote function may have # been deleted directly by the user bqclient.delete_routine(bqrf_routine, not_found_ok=True) @@ -761,7 +766,7 @@ def clean_up( except google.api_core.exceptions.NotFound: pass - self._temp_session_artifacts.clear() + self._temp_artifacts.clear() # Inspired by @udf decorator implemented in ibis-bigquery package # https://github.com/ibis-project/ibis-bigquery/blob/main/ibis_bigquery/udf/__init__.py @@ -1206,7 +1211,7 @@ def try_delattr(attr): # explicit name, we are assuming that the user wants to persist them # with that name and would directly manage their lifecycle. 
if created_new and (not name): - self._update_artifacts( + self._update_temp_artifacts( func.bigframes_remote_function, func.bigframes_cloud_function ) return func diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index eb990d2393..21f75eb82c 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -847,10 +847,28 @@ def clean_up_by_session_id( option_context = config.option_context """Global :class:`~bigframes._config.option_context` to configure BigQuery DataFrames.""" + # Session management APIs -get_global_session = global_session.get_global_session -close_session = global_session.close_session -reset_session = global_session.close_session +def get_global_session(): + return global_session.get_global_session() + + +get_global_session.__doc__ = global_session.get_global_session.__doc__ + + +def close_session(): + return global_session.close_session() + + +close_session.__doc__ = global_session.close_session.__doc__ + + +def reset_session(): + return global_session.close_session() + + +reset_session.__doc__ = global_session.close_session.__doc__ + # SQL Compilation uses recursive algorithms on deep trees # 10M tree depth should be sufficient to generate any sql that is under bigquery limit diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index ca242d269c..77a20026dd 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -400,7 +400,7 @@ def _clean_up_tables(self): def close(self): """Delete resources that were created with this session's session_id. This includes BigQuery tables, remote functions and cloud functions - serving the remote functions""" + serving the remote functions.""" self._clean_up_tables() self._remote_function_session.clean_up( self.bqclient, self.cloudfunctionsclient, self.session_id diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index d84d520988..c07a0afb44 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -77,15 +77,27 @@ def bq_cf_connection_location_project_mismatched() -> str: @pytest.fixture(scope="module") -def session_with_bq_connection( - bq_cf_connection, dataset_id_permanent -) -> bigframes.Session: +def session_with_bq_connection(bq_cf_connection) -> bigframes.Session: session = bigframes.Session( bigframes.BigQueryOptions(bq_connection=bq_cf_connection, location="US") ) return session +def get_rf_name(func, package_requirements=None, is_row_processor=False): + """Get a remote function name for testing given a udf.""" + # Augment user package requirements with any internal package + # requirements + package_requirements = rf._get_updated_package_requirements( + package_requirements, is_row_processor + ) + + # Compute a unique hash representing the user code + function_hash = rf._get_hash(func, package_requirements) + + return f"bigframes_{function_hash}" + + @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( bigquery_client, @@ -96,8 +108,11 @@ def test_remote_function_direct_no_session_param( dataset_id_permanent, bq_cf_connection, ): - @rf.remote_function( - [int], + def square(x): + return x * x + + square = rf.remote_function( + int, int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, @@ -107,9 +122,8 @@ def test_remote_function_direct_no_session_param( bigquery_connection=bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. 
reuse=True, - ) - def square(x): - return x * x + name=get_rf_name(square), + )(square) # Function should still work normally. assert square(2) == 4 @@ -153,8 +167,11 @@ def test_remote_function_direct_no_session_param_location_specified( dataset_id_permanent, bq_cf_connection_location, ): - @rf.remote_function( - [int], + def square(x): + return x * x + + square = rf.remote_function( + int, int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, @@ -164,9 +181,8 @@ def test_remote_function_direct_no_session_param_location_specified( bigquery_connection=bq_cf_connection_location, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - ) - def square(x): - return x * x + name=get_rf_name(square), + )(square) # Function should still work normally. assert square(2) == 4 @@ -204,13 +220,17 @@ def test_remote_function_direct_no_session_param_location_mismatched( dataset_id_permanent, bq_cf_connection_location_mismatched, ): + def square(x): + # Not expected to reach this code, as the location of the + # connection doesn't match the location of the dataset. + return x * x # pragma: NO COVER + with pytest.raises( ValueError, match=re.escape("The location does not match BigQuery connection location:"), ): - - @rf.remote_function( - [int], + rf.remote_function( + int, int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, @@ -220,11 +240,8 @@ def test_remote_function_direct_no_session_param_location_mismatched( bigquery_connection=bq_cf_connection_location_mismatched, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - ) - def square(x): - # Not expected to reach this code, as the location of the - # connection doesn't match the location of the dataset. - return x * x # pragma: NO COVER + name=get_rf_name(square), + )(square) @pytest.mark.flaky(retries=2, delay=120) @@ -237,8 +254,11 @@ def test_remote_function_direct_no_session_param_location_project_specified( dataset_id_permanent, bq_cf_connection_location_project, ): - @rf.remote_function( - [int], + def square(x): + return x * x + + square = rf.remote_function( + int, int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, @@ -248,9 +268,8 @@ def test_remote_function_direct_no_session_param_location_project_specified( bigquery_connection=bq_cf_connection_location_project, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - ) - def square(x): - return x * x + name=get_rf_name(square), + )(square) # Function should still work normally. assert square(2) == 4 @@ -288,15 +307,19 @@ def test_remote_function_direct_no_session_param_project_mismatched( dataset_id_permanent, bq_cf_connection_location_project_mismatched, ): + def square(x): + # Not expected to reach this code, as the project of the + # connection doesn't match the project of the dataset. + return x * x # pragma: NO COVER + with pytest.raises( ValueError, match=re.escape( "The project_id does not match BigQuery connection gcp_project_id:" ), ): - - @rf.remote_function( - [int], + rf.remote_function( + int, int, bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, @@ -306,23 +329,25 @@ def test_remote_function_direct_no_session_param_project_mismatched( bigquery_connection=bq_cf_connection_location_project_mismatched, # See e2e tests for tests that actually deploy the Cloud Function. 
reuse=True, - ) - def square(x): - # Not expected to reach this code, as the project of the - # connection doesn't match the project of the dataset. - return x * x # pragma: NO COVER + name=get_rf_name(square), + )(square) @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_session_param(session_with_bq_connection, scalars_dfs): - @rf.remote_function( - [int], - int, - session=session_with_bq_connection, - ) +def test_remote_function_direct_session_param( + session_with_bq_connection, scalars_dfs, dataset_id_permanent +): def square(x): return x * x + square = rf.remote_function( + int, + int, + session=session_with_bq_connection, + dataset=dataset_id_permanent, + name=get_rf_name(square), + )(square) + # Function should still work normally. assert square(2) == 4 @@ -351,7 +376,12 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_default(session_with_bq_connection, scalars_dfs): +def test_remote_function_via_session_default( + session_with_bq_connection, scalars_dfs, dataset_id_permanent +): + def square(x): + return x * x + # Session has bigquery connection initialized via context. Without an # explicit dataset the default dataset from the session would be used. # Without an explicit bigquery connection, the one present in Session set @@ -359,9 +389,9 @@ def test_remote_function_via_session_default(session_with_bq_connection, scalars # the default behavior of reuse=True will take effect. Please note that the # udf is same as the one used in other tests in this file so the underlying # cloud function would be common and quickly reused. - @session_with_bq_connection.remote_function([int], int) - def square(x): - return x * x + square = session_with_bq_connection.remote_function( + int, int, dataset_id_permanent, name=get_rf_name(square) + )(square) # Function should still work normally. assert square(2) == 4 @@ -394,16 +424,18 @@ def square(x): def test_remote_function_via_session_with_overrides( session, scalars_dfs, dataset_id_permanent, bq_cf_connection ): - @session.remote_function( - [int], + def square(x): + return x * x + + square = session.remote_function( + int, int, dataset_id_permanent, bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. reuse=True, - ) - def square(x): - return x * x + name=get_rf_name(square), + )(square) # Function should still work normally. 
assert square(2) == 4 @@ -433,11 +465,15 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap(session_with_bq_connection, scalars_dfs): +def test_dataframe_applymap( + session_with_bq_connection, scalars_dfs, dataset_id_permanent +): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection.remote_function( + [int], int, dataset_id_permanent, name=get_rf_name(add_one) + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -460,11 +496,15 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap_na_ignore(session_with_bq_connection, scalars_dfs): +def test_dataframe_applymap_na_ignore( + session_with_bq_connection, scalars_dfs, dataset_id_permanent +): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection.remote_function( + [int], int, dataset_id_permanent, name=get_rf_name(add_one) + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -485,7 +525,9 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_series_map_bytes(session_with_bq_connection, scalars_dfs): +def test_series_map_bytes( + session_with_bq_connection, scalars_dfs, dataset_id_permanent +): """Check that bytes is support as input and output.""" scalars_df, scalars_pandas_df = scalars_dfs @@ -502,8 +544,11 @@ def bytes_to_hex(mybytes: bytes) -> bytes: pd.ArrowDtype(pyarrow.binary()) ) + packages = ["pandas"] remote_bytes_to_hex = session_with_bq_connection.remote_function( - packages=["pandas"] + dataset=dataset_id_permanent, + name=get_rf_name(bytes_to_hex, package_requirements=packages), + packages=packages, )(bytes_to_hex) bf_result = scalars_df.bytes_col.map(remote_bytes_to_hex).to_pandas() @@ -541,11 +586,14 @@ def test_skip_bq_connection_check(dataset_id_permanent): match=f"Not found: Connection {connection_name}", ): - @session.remote_function([int], int, dataset=dataset_id_permanent) def add_one(x): # Not expected to reach this code, as the connection doesn't exist. return x + 1 # pragma: NO COVER + session.remote_function( + [int], int, dataset=dataset_id_permanent, name=get_rf_name(add_one) + )(add_one) + @pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_detects_invalid_function(session, dataset_id): @@ -570,7 +618,10 @@ def test_read_gbq_function_like_original( dataset_id_permanent, bq_cf_connection, ): - @rf.remote_function( + def square1(x): + return x * x + + square1 = rf.remote_function( [int], int, bigquery_client=bigquery_client, @@ -580,29 +631,28 @@ def test_read_gbq_function_like_original( resource_manager_client=resourcemanager_client, bigquery_connection=bq_cf_connection, reuse=True, - ) - def square1(x): - return x * x + name=get_rf_name(square1), + )(square1) # Function should still work normally. assert square1(2) == 4 square2 = rf.read_gbq_function( - function_name=square1.bigframes_remote_function, + function_name=square1.bigframes_remote_function, # type: ignore session=session, ) # The newly-created function (square1) should have a remote function AND a # cloud function associated with it, while the read-back version (square2) # should only have a remote function. 
- assert square1.bigframes_remote_function - assert square1.bigframes_cloud_function + assert square1.bigframes_remote_function # type: ignore + assert square1.bigframes_cloud_function # type: ignore assert square2.bigframes_remote_function assert not hasattr(square2, "bigframes_cloud_function") # They should point to the same function. - assert square1.bigframes_remote_function == square2.bigframes_remote_function + assert square1.bigframes_remote_function == square2.bigframes_remote_function # type: ignore # The result of applying them should be the same. int64_col = scalars_df_index["int64_col"] @@ -743,7 +793,7 @@ def test_read_gbq_function_enforces_explicit_types( @pytest.mark.flaky(retries=2, delay=120) -def test_df_apply_axis_1(session, scalars_dfs): +def test_df_apply_axis_1(session, scalars_dfs, dataset_id_permanent): columns = [ "bool_col", "int64_col", @@ -764,6 +814,8 @@ def add_ints(row): add_ints_remote = session.remote_function( bigframes.series.Series, int, + dataset_id_permanent, + name=get_rf_name(add_ints, is_row_processor=True), )(add_ints) with pytest.warns( @@ -785,7 +837,7 @@ def add_ints(row): @pytest.mark.flaky(retries=2, delay=120) -def test_df_apply_axis_1_ordering(session, scalars_dfs): +def test_df_apply_axis_1_ordering(session, scalars_dfs, dataset_id_permanent): columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] ordering_columns = ["bool_col", "int64_col"] scalars_df, scalars_pandas_df = scalars_dfs @@ -793,7 +845,12 @@ def test_df_apply_axis_1_ordering(session, scalars_dfs): def add_ints(row): return row["int64_col"] + row["int64_too"] - add_ints_remote = session.remote_function(bigframes.series.Series, int)(add_ints) + add_ints_remote = session.remote_function( + bigframes.series.Series, + int, + dataset_id_permanent, + name=get_rf_name(add_ints, is_row_processor=True), + )(add_ints) bf_result = ( scalars_df[columns] @@ -817,7 +874,7 @@ def add_ints(row): @pytest.mark.flaky(retries=2, delay=120) -def test_df_apply_axis_1_multiindex(session): +def test_df_apply_axis_1_multiindex(session, dataset_id_permanent): pd_df = pd.DataFrame( {"x": [1, 2, 3], "y": [1.5, 3.75, 5], "z": ["pq", "rs", "tu"]}, index=pd.MultiIndex.from_tuples([("a", 100), ("a", 200), ("b", 300)]), @@ -827,9 +884,12 @@ def test_df_apply_axis_1_multiindex(session): def add_numbers(row): return row["x"] + row["y"] - add_numbers_remote = session.remote_function(bigframes.series.Series, float)( - add_numbers - ) + add_numbers_remote = session.remote_function( + bigframes.series.Series, + float, + dataset_id_permanent, + name=get_rf_name(add_numbers, is_row_processor=True), + )(add_numbers) bf_result = bf_df.apply(add_numbers_remote, axis=1).to_pandas() pd_result = pd_df.apply(add_numbers, axis=1) From f9e4435f7ca47c9a53f295b54592396d8f2c7c2f Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 22 Jul 2024 18:06:18 -0700 Subject: [PATCH 22/36] test: fix mypy failures to unblock presubmit tests (#852) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The presubmit and continuous tests have started failing due to mypy. This change finds the failure points and puts in workarounds. Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/session/__init__.py | 10 ++++++---- third_party/bigframes_vendored/pandas/core/series.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 77a20026dd..f5482ff389 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -650,15 +650,17 @@ def _read_gbq_query( index_cols = _to_index_cols(index_col) - filters = list(filters) - if len(filters) != 0 or max_results is not None: + filters_copy1, filters_copy2 = itertools.tee(filters) + has_filters = len(list(filters_copy1)) != 0 + filters = typing.cast(third_party_pandas_gbq.FiltersType, filters_copy2) + if has_filters or max_results is not None: # TODO(b/338111344): If we are running a query anyway, we might as # well generate ROW_NUMBER() at the same time. all_columns = itertools.chain(index_cols, columns) if columns else () query = bf_io_bigquery.to_query( query, all_columns, - bf_io_bigquery.compile_filters(filters) if filters else None, + bf_io_bigquery.compile_filters(filters) if has_filters else None, max_results=max_results, # We're executing the query, so we don't need time travel for # determinism. @@ -768,7 +770,7 @@ def _read_gbq_table( ) columns = list(columns) - filters = list(filters) + filters = typing.cast(list, list(filters)) # --------------------------------- # Fetch table metadata and validate diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a430c3375f..a30ed9cd92 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3521,7 +3521,7 @@ def mask(self, cond, other): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def clip(self): + def clip(self, lower, upper): """Trim values at input threshold(s). Assigns values outside boundary to boundary values. 
        Thresholds can be

From 1b6a556206a7a66283339d827ab12db2753521e2 Mon Sep 17 00:00:00 2001
From: TrevorBergeron
Date: Tue, 23 Jul 2024 12:20:16 -0700
Subject: [PATCH 23/36] fix: Fix 'sql' property for null index (#844)

---
 bigframes/dataframe.py                           | 4 +++-
 bigframes/session/__init__.py                    | 2 ++
 bigframes/session/_io/bigquery/read_gbq_table.py | 4 ++++
 tests/system/small/test_unordered.py             | 9 +++++++++
 4 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index dcb2fd09cb..2a3aead80a 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -385,7 +385,9 @@ def _to_sql_query(
     @property
     def sql(self) -> str:
         """Compiles this DataFrame's expression tree to SQL."""
-        include_index = self.index.name is not None or len(self.index.names) > 1
+        include_index = self._has_index and (
+            self.index.name is not None or len(self.index.names) > 1
+        )
         sql, _, _ = self._to_sql_query(include_index=include_index)
         return sql

diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
index f5482ff389..9c953ee594 100644
--- a/bigframes/session/__init__.py
+++ b/bigframes/session/__init__.py
@@ -889,6 +889,8 @@ def _read_gbq_table(
             table=table,
             index_cols=index_cols,
             api_name=api_name,
+            # If not in strict ordering mode, don't go through the overhead of scanning index column(s) to determine if unique
+            metadata_only=not self._strictly_ordered,
         )
         schema = schemata.ArraySchema.from_bq_table(table)
         if columns:
diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py
index 879a8ba44c..03b26f9460 100644
--- a/bigframes/session/_io/bigquery/read_gbq_table.py
+++ b/bigframes/session/_io/bigquery/read_gbq_table.py
@@ -152,6 +152,7 @@ def are_index_cols_unique(
     table: bigquery.table.Table,
     index_cols: List[str],
     api_name: str,
+    metadata_only: bool = False,
 ) -> bool:
     if len(index_cols) == 0:
         return False
@@ -161,6 +162,9 @@ def are_index_cols_unique(
     if (len(primary_keys) > 0) and primary_keys <= frozenset(index_cols):
         return True

+    if metadata_only:
+        # Sometimes not worth scanning data to check uniqueness
+        return False
     # TODO(b/337925142): Avoid a "SELECT *" subquery here by ensuring
     # table_expression only selects just index_cols.
is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 8dfc54c21d..6d9171aeed 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -20,6 +20,15 @@ from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +def test_unordered_mode_sql_no_hash(unordered_session): + bf_df = unordered_session.read_gbq( + "bigquery-public-data.ethereum_blockchain.blocks" + ) + sql = bf_df.sql + assert "ORDER BY".casefold() not in sql.casefold() + assert "farm_fingerprint".casefold() not in sql.casefold() + + def test_unordered_mode_job_label(unordered_session): pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) df = bpd.DataFrame(pd_df, session=unordered_session) From 10da997a9276d8bd0ba8c13861c8a5246c905ba8 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:31:13 -0700 Subject: [PATCH 24/36] chore: fix create_bigtable script (#853) --- scripts/create_bigtable.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/create_bigtable.py b/scripts/create_bigtable.py index f81bb8a013..da40e9063d 100644 --- a/scripts/create_bigtable.py +++ b/scripts/create_bigtable.py @@ -18,6 +18,7 @@ import os import sys +from google.cloud.bigtable import column_family import google.cloud.bigtable as bigtable PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT") @@ -57,8 +58,11 @@ def create_table(instance): table_id, instance, ) + max_versions_rule = column_family.MaxVersionsGCRule(1) + column_family_id = "body_mass_g" + column_families = {column_family_id: max_versions_rule} if not table.exists(): - table.create() + table.create(column_families=column_families) print(f"Created table {table_id}") From eb0ef75d7ce2e0a1e1735429d61df53828240a69 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 25 Jul 2024 13:37:45 -0700 Subject: [PATCH 25/36] test: enable Gemini 1.5 flash default test (#859) --- tests/system/small/ml/test_llm.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index ee9d654d93..b926004fd8 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -328,12 +328,7 @@ def test_create_load_gemini_text_generator_model( @pytest.mark.parametrize( "model_name", - ( - "gemini-pro", - "gemini-1.5-pro-preview-0514", - # TODO(garrrettwu): enable when cl/637028077 is in prod. 
- # "gemini-1.5-flash-preview-0514" - ), + ("gemini-pro", "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514"), ) @pytest.mark.flaky(retries=2) def test_gemini_text_generator_predict_default_params_success( From 823c0ce57611c0918a9e9999638d7393337fe9af Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 25 Jul 2024 14:50:10 -0700 Subject: [PATCH 26/36] feat: Add config option to set partial ordering mode (#855) --- bigframes/_config/bigquery_options.py | 33 +++++++++++++++++++++------ bigframes/session/__init__.py | 12 +++++++--- tests/system/conftest.py | 6 ++--- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index ad79543cb8..0506f1841e 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -16,7 +16,8 @@ from __future__ import annotations -from typing import Optional +from enum import Enum +from typing import Literal, Optional import warnings import google.api_core.exceptions @@ -26,6 +27,12 @@ import bigframes.constants import bigframes.exceptions + +class OrderingMode(Enum): + STRICT = "strict" + PARTIAL = "partial" + + SESSION_STARTED_MESSAGE = ( "Cannot change '{attribute}' once a session has started. " "Call bigframes.pandas.close_session() first, if you are using the bigframes.pandas API." @@ -57,6 +64,14 @@ def _validate_location(value: Optional[str]): ) +def _validate_ordering_mode(value: str) -> OrderingMode: + if value.casefold() == OrderingMode.STRICT.value.casefold(): + return OrderingMode.STRICT + if value.casefold() == OrderingMode.PARTIAL.value.casefold(): + return OrderingMode.PARTIAL + raise ValueError("Ordering mode must be one of 'strict' or 'partial'.") + + class BigQueryOptions: """Encapsulates configuration for working with a session.""" @@ -71,7 +86,7 @@ def __init__( kms_key_name: Optional[str] = None, skip_bq_connection_check: bool = False, *, - _strictly_ordered: bool = True, + ordering_mode: Literal["strict", "partial"] = "strict", ): self._credentials = credentials self._project = project @@ -82,8 +97,8 @@ def __init__( self._kms_key_name = kms_key_name self._skip_bq_connection_check = skip_bq_connection_check self._session_started = False - # Determines the ordering strictness for the session. For internal use only. - self._strictly_ordered_internal = _strictly_ordered + # Determines the ordering strictness for the session. + self._ordering_mode = _validate_ordering_mode(ordering_mode) @property def application_name(self) -> Optional[str]: @@ -241,6 +256,10 @@ def kms_key_name(self, value: str): self._kms_key_name = value @property - def _strictly_ordered(self) -> bool: - """Internal use only. Controls whether total row order is always maintained for DataFrame/Series.""" - return self._strictly_ordered_internal + def ordering_mode(self) -> Literal["strict", "partial"]: + """Controls whether total row order is always maintained for DataFrame/Series.""" + return self._ordering_mode.value + + @ordering_mode.setter + def ordering_mode(self, ordering_mode: Literal["strict", "partial"]) -> None: + self._ordering_mode = _validate_ordering_mode(ordering_mode) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 9c953ee594..22ca63d25b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -297,15 +297,21 @@ def __init__( self._execution_count = 0 # Whether this session treats objects as totally ordered. 
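        # Editor's note, a hedged usage sketch rather than part of this patch: with the
        # ordering_mode property introduced above on BigQueryOptions, a caller would
        # presumably opt in to partial ordering before creating a session, e.g.
        #     import bigframes.pandas as bpd
        #     bpd.options.bigquery.ordering_mode = "partial"
        # after which _strictly_ordered below is False and the session falls back to the
        # NULL default index instead of the sequential int64 index.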
# Will expose as feature later, only False for internal testing - self._strictly_ordered: bool = context._strictly_ordered + self._strictly_ordered: bool = context.ordering_mode != "partial" + if not self._strictly_ordered: + warnings.warn( + "Partial ordering mode is a preview feature and is subject to change.", + bigframes.exceptions.PreviewWarning, + ) + # Sequential index needs total ordering to generate, so use null index with unstrict ordering. self._default_index_type: bigframes.enums.DefaultIndexKind = ( bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 - if context._strictly_ordered + if self._strictly_ordered else bigframes.enums.DefaultIndexKind.NULL ) self._compiler = bigframes.core.compile.SQLCompiler( - strict=context._strictly_ordered + strict=self._strictly_ordered ) self._remote_function_session = bigframes_rf._RemoteFunctionSession() diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 59439c306f..55079380f4 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -141,9 +141,7 @@ def session() -> Generator[bigframes.Session, None, None]: @pytest.fixture(scope="session", params=["ordered", "unordered"]) def maybe_ordered_session(request) -> Generator[bigframes.Session, None, None]: - context = bigframes.BigQueryOptions( - location="US", _strictly_ordered=request.param == "ordered" - ) + context = bigframes.BigQueryOptions(location="US", ordering_mode="partial") session = bigframes.Session(context=context) yield session session.close() # close generated session at cleanup type @@ -151,7 +149,7 @@ def maybe_ordered_session(request) -> Generator[bigframes.Session, None, None]: @pytest.fixture(scope="session") def unordered_session() -> Generator[bigframes.Session, None, None]: - context = bigframes.BigQueryOptions(location="US", _strictly_ordered=False) + context = bigframes.BigQueryOptions(location="US", ordering_mode="partial") session = bigframes.Session(context=context) yield session session.close() # close generated session at cleanup type From 2c810865035e96b577132893789608ee65d3e186 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 26 Jul 2024 15:12:51 -0700 Subject: [PATCH 27/36] chore: increase stale cloud functions cleanup rate (#863) * chore: increase stale cloud functions cleanup rate * reword the comment --- tests/system/conftest.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 55079380f4..3acae0e75b 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -43,11 +43,15 @@ # Use this to control the number of cloud functions being deleted in a single # test session. This should help soften the spike of the number of mutations per -# minute tracked against a quota limit (default 60, increased to 120 for -# bigframes-dev project) by the Cloud Functions API -# We are running pytest with "-n 20". Let's say each session lasts about a -# minute, so we are setting a limit of 120/20 = 6 deletions per session. -MAX_NUM_FUNCTIONS_TO_DELETE_PER_SESSION = 6 +# minute tracked against the quota limit: +# Cloud Functions API -> Per project mutation requests per minute per region +# (default 60, increased to 1000 for the test projects) +# We are running pytest with "-n 20". For a rough estimation, let's say all +# parallel sessions run in parallel. So that allows 1000/20 = 50 mutations per +# minute. One session takes about 1 minute to create a remote function. This +# would allow 50-1 = 49 deletions per session. 
As a heuristic let's use half of +# that potential for the clean up. +MAX_NUM_FUNCTIONS_TO_DELETE_PER_SESSION = 25 CURRENT_DIR = pathlib.Path(__file__).parent DATA_DIR = CURRENT_DIR.parent / "data" From 0676f73c22411f9d19399fd34790298c32f8316b Mon Sep 17 00:00:00 2001 From: Mend Renovate Date: Mon, 29 Jul 2024 19:32:21 +0200 Subject: [PATCH 28/36] chore(deps): update all dependencies (#715) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore(deps): update all dependencies * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot Co-authored-by: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> --- samples/snippets/requirements-test.txt | 2 +- samples/snippets/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index beca2e44d9..d66afc7edb 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,3 +1,3 @@ # samples/snippets should be runnable with no "extras" google-cloud-testutils==1.4.0 -pytest==8.2.0 +pytest==8.2.1 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 8fcd19bb2c..fc00b9362e 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,2 +1,2 @@ # samples/snippets should be runnable with no "extras" -bigframes==1.6.0 +bigframes==1.7.0 From b9e6150c1c35959845e4cf129ccde75a8c2abecb Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 29 Jul 2024 11:05:32 -0700 Subject: [PATCH 29/36] refactor: Block .head consistently in unordered mode (#854) --- bigframes/constants.py | 2 ++ bigframes/core/groupby/__init__.py | 2 ++ bigframes/core/validations.py | 13 ++++++++----- bigframes/dataframe.py | 2 ++ bigframes/series.py | 2 +- tests/system/small/test_unordered.py | 4 ++++ 6 files changed, 19 insertions(+), 6 deletions(-) diff --git a/bigframes/constants.py b/bigframes/constants.py index 9591297956..3c18fd20bd 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -99,3 +99,5 @@ # BigQuery default is 10000, leave 100 for overhead MAX_COLUMNS = 9900 + +SUGGEST_PEEK_PREVIEW = "Use .peek(n) to preview n arbitrary rows." diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 11a5d43ba0..02bf201ca0 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -109,6 +109,7 @@ def __getitem__( dropna=self._dropna, ) + @validations.requires_strict_ordering() def head(self, n: int = 5) -> df.DataFrame: block = self._block if self._dropna: @@ -531,6 +532,7 @@ def __init__( def _session(self) -> core.Session: return self._block.session + @validations.requires_strict_ordering() def head(self, n: int = 5) -> series.Series: block = self._block if self._dropna: diff --git a/bigframes/core/validations.py b/bigframes/core/validations.py index dc22047e3b..c5761f4e09 100644 --- a/bigframes/core/validations.py +++ b/bigframes/core/validations.py @@ -17,7 +17,7 @@ from __future__ import annotations import functools -from typing import Protocol, TYPE_CHECKING +from typing import Optional, Protocol, TYPE_CHECKING import bigframes.constants import bigframes.exceptions @@ -32,11 +32,11 @@ def _session(self) -> Session: ... 
-def requires_strict_ordering(): +def requires_strict_ordering(suggestion: Optional[str] = None): def decorator(meth): @functools.wraps(meth) def guarded_meth(object: HasSession, *args, **kwargs): - enforce_ordered(object, meth.__name__) + enforce_ordered(object, meth.__name__, suggestion) return meth(object, *args, **kwargs) return guarded_meth @@ -44,8 +44,11 @@ def guarded_meth(object: HasSession, *args, **kwargs): return decorator -def enforce_ordered(object: HasSession, opname: str) -> None: +def enforce_ordered( + object: HasSession, opname: str, suggestion: Optional[str] = None +) -> None: if not object._session._strictly_ordered: + suggestion_substr = suggestion + " " if suggestion else "" raise bigframes.exceptions.OrderRequiredError( - f"Op {opname} not supported when strict ordering is disabled. {bigframes.constants.FEEDBACK_LINK}" + f"Op {opname} not supported when strict ordering is disabled. {suggestion_substr}{bigframes.constants.FEEDBACK_LINK}" ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2a3aead80a..717549316a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -49,6 +49,7 @@ import bigframes import bigframes._config.display_options as display_options +import bigframes.constants import bigframes.constants as constants import bigframes.core from bigframes.core import log_adapter @@ -1293,6 +1294,7 @@ def _compute_dry_run(self) -> bigquery.QueryJob: def copy(self) -> DataFrame: return DataFrame(self._block) + @validations.requires_strict_ordering(bigframes.constants.SUGGEST_PEEK_PREVIEW) def head(self, n: int = 5) -> DataFrame: return typing.cast(DataFrame, self.iloc[:n]) diff --git a/bigframes/series.py b/bigframes/series.py index 8fdafe25e7..7c530b9612 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -633,7 +633,7 @@ def dropna( result = result.reset_index() return Series(result) - @validations.requires_strict_ordering() + @validations.requires_strict_ordering(bigframes.constants.SUGGEST_PEEK_PREVIEW) def head(self, n: int = 5) -> Series: return typing.cast(Series, self.iloc[0:n]) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 6d9171aeed..2e97078ef5 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -131,6 +131,10 @@ def test_unordered_drop_duplicates(unordered_session, keep): lambda x: x.a.iloc[1::2], id="series_iloc", ), + pytest.param( + lambda x: x.head(3), + id="head", + ), ], ) def test_unordered_mode_blocks_windowing(unordered_session, function): From f23de1a9711e149549c6df81990e043b042311d5 Mon Sep 17 00:00:00 2001 From: Mend Renovate Date: Tue, 30 Jul 2024 19:29:19 +0200 Subject: [PATCH 30/36] chore(deps): update all dependencies (#866) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore(deps): update all dependencies * revert pyarrow change Can't update pyarrow until we update ibis. 
--------- Co-authored-by: Tim Sweña (Swast) --- samples/polars/requirements-test.txt | 2 +- samples/polars/requirements.txt | 4 ++-- samples/snippets/requirements-test.txt | 2 +- samples/snippets/requirements.txt | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/polars/requirements-test.txt b/samples/polars/requirements-test.txt index beca2e44d9..cbac5e3f12 100644 --- a/samples/polars/requirements-test.txt +++ b/samples/polars/requirements-test.txt @@ -1,3 +1,3 @@ # samples/snippets should be runnable with no "extras" google-cloud-testutils==1.4.0 -pytest==8.2.0 +pytest==8.3.2 diff --git a/samples/polars/requirements.txt b/samples/polars/requirements.txt index e3f886e7e3..a1d8fbcdac 100644 --- a/samples/polars/requirements.txt +++ b/samples/polars/requirements.txt @@ -1,3 +1,3 @@ -bigframes==1.6.0 -polars==0.20.31 +bigframes==1.11.1 +polars==1.3.0 pyarrow==15.0.0 diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index d66afc7edb..cbac5e3f12 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,3 +1,3 @@ # samples/snippets should be runnable with no "extras" google-cloud-testutils==1.4.0 -pytest==8.2.1 +pytest==8.3.2 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index fc00b9362e..9b5da5182e 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,2 +1,2 @@ # samples/snippets should be runnable with no "extras" -bigframes==1.7.0 +bigframes==1.11.1 From 8e04c3827063874c27c2f0e9559b7387a8a206bc Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 30 Jul 2024 10:30:17 -0700 Subject: [PATCH 31/36] refactor: Respect session default index in merge and reset_index methods (#862) --- bigframes/core/blocks.py | 39 +++++++++++++++++++++------- tests/system/small/test_unordered.py | 22 ++++++++++++++++ 2 files changed, 51 insertions(+), 10 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 2d7c543678..05865a6699 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -327,8 +327,21 @@ def reset_index(self, drop: bool = True) -> Block: A new Block because dropping index columns can break references from Index classes that point to this block. 
""" - new_index_col_id = guid.generate_guid() - expr = self._expr.promote_offsets(new_index_col_id) + expr = self._expr + if ( + self.session._default_index_type + == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 + ): + new_index_col_id = guid.generate_guid() + expr = expr.promote_offsets(new_index_col_id) + new_index_cols = [new_index_col_id] + elif self.session._default_index_type == bigframes.enums.DefaultIndexKind.NULL: + new_index_cols = [] + else: + raise ValueError( + f"Unrecognized default index kind: {self.session._default_index_type}" + ) + if drop: # Even though the index might be part of the ordering, keep that # ordering expression as reset_index shouldn't change the row @@ -336,9 +349,8 @@ def reset_index(self, drop: bool = True) -> Block: expr = expr.drop_columns(self.index_columns) return Block( expr, - index_columns=[new_index_col_id], + index_columns=new_index_cols, column_labels=self.column_labels, - index_labels=[None], ) else: # Add index names to column index @@ -362,9 +374,8 @@ def reset_index(self, drop: bool = True) -> Block: return Block( expr, - index_columns=[new_index_col_id], + index_columns=new_index_cols, column_labels=column_labels_modified, - index_labels=[None], ) def set_index( @@ -2096,13 +2107,17 @@ def merge( # # This keeps us from generating an index if the user joins a large # BigQuery table against small local data, for example. - if len(self._index_columns) > 0 and len(other._index_columns) > 0: + if ( + self.index.is_null + or other.index.is_null + or self.session._default_index_type == bigframes.enums.DefaultIndexKind.NULL + ): + expr = joined_expr + index_columns = [] + else: offset_index_id = guid.generate_guid() expr = joined_expr.promote_offsets(offset_index_id) index_columns = [offset_index_id] - else: - expr = joined_expr - index_columns = [] return Block(expr, index_columns=index_columns, column_labels=labels) @@ -2604,6 +2619,10 @@ def column_ids(self) -> Sequence[str]: """Column(s) to use as row labels.""" return self._block._index_columns + @property + def is_null(self) -> bool: + return len(self._block._index_columns) == 0 + def to_pandas(self, *, ordered: Optional[bool] = None) -> pd.Index: """Executes deferred operations and downloads the results.""" if len(self.column_ids) == 0: diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 2e97078ef5..7d7097ceb3 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -116,6 +116,28 @@ def test_unordered_drop_duplicates(unordered_session, keep): assert_pandas_df_equal(bf_result.to_pandas(), pd_result, ignore_order=True) +def test_unordered_reset_index(unordered_session): + pd_df = pd.DataFrame({"a": [1, 1, 3], "b": [4, 4, 6]}, dtype=pd.Int64Dtype()) + bf_df = bpd.DataFrame(pd_df, session=unordered_session) + + bf_result = bf_df.set_index("b").reset_index(drop=False) + pd_result = pd_df.set_index("b").reset_index(drop=False) + + assert_pandas_df_equal(bf_result.to_pandas(), pd_result) + + +def test_unordered_merge(unordered_session): + pd_df = pd.DataFrame( + {"a": [1, 1, 3], "b": [4, 4, 6], "c": [1, 2, 3]}, dtype=pd.Int64Dtype() + ) + bf_df = bpd.DataFrame(pd_df, session=unordered_session) + + bf_result = bf_df.merge(bf_df, left_on="a", right_on="c") + pd_result = pd_df.merge(pd_df, left_on="a", right_on="c") + + assert_pandas_df_equal(bf_result.to_pandas(), pd_result, ignore_order=True) + + @pytest.mark.parametrize( ("function"), [ From e95053372c36ea5a91a2d7295c1a3a3671181670 Mon Sep 17 00:00:00 2001 
From: TrevorBergeron Date: Tue, 30 Jul 2024 10:33:14 -0700 Subject: [PATCH 32/36] feat: Allow DataFrame.join for self-join on Null index (#860) * feat: Allow DataFrame.join for self-join on Null index * fix ml caching to apply post-join, add test * fix ml golden sql test * change unordered test to use linear regression --- bigframes/core/blocks.py | 10 ++--- bigframes/ml/core.py | 8 ++-- tests/system/large/ml/test_linear_model.py | 44 ++++++++++++++++++++++ tests/system/small/test_null_index.py | 14 +++++++ tests/unit/ml/test_golden_sql.py | 1 + 5 files changed, 68 insertions(+), 9 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 05865a6699..fd0c9c9539 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2307,11 +2307,11 @@ def join( f"Only how='outer','left','right','inner' currently supported. {constants.FEEDBACK_LINK}" ) # Handle null index, which only supports row join - if (self.index.nlevels == other.index.nlevels == 0) and not block_identity_join: - if not block_identity_join: - result = try_row_join(self, other, how=how) - if result is not None: - return result + # This is the canonical way of aligning on null index, so always allow (ignore block_identity_join) + if self.index.nlevels == other.index.nlevels == 0: + result = try_row_join(self, other, how=how) + if result is not None: + return result raise bigframes.exceptions.NullIndexError( "Cannot implicitly align objects. Set an explicit index using set_index." ) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index ee4d8a8c27..f1b36651f4 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -83,7 +83,7 @@ def distance( """ assert len(x.columns) == 1 and len(y.columns) == 1 - input_data = x.cache().join(y.cache(), how="outer") + input_data = x.join(y, how="outer").cache() x_column_id, y_column_id = x._block.value_columns[0], y._block.value_columns[0] return self._apply_sql( @@ -326,7 +326,7 @@ def create_model( if y_train is None: input_data = X_train.cache() else: - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train, how="outer").cache() options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session @@ -366,7 +366,7 @@ def create_llm_remote_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train, how="outer").cache() options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session @@ -399,7 +399,7 @@ def create_time_series_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train, how="outer").cache() options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 0cc9fc5353..2f4c07fa28 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -111,6 +111,50 @@ def test_linear_regression_customized_params_fit_score( assert reloaded_model.learning_rate == 0.2 +def test_unordered_mode_regression_configure_fit_score( + unordered_session, 
penguins_table_id, dataset_id +): + model = bigframes.ml.linear_model.LinearRegression() + + df = unordered_session.read_gbq(penguins_table_id).dropna() + X_train = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + ] + y_train = df[["body_mass_g"]] + model.fit(X_train, y_train) + + # Check score to ensure the model was fitted + result = model.score(X_train, y_train).to_pandas() + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_REGRESSION_METRICS, index=1 + ) + + # save, load, check parameters to ensure configuration was kept + reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True) + assert reloaded_model._bqml_model is not None + assert ( + f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name + ) + assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" + assert reloaded_model.fit_intercept is True + assert reloaded_model.calculate_p_values is False + assert reloaded_model.enable_global_explain is False + assert reloaded_model.l1_reg is None + assert reloaded_model.l2_reg == 0.0 + assert reloaded_model.learning_rate is None + assert reloaded_model.learning_rate_strategy == "line_search" + assert reloaded_model.ls_init_learning_rate is None + assert reloaded_model.max_iterations == 20 + assert reloaded_model.tol == 0.01 + + # TODO(garrettwu): add tests for param warm_start. Requires a trained model. diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py index 27a3d8dffe..a1e360f73d 100644 --- a/tests/system/small/test_null_index.py +++ b/tests/system/small/test_null_index.py @@ -201,6 +201,20 @@ def test_null_index_stack(scalars_df_null_index, scalars_pandas_df_default_index ) +def test_null_index_series_self_join( + scalars_df_null_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_null_index[["int64_col"]].join( + scalars_df_null_index[["int64_too"]] + ) + pd_result = scalars_pandas_df_default_index[["int64_col"]].join( + scalars_pandas_df_default_index[["int64_too"]] + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + def test_null_index_series_self_aligns( scalars_df_null_index, scalars_pandas_df_default_index ): diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 48fb7011ea..aa7e919b24 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -78,6 +78,7 @@ def mock_X(mock_y, mock_session): ["index_column_label"], ) mock_X.join(mock_y).sql = "input_X_y_sql" + mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y) mock_X.join(mock_y)._to_sql_query.return_value = ( "input_X_y_sql", ["index_column_id"], From d0ab9cc47298bdde638299baecac9dffd7841ede Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 30 Jul 2024 11:28:48 -0700 Subject: [PATCH 33/36] feat: Support to_csv/parquet/json to local files/objects (#858) --- bigframes/core/utils.py | 4 ++ bigframes/dataframe.py | 50 +++++++++----- bigframes/series.py | 41 ++++++++--- tests/system/small/test_dataframe.py | 68 ++++++++++++++++++- tests/system/small/test_series.py | 38 +++++++++++ .../bigframes_vendored/pandas/core/frame.py | 14 ++-- .../bigframes_vendored/pandas/core/generic.py | 41 +++++++---- 7 files changed, 206 insertions(+), 50 deletions(-) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 97c5ef03e5..43c05c6c83 100644 --- a/bigframes/core/utils.py +++ 
b/bigframes/core/utils.py @@ -23,6 +23,10 @@ UNNAMED_INDEX_ID = "bigframes_unnamed_index" +def is_gcs_path(value) -> typing_extensions.TypeGuard[str]: + return isinstance(value, str) and value.startswith("gs://") + + def get_axis_number(axis: typing.Union[str, int]) -> typing.Literal[0, 1]: if axis in {0, "index", "rows"}: return 0 diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 717549316a..5d4918c3ce 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2952,15 +2952,21 @@ def from_records( ) def to_csv( - self, path_or_buf: str, sep=",", *, header: bool = True, index: bool = True - ) -> None: + self, + path_or_buf=None, + sep=",", + *, + header: bool = True, + index: bool = True, + ) -> Optional[str]: # TODO(swast): Can we support partition columns argument? # TODO(chelsealin): Support local file paths. # TODO(swast): Some warning that wildcard is recommended for large # query results? See: # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size - if not path_or_buf.startswith("gs://"): - raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS) + if not utils.is_gcs_path(path_or_buf): + pd_df = self.to_pandas() + return pd_df.to_csv(path_or_buf, sep=sep, header=header, index=index) if "*" not in path_or_buf: raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) @@ -2977,22 +2983,28 @@ def to_csv( export_data_statement, api_name="dataframe-to_csv" ) self._set_internal_query_job(query_job) + return None def to_json( self, - path_or_buf: str, - orient: Literal[ - "split", "records", "index", "columns", "values", "table" - ] = "columns", + path_or_buf=None, + orient: Optional[ + Literal["split", "records", "index", "columns", "values", "table"] + ] = None, *, lines: bool = False, index: bool = True, - ) -> None: + ) -> Optional[str]: # TODO(swast): Can we support partition columns argument? - # TODO(chelsealin): Support local file paths. - if not path_or_buf.startswith("gs://"): - raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS) - + if not utils.is_gcs_path(path_or_buf): + pd_df = self.to_pandas() + return pd_df.to_json( + path_or_buf, + orient=orient, + lines=lines, + index=index, + default_handler=str, + ) if "*" not in path_or_buf: raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) @@ -3021,6 +3033,7 @@ def to_json( export_data_statement, api_name="dataframe-to_json" ) self._set_internal_query_job(query_job) + return None def to_gbq( self, @@ -3119,19 +3132,19 @@ def __array__(self, dtype=None) -> numpy.ndarray: def to_parquet( self, - path: str, + path=None, *, compression: Optional[Literal["snappy", "gzip"]] = "snappy", index: bool = True, - ) -> None: + ) -> Optional[bytes]: # TODO(swast): Can we support partition columns argument? # TODO(chelsealin): Support local file paths. # TODO(swast): Some warning that wildcard is recommended for large # query results? 
See: # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size - if not path.startswith("gs://"): - raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS) - + if not utils.is_gcs_path(path): + pd_df = self.to_pandas() + return pd_df.to_parquet(path, compression=compression, index=index) if "*" not in path: raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD) @@ -3155,6 +3168,7 @@ def to_parquet( export_data_statement, api_name="dataframe-to_parquet" ) self._set_internal_query_job(query_job) + return None def to_dict( self, diff --git a/bigframes/series.py b/bigframes/series.py index 7c530b9612..1a5661529c 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1652,9 +1652,22 @@ def to_frame(self, name: blocks.Label = None) -> bigframes.dataframe.DataFrame: return bigframes.dataframe.DataFrame(block) def to_csv( - self, path_or_buf: str, sep=",", *, header: bool = True, index: bool = True - ) -> None: - return self.to_frame().to_csv(path_or_buf, sep=sep, header=header, index=index) + self, + path_or_buf=None, + sep=",", + *, + header: bool = True, + index: bool = True, + ) -> Optional[str]: + if utils.is_gcs_path(path_or_buf): + return self.to_frame().to_csv( + path_or_buf, sep=sep, header=header, index=index + ) + else: + pd_series = self.to_pandas() + return pd_series.to_csv( + path_or_buf=path_or_buf, sep=sep, header=header, index=index + ) def to_dict(self, into: type[dict] = dict) -> typing.Mapping: return typing.cast(dict, self.to_pandas().to_dict(into)) # type: ignore @@ -1664,17 +1677,23 @@ def to_excel(self, excel_writer, sheet_name="Sheet1", **kwargs) -> None: def to_json( self, - path_or_buf: str, - orient: typing.Literal[ - "split", "records", "index", "columns", "values", "table" - ] = "columns", + path_or_buf=None, + orient: Optional[ + typing.Literal["split", "records", "index", "columns", "values", "table"] + ] = None, *, lines: bool = False, index: bool = True, - ) -> None: - return self.to_frame().to_json( - path_or_buf=path_or_buf, orient=orient, lines=lines, index=index - ) + ) -> Optional[str]: + if utils.is_gcs_path(path_or_buf): + return self.to_frame().to_json( + path_or_buf=path_or_buf, orient=orient, lines=lines, index=index + ) + else: + pd_series = self.to_pandas() + return pd_series.to_json( + path_or_buf=path_or_buf, orient=orient, lines=lines, index=index # type: ignore + ) def to_latex( self, buf=None, columns=None, header=True, index=True, **kwargs diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 67792b3a1d..3a7eff621f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -4125,6 +4125,72 @@ def test_df_to_latex(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result +def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.to_json() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.to_json(default_handler=str) + + assert bf_result == pd_result + + +@skip_legacy_pandas +def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.to_json(bf_result_file, orient="table") + # default_handler for arrow types that have no default conversion + scalars_pandas_df_index.to_json( + pd_result_file, orient="table", default_handler=str + ) + + bf_result = bf_result_file.read() + pd_result = 
pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_csv_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.to_csv() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.to_csv() + + assert bf_result == pd_result + + +def test_df_to_csv_local_file(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.to_csv(bf_result_file) + scalars_pandas_df_index.to_csv(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_parquet_local_bytes(scalars_df_index, scalars_pandas_df_index): + # GEOGRAPHY not supported in parquet export. + unsupported = ["geography_col"] + + bf_result = scalars_df_index.drop(columns=unsupported).to_parquet() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_parquet() + + assert bf_result == pd_result + + +def test_df_to_parquet_local_file(scalars_df_index, scalars_pandas_df_index): + # GEOGRAPHY not supported in parquet export. + unsupported = ["geography_col"] + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.drop(columns=unsupported).to_parquet(bf_result_file) + scalars_pandas_df_index.drop(columns=unsupported).to_parquet(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + def test_df_to_records(scalars_df_index, scalars_pandas_df_index): unsupported = ["numeric_col"] bf_result = scalars_df_index.drop(columns=unsupported).to_records() @@ -4166,7 +4232,7 @@ def test_df_to_pickle(scalars_df_index, scalars_pandas_df_index): scalars_df_index.to_pickle(bf_result_file) scalars_pandas_df_index.to_pickle(pd_result_file) bf_result = bf_result_file.read() - pd_result = bf_result_file.read() + pd_result = pd_result_file.read() assert bf_result == pd_result diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 10fcec63ce..fe6e001797 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2753,6 +2753,44 @@ def test_to_latex(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result +def test_series_to_json_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.int64_col.to_json() + pd_result = scalars_pandas_df_index.int64_col.to_json() + + assert bf_result == pd_result + + +@skip_legacy_pandas +def test_series_to_json_local_file(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.int64_col.to_json(bf_result_file) + scalars_pandas_df_index.int64_col.to_json(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_series_to_csv_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.int64_col.to_csv() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.int64_col.to_csv() + + assert bf_result == pd_result + + +def test_series_to_csv_local_file(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + 
scalars_df_index.int64_col.to_csv(bf_result_file) + scalars_pandas_df_index.int64_col.to_csv(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + def test_to_dict(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index["int64_too"].to_dict() diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f8088f8060..7048d9c6dd 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -476,11 +476,11 @@ def to_gbq( def to_parquet( self, - path: str, + path: Optional[str], *, compression: Optional[Literal["snappy", "gzip"]] = "snappy", index: bool = True, - ) -> None: + ) -> Optional[bytes]: """Write a DataFrame to the binary Parquet format. This function writes the dataframe as a `parquet file @@ -496,9 +496,13 @@ def to_parquet( >>> df.to_parquet(path=gcs_bucket) Args: - path (str): + path (str, path object, file-like object, or None, default None): + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. If None, the result is + returned as bytes. If a string or path, it will be used as Root Directory + path when writing a partitioned dataset. Destination URI(s) of Cloud Storage files(s) to store the extracted dataframe - in format of ``gs:///``. + should be formatted ``gs:///``. If the data size is more than 1GB, you must use a wildcard to export the data into multiple files and the size of the files varies. @@ -511,7 +515,7 @@ def to_parquet( If ``False``, they will not be written to the file. Returns: - None. + bytes if no path argument is provided else None """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 95302e51b2..6734fb6aa9 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -210,14 +210,14 @@ def empty(self) -> bool: def to_json( self, - path_or_buf: str, - orient: Literal[ - "split", "records", "index", "columns", "values", "table" - ] = "columns", + path_or_buf, + orient: Optional[ + Literal["split", "records", "index", "columns", "values", "table"] + ] = None, *, index: bool = True, lines: bool = False, - ) -> None: + ) -> Optional[str]: """Convert the object to a JSON string, written to Cloud Storage. Note NaN's and None will be converted to null and datetime objects @@ -227,16 +227,18 @@ def to_json( Only ``orient='records'`` and ``lines=True`` is supported so far. Args: - path_or_buf (str): - A destination URI of Cloud Storage files(s) to store the extracted + path_or_buf (str, path object, file-like object, or None, default None): + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. + + Can be a destination URI of Cloud Storage files(s) to store the extracted dataframe in format of ``gs:///``. Must contain a wildcard `*` character. If the data size is more than 1GB, you must use a wildcard to export the data into multiple files and the size of the files varies. - - None, file-like objects or local file paths not yet supported. orient ({`split`, `records`, `index`, `columns`, `values`, `table`}, default 'columns): Indication of expected JSON string format. 
@@ -271,17 +273,25 @@ def to_json( list-like. Returns: - None: String output not yet supported. + None or str: If path_or_buf is None, returns the resulting json format as a + string. Otherwise returns None. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_csv(self, path_or_buf: str, *, index: bool = True) -> None: + def to_csv(self, path_or_buf, *, index: bool = True) -> Optional[str]: """Write object to a comma-separated values (csv) file on Cloud Storage. Args: - path_or_buf (str): - A destination URI of Cloud Storage files(s) to store the extracted dataframe - in format of ``gs:///``. + path_or_buf (str, path object, file-like object, or None, default None): + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. If a non-binary file object is passed, it should + be opened with `newline=''`, disabling universal newlines. If a binary + file object is passed, `mode` might need to contain a `'b'`. + + Alternatively, a destination URI of Cloud Storage files(s) to store the + extracted dataframe in format of + ``gs:///``. If the data size is more than 1GB, you must use a wildcard to export the data into multiple files and the size of the files @@ -293,7 +303,8 @@ def to_csv(self, path_or_buf: str, *, index: bool = True) -> None: If True, write row names (index). Returns: - None: String output not yet supported. + None or str: If path_or_buf is None, returns the resulting json format as a + string. Otherwise returns None. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From cbf2d42e4d961a7537381a9c3b28a8b463ad8f74 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 30 Jul 2024 13:00:49 -0700 Subject: [PATCH 34/36] fix: reduce redundant `remote_function` deployments (#856) * fix: reduce redundant `remote_function` deployments * do filename override in the naming rather than pickling * update documentation * update documentation --- bigframes/functions/remote_function.py | 29 ++- .../functions/remote_function_template.py | 18 +- bigframes/session/__init__.py | 16 +- .../remote_function_usecases.ipynb | 232 +++++++++--------- 4 files changed, 161 insertions(+), 134 deletions(-) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index b95067983f..d84fbcdbab 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -167,7 +167,23 @@ def get_remote_function_locations(bq_location): def _get_hash(def_, package_requirements=None): "Get hash (32 digits alphanumeric) of a function." - def_repr = cloudpickle.dumps(def_, protocol=_pickle_protocol_version) + # There is a known cell-id sensitivity of the cloudpickle serialization in + # notebooks https://github.com/cloudpipe/cloudpickle/issues/538. Because of + # this, if a cell contains a udf decorated with @remote_function, a unique + # cloudpickle code is generated every time the cell is run, creating new + # cloud artifacts every time. This is slow and wasteful. + # A workaround of the same can be achieved by replacing the filename in the + # code object to a static value + # https://github.com/cloudpipe/cloudpickle/issues/120#issuecomment-338510661. + # + # To respect the user code/environment let's make this modification on a + # copy of the udf, not on the original udf itself. 
+ def_copy = cloudpickle.loads(cloudpickle.dumps(def_)) + def_copy.__code__ = def_copy.__code__.replace( + co_filename="bigframes_place_holder_filename" + ) + + def_repr = cloudpickle.dumps(def_copy, protocol=_pickle_protocol_version) if package_requirements: for p in sorted(package_requirements): def_repr += p.encode() @@ -877,11 +893,16 @@ def remote_function( dynamically using the `bigquery_connection_client` assuming the user has necessary priviliges. The PROJECT_ID should be the same as the BigQuery connection project. reuse (bool, Optional): - Reuse the remote function if is already exists. - `True` by default, which results in reusing an existing remote + Reuse the remote function if already exists. + `True` by default, which will result in reusing an existing remote function and corresponding cloud function (if any) that was previously created for the same udf. - Setting it to `False` forces the creation of a unique remote function. + Please note that for an unnamed (i.e. created without an explicit + `name` argument) remote function, the BigQuery DataFrames + session id is attached in the cloud artifacts names. So for the + effective reuse across the sessions it is recommended to create + the remote function with an explicit `name`. + Setting it to `False` would force creating a unique remote function. If the required remote function does not exist then it would be created irrespective of this param. name (str, Optional): diff --git a/bigframes/functions/remote_function_template.py b/bigframes/functions/remote_function_template.py index 68fe1b917d..c666f41daa 100644 --- a/bigframes/functions/remote_function_template.py +++ b/bigframes/functions/remote_function_template.py @@ -215,9 +215,9 @@ def udf_http_row_processor(request): def generate_udf_code(def_, directory): - """Generate serialized bytecode using cloudpickle given a udf.""" + """Generate serialized code using cloudpickle given a udf.""" udf_code_file_name = "udf.py" - udf_bytecode_file_name = "udf.cloudpickle" + udf_pickle_file_name = "udf.cloudpickle" # original code, only for debugging purpose udf_code = textwrap.dedent(inspect.getsource(def_)) @@ -225,13 +225,13 @@ def generate_udf_code(def_, directory): with open(udf_code_file_path, "w") as f: f.write(udf_code) - # serialized bytecode - udf_bytecode_file_path = os.path.join(directory, udf_bytecode_file_name) + # serialized udf + udf_pickle_file_path = os.path.join(directory, udf_pickle_file_name) # TODO(b/345433300): try io.BytesIO to avoid writing to the file system - with open(udf_bytecode_file_path, "wb") as f: + with open(udf_pickle_file_path, "wb") as f: cloudpickle.dump(def_, f, protocol=_pickle_protocol_version) - return udf_code_file_name, udf_bytecode_file_name + return udf_code_file_name, udf_pickle_file_name def generate_cloud_function_main_code( @@ -252,15 +252,15 @@ def generate_cloud_function_main_code( """ # Pickle the udf with all its dependencies - udf_code_file, udf_bytecode_file = generate_udf_code(def_, directory) + udf_code_file, udf_pickle_file = generate_udf_code(def_, directory) code_blocks = [ f"""\ import cloudpickle # original udf code is in {udf_code_file} -# serialized udf code is in {udf_bytecode_file} -with open("{udf_bytecode_file}", "rb") as f: +# serialized udf code is in {udf_pickle_file} +with open("{udf_pickle_file}", "rb") as f: udf = cloudpickle.load(f) input_types = {repr(input_types)} diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 22ca63d25b..dfec83a56a 100644 --- 
a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1629,15 +1629,21 @@ def remote_function( `True` by default, which will result in reusing an existing remote function and corresponding cloud function (if any) that was previously created for the same udf. + Please note that for an unnamed (i.e. created without an explicit + `name` argument) remote function, the BigQuery DataFrames + session id is attached in the cloud artifacts names. So for the + effective reuse across the sessions it is recommended to create + the remote function with an explicit `name`. Setting it to `False` would force creating a unique remote function. If the required remote function does not exist then it would be created irrespective of this param. name (str, Optional): - Explicit name of the persisted BigQuery remote function. Use it with - caution, because two users working in the same project and dataset - could overwrite each other's remote functions if they use the same - persistent name. When an explicit name is provided, any session - specific clean up (``bigframes.session.Session.close``/ + Explicit name of the persisted BigQuery remote function. Use it + with caution, because more than one users working in the same + project and dataset could overwrite each other's remote + functions if they use the same persistent name. When an explicit + name is provided, any session specific clean up ( + ``bigframes.session.Session.close``/ ``bigframes.pandas.close_session``/ ``bigframes.pandas.reset_session``/ ``bigframes.pandas.clean_up_by_session_id``) does not clean up diff --git a/notebooks/remote_functions/remote_function_usecases.ipynb b/notebooks/remote_functions/remote_function_usecases.ipynb index 3d7ae3e8c7..9317e4b8fe 100644 --- a/notebooks/remote_functions/remote_function_usecases.ipynb +++ b/notebooks/remote_functions/remote_function_usecases.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 28, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 21, "metadata": { "id": "Y6QAttCqqMM0" }, @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -55,14 +55,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shobs/code/bigframes/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3550: UserWarning: Reading cached table from 2024-06-28 02:49:31.716256+00:00 to avoid incompatibilies with previous reads of this table. To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", + "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3550: UserWarning: Reading cached table from 2024-07-24 08:01:12.491984+00:00 to avoid incompatibilies with previous reads of this table. To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", " exec(code_obj, self.user_global_ns, self.user_ns)\n" ] }, { "data": { "text/html": [ - "Query job f72cda67-2a96-4cd2-a624-591c0d540fc9 is DONE. 582.8 kB processed. Open Job" + "Query job 9d155f10-e37a-4d20-b2ff-02868ecb58f4 is DONE. 582.8 kB processed. 
Open Job" ], "text/plain": [ "" @@ -74,7 +74,7 @@ { "data": { "text/html": [ - "Query job 65cf6ca3-73f0-49e6-84a8-1ff79af6ec75 is DONE. 82.0 kB processed. Open Job" + "Query job 5a524e70-12dc-4116-b416-04570bbf754e is DONE. 82.0 kB processed. Open Job" ], "text/plain": [ "" @@ -111,49 +111,49 @@ " \n", " \n", " \n", - " 50\n", - " Rays\n", - " Rangers\n", - " 181\n", + " 36\n", + " Reds\n", + " Cubs\n", + " 159\n", " \n", " \n", - " 72\n", - " Phillies\n", - " Pirates\n", - " 192\n", + " 358\n", + " Dodgers\n", + " Diamondbacks\n", + " 223\n", " \n", " \n", - " 89\n", - " Mariners\n", - " Blue Jays\n", - " 183\n", + " 416\n", + " Yankees\n", + " White Sox\n", + " 216\n", " \n", " \n", - " 351\n", - " Astros\n", - " Angels\n", - " 212\n", + " 523\n", + " Rays\n", + " Athletics\n", + " 187\n", " \n", " \n", - " 382\n", - " Royals\n", - " Yankees\n", - " 259\n", + " 594\n", + " Pirates\n", + " Brewers\n", + " 169\n", " \n", " \n", "\n", "" ], "text/plain": [ - " homeTeamName awayTeamName duration_minutes\n", - "50 Rays Rangers 181\n", - "72 Phillies Pirates 192\n", - "89 Mariners Blue Jays 183\n", - "351 Astros Angels 212\n", - "382 Royals Yankees 259" + " homeTeamName awayTeamName duration_minutes\n", + "36 Reds Cubs 159\n", + "358 Dodgers Diamondbacks 223\n", + "416 Yankees White Sox 216\n", + "523 Rays Athletics 187\n", + "594 Pirates Brewers 169" ] }, - "execution_count": 30, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -202,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -215,7 +215,7 @@ { "data": { "text/html": [ - "Query job f039d478-8dc4-4b60-8eda-179955e06586 is DONE. 0 Bytes processed. Open Job" + "Query job ec8d958d-93ef-45ae-8150-6ccfa8feb89a is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -228,7 +228,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-862150459da5240a6df1ce01c59b32d8-em4ibov0' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_862150459da5240a6df1ce01c59b32d8_em4ibov0'.\n" + "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-e22dbecc9ec0374bda36bc23df3775b0-g8zp' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_e22dbecc9ec0374bda36bc23df3775b0_g8zp'.\n" ] } ], @@ -247,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -260,7 +260,7 @@ { "data": { "text/html": [ - "Query job 23e95831-d913-4d2b-97f6-588fc7967455 is DONE. 58.3 kB processed. Open Job" + "Query job 4b116e3e-d4d3-4eb6-9764-0a29a7c5d036 is DONE. 58.3 kB processed. Open Job" ], "text/plain": [ "" @@ -272,7 +272,7 @@ { "data": { "text/html": [ - "Query job bb8b3d13-a521-4d45-b4c8-5686c944a9f2 is DONE. 157.2 kB processed. Open Job" + "Query job d62ac4f0-47c9-47ae-8611-c9ecf78f20c9 is DONE. 157.2 kB processed. Open Job" ], "text/plain": [ "" @@ -284,7 +284,7 @@ { "data": { "text/html": [ - "Query job 2a4653f5-cc6b-4279-a45e-40f0f97090a7 is DONE. 98.8 kB processed. Open Job" + "Query job 5f876ebb-2d95-4c68-9d84-947e02b37bad is DONE. 98.8 kB processed. 
Open Job" ], "text/plain": [ "" @@ -369,7 +369,7 @@ "654 Astros Angels 143 medium" ] }, - "execution_count": 32, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -396,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 25, "metadata": { "id": "2UEmTbu4znyS" }, @@ -409,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 26, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -422,7 +422,7 @@ { "data": { "text/html": [ - "Query job 5d914fde-81ec-46eb-9219-9822f77dd9a2 is DONE. 0 Bytes processed. Open Job" + "Query job 1909a652-5735-401b-8a77-674d8539ded0 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -435,7 +435,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-f3231b74ec807496f4894218d5d40ed5-688mx7hi' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_f3231b74ec807496f4894218d5d40ed5_688mx7hi'.\n" + "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-4191f0fce98d46cc09359de47e203236-e009' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_4191f0fce98d46cc09359de47e203236_e009'.\n" ] } ], @@ -454,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -467,7 +467,7 @@ { "data": { "text/html": [ - "Query job b0b39944-1e69-4185-97ba-985178ee241f is DONE. 58.3 kB processed. Open Job" + "Query job a942bdc5-6a6d-4db8-b2aa-a556197377b3 is DONE. 58.3 kB processed. Open Job" ], "text/plain": [ "" @@ -479,7 +479,7 @@ { "data": { "text/html": [ - "Query job 90d99515-eb5e-4bcd-bce5-292eea09770e is DONE. 147.7 kB processed. Open Job" + "Query job 175ae9d3-604f-495b-a167-8b06c0283bd2 is DONE. 147.7 kB processed. Open Job" ], "text/plain": [ "" @@ -491,7 +491,7 @@ { "data": { "text/html": [ - "Query job eb31d033-c871-49c5-a75e-4427e376516f is DONE. 89.3 kB processed. Open Job" + "Query job d331a785-e574-45c9-86c8-d29ddd79a4d1 is DONE. 89.3 kB processed. Open Job" ], "text/plain": [ "" @@ -576,7 +576,7 @@ "654 Astros Angels 143 M" ] }, - "execution_count": 35, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -607,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 28, "metadata": { "id": "zlQfhcW41uzM" }, @@ -618,7 +618,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -631,7 +631,7 @@ { "data": { "text/html": [ - "Query job 2895676f-d15c-40fd-8cf2-3a0436291e6b is DONE. 0 Bytes processed. Open Job" + "Query job bbc0b78f-bc04-4bd5-b711-399786a51519 is DONE. 0 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -644,7 +644,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-9b20b0257558a42da610d8998022c25e-7k62x9l6' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_9b20b0257558a42da610d8998022c25e_7k62x9l6'.\n" + "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-cf31fc2d2c7fe111afa5526f5a9cdf06-gmmo' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_cf31fc2d2c7fe111afa5526f5a9cdf06_gmmo'.\n" ] } ], @@ -659,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -672,7 +672,7 @@ { "data": { "text/html": [ - "Query job 4efda755-2f54-4477-b48a-4a424c888559 is DONE. 58.3 kB processed. Open Job" + "Query job 991b54ed-9eaa-450f-9208-3e73404bb112 is DONE. 58.3 kB processed. Open Job" ], "text/plain": [ "" @@ -684,7 +684,7 @@ { "data": { "text/html": [ - "Query job a8992776-c2e8-4c3e-ab75-dfc01c5de89f is DONE. 150.1 kB processed. Open Job" + "Query job 4e464a58-ac5b-42fd-91e3-92c115bdd273 is DONE. 150.1 kB processed. Open Job" ], "text/plain": [ "" @@ -696,7 +696,7 @@ { "data": { "text/html": [ - "Query job 3ea299b0-27ad-432b-8dbf-81da3aae884f is DONE. 91.7 kB processed. Open Job" + "Query job d340f55d-1511-431a-970d-a70ed4356935 is DONE. 91.7 kB processed. Open Job" ], "text/plain": [ "" @@ -781,7 +781,7 @@ "654 Astros Angels 143 3h" ] }, - "execution_count": 38, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -812,7 +812,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 31, "metadata": { "id": "0G91fWiF3pKg" }, @@ -829,7 +829,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -842,7 +842,7 @@ { "data": { "text/html": [ - "Query job 411853db-bf83-4df8-af78-55b1ceb39cb1 is DONE. 0 Bytes processed. Open Job" + "Query job 10d1afa3-349b-49a8-adbd-79a8309ce77c is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -855,7 +855,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-b54aa0aa752af6a3bd6d9d529dac373b-h4lgpy4y' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_b54aa0aa752af6a3bd6d9d529dac373b_h4lgpy4y'.\n" + "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-3c03836c2044bf625d02e25ccdbfe101-k1m4' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_3c03836c2044bf625d02e25ccdbfe101_k1m4'.\n" ] } ], @@ -870,7 +870,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 33, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -883,7 +883,7 @@ { "data": { "text/html": [ - "Query job d04abfa5-e2f2-4936-a708-ed97ef429df3 is DONE. 58.3 kB processed. Open Job" + "Query job 33aff336-48d6-4caa-8cae-f459d21b180e is DONE. 58.3 kB processed. Open Job" ], "text/plain": [ "" @@ -895,7 +895,7 @@ { "data": { "text/html": [ - "Query job 2fc4edf0-7a86-4532-b8fb-bd3f5d153dcb is DONE. 157.4 kB processed. Open Job" + "Query job 561e0aa7-3962-4ef3-b308-a117a0ac3a7d is DONE. 157.4 kB processed. 
Open Job" ], "text/plain": [ "" @@ -907,7 +907,7 @@ { "data": { "text/html": [ - "Query job f7e6e18c-70d7-4b4e-926a-03b3a1abd1fe is DONE. 99.0 kB processed. Open Job" + "Query job 759dccf8-3d88-40e1-a38a-2a2064e1d269 is DONE. 99.0 kB processed. Open Job" ], "text/plain": [ "" @@ -992,7 +992,7 @@ "654 Astros Angels 143 3 hrs" ] }, - "execution_count": 41, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -1018,7 +1018,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1031,7 +1031,7 @@ { "data": { "text/html": [ - "Query job c674e7b7-2349-4317-8f08-8bfd9aa99785 is DONE. 0 Bytes processed. Open Job" + "Query job e2a44878-2564-44a5-8dec-b7ea2f42afd4 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1057,7 +1057,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 35, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1070,7 +1070,7 @@ { "data": { "text/html": [ - "Query job eb9384c9-de7d-4232-bdca-94b61b50ff89 is DONE. 60.5 kB processed. Open Job" + "Query job bcfab000-ca19-4633-bf0e-45e7d053f3eb is DONE. 60.5 kB processed. Open Job" ], "text/plain": [ "" @@ -1082,7 +1082,7 @@ { "data": { "text/html": [ - "Query job 11a736a5-96d1-4e62-90e2-576156131a94 is DONE. 388.3 kB processed. Open Job" + "Query job 139a6449-c07e-41ff-9aed-c6fdd633740a is DONE. 388.3 kB processed. Open Job" ], "text/plain": [ "" @@ -1094,7 +1094,7 @@ { "data": { "text/html": [ - "Query job c66a9ad1-60f7-4af1-ad7c-65e4eecbb035 is DONE. 330.0 kB processed. Open Job" + "Query job 035fa2fb-0a55-4358-bb50-3ef915f5bf54 is DONE. 330.0 kB processed. Open Job" ], "text/plain": [ "" @@ -1132,61 +1132,61 @@ " \n", " \n", " \n", - " 719\n", - " Astros\n", - " Angels\n", - " 180\n", - " gAAAAABmflbKCFygsmoTzFkUCObFSBJG29Ksk8HEtk82ib...\n", + " 641\n", + " American League\n", + " National League\n", + " 185\n", + " gAAAAABmo0n2I391cbYwIYeg8lyJq1MSFZatrtpvuUD5v-...\n", " \n", " \n", - " 2295\n", - " Astros\n", + " 349\n", " Angels\n", - " 204\n", - " gAAAAABmflbKv-XzIxcNS92RO4fXYIAwA0kGWsAy-tI5fm...\n", + " Astros\n", + " 187\n", + " gAAAAABmo0n2pX-siRwl2tIZA4m--swndC_b7vgGXrqSNM...\n", " \n", " \n", - " 1126\n", - " Astros\n", + " 2349\n", " Angels\n", - " 176\n", - " gAAAAABmflbJdjgpqnfvmklU7Zg3NJUqlTMYMs44dLEkwg...\n", + " Astros\n", + " 160\n", + " gAAAAABmo0n28Q9RwH62HvYRhTDpQ9lo8c6G8F5bnn7wgF...\n", " \n", " \n", - " 294\n", - " Astros\n", + " 557\n", " Angels\n", - " 189\n", - " gAAAAABmflbKmfBh4P3FnwyiIpVFek9TzF4GzwP_5rQmkv...\n", + " Astros\n", + " 166\n", + " gAAAAABmo0n2YlwHlSGQ0_XvXd-QVBtB_Lq2zUifu7vKhg...\n", " \n", " \n", - " 351\n", - " Astros\n", + " 220\n", " Angels\n", - " 212\n", - " gAAAAABmflbJ_mzqao9i7BtoYlMpb6y3bV3x7-cYuWGxsT...\n", + " Astros\n", + " 162\n", + " gAAAAABmo0n2l8HMSGKYizxfEmRvGQy96mrjwx734-Rl_Z...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " homeTeamName awayTeamName duration_minutes \\\n", - "719 Astros Angels 180 \n", - "2295 Astros Angels 204 \n", - "1126 Astros Angels 176 \n", - "294 Astros Angels 189 \n", - "351 Astros Angels 212 \n", + " homeTeamName awayTeamName duration_minutes \\\n", + "641 American League National League 185 \n", + "349 Angels Astros 187 \n", + "2349 Angels Astros 160 \n", + "557 Angels Astros 166 \n", + "220 Angels Astros 162 \n", "\n", " homeTeamNameRedacted \n", - "719 gAAAAABmflbKCFygsmoTzFkUCObFSBJG29Ksk8HEtk82ib... \n", - "2295 gAAAAABmflbKv-XzIxcNS92RO4fXYIAwA0kGWsAy-tI5fm... 
\n", - "1126 gAAAAABmflbJdjgpqnfvmklU7Zg3NJUqlTMYMs44dLEkwg... \n", - "294 gAAAAABmflbKmfBh4P3FnwyiIpVFek9TzF4GzwP_5rQmkv... \n", - "351 gAAAAABmflbJ_mzqao9i7BtoYlMpb6y3bV3x7-cYuWGxsT... " + "641 gAAAAABmo0n2I391cbYwIYeg8lyJq1MSFZatrtpvuUD5v-... \n", + "349 gAAAAABmo0n2pX-siRwl2tIZA4m--swndC_b7vgGXrqSNM... \n", + "2349 gAAAAABmo0n28Q9RwH62HvYRhTDpQ9lo8c6G8F5bnn7wgF... \n", + "557 gAAAAABmo0n2YlwHlSGQ0_XvXd-QVBtB_Lq2zUifu7vKhg... \n", + "220 gAAAAABmo0n2l8HMSGKYizxfEmRvGQy96mrjwx734-Rl_Z... " ] }, - "execution_count": 43, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -1211,7 +1211,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -1221,13 +1221,13 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 21b054a9-8fb2-418f-a17b-effdf5aba9b5 is DONE. 0 Bytes processed. Open Job" + "Query job af73ab2d-8d88-4cbe-863f-d35e48af84e1 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1240,7 +1240,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-0879f72acd9b8ede460b69c5a8cc0dcb-edxlst27' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_0879f72acd9b8ede460b69c5a8cc0dcb_edxlst27'.\n" + "Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-a5e21a4ad488ce8b90de19c3c8cd33b6-0ab2' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_a5e21a4ad488ce8b90de19c3c8cd33b6_0ab2'.\n" ] } ], @@ -1255,13 +1255,13 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job d67b7cb9-9813-4863-99d1-01cf45ab4949 is DONE. 58.3 kB processed. Open Job" + "Query job 0a9ac329-619d-4303-8dbd-176a576d4ce8 is DONE. 58.3 kB processed. Open Job" ], "text/plain": [ "" @@ -1273,7 +1273,7 @@ { "data": { "text/html": [ - "Query job 579ba853-a7b8-49df-9539-bf22f08d2370 is DONE. 162.2 kB processed. Open Job" + "Query job 456bb9b4-0576-4c04-b707-4a04496aa538 is DONE. 162.2 kB processed. Open Job" ], "text/plain": [ "" @@ -1285,7 +1285,7 @@ { "data": { "text/html": [ - "Query job 72f9eb5d-1c1a-4ce8-8f2f-1f5a8f7cec99 is DONE. 103.9 kB processed. Open Job" + "Query job 37f59939-5d2c-4fb1-839b-282ae3702d3d is DONE. 103.9 kB processed. 
Open Job" ], "text/plain": [ "" @@ -1370,7 +1370,7 @@ "654 Astros Angels 143 2 hours" ] }, - "execution_count": 46, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } From a7d7197a32c55b989ae4ea8f6cf6e1c0f7184cd4 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:53:53 -0700 Subject: [PATCH 35/36] feat: add streaming.StreamingDataFrame class (#864) * feat: add StreamingDataFrame support * use setattr for properties * fix bug * read session from DF * fix docs and tests * fix test * add preview warning * resolve comments * move to streaming.read_gbq_table, add logger * fix unit test * fix doc test * update notebook * add back preview warning --- bigframes/core/blocks.py | 6 +- bigframes/dataframe.py | 8 +- bigframes/session/__init__.py | 51 +- bigframes/streaming/__init__.py | 256 +-------- bigframes/streaming/dataframe.py | 504 +++++++++++++++++ notebooks/streaming/streaming_dataframe.ipynb | 535 ++++++++++++++++++ tests/system/conftest.py | 8 + tests/system/large/test_streaming.py | 38 +- tests/unit/test_pandas.py | 2 + 9 files changed, 1134 insertions(+), 274 deletions(-) create mode 100644 bigframes/streaming/dataframe.py create mode 100644 notebooks/streaming/streaming_dataframe.ipynb diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index fd0c9c9539..1b7b231403 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2354,7 +2354,7 @@ def is_monotonic_decreasing( return self._is_monotonic(column_id, increasing=False) def to_sql_query( - self, include_index: bool + self, include_index: bool, enable_cache: bool = True ) -> typing.Tuple[str, list[str], list[Label]]: """ Compiles this DataFrame's expression tree to SQL, optionally @@ -2388,7 +2388,9 @@ def to_sql_query( # the BigQuery unicode column name feature? substitutions[old_id] = new_id - sql = self.session._to_sql(array_value, col_id_overrides=substitutions) + sql = self.session._to_sql( + array_value, col_id_overrides=substitutions, enable_cache=enable_cache + ) return ( sql, new_ids[: len(idx_labels)], diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5d4918c3ce..9789c7cf9f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -105,6 +105,8 @@ def guarded_meth(df: DataFrame, *args, **kwargs): @log_adapter.class_logger class DataFrame(vendored_pandas_frame.DataFrame): __doc__ = vendored_pandas_frame.DataFrame.__doc__ + # internal flag to disable cache at all + _disable_cache_override: bool = False def __init__( self, @@ -367,7 +369,7 @@ def astype( return self._apply_unary_op(ops.AsTypeOp(to_type=dtype)) def _to_sql_query( - self, include_index: bool + self, include_index: bool, enable_cache: bool = True ) -> Tuple[str, list[str], list[blocks.Label]]: """Compiles this DataFrame's expression tree to SQL, optionally including index columns. @@ -381,7 +383,7 @@ def _to_sql_query( If include_index is set to False, index_column_id_list and index_column_label_list return empty lists. """ - return self._block.to_sql_query(include_index) + return self._block.to_sql_query(include_index, enable_cache=enable_cache) @property def sql(self) -> str: @@ -3628,6 +3630,8 @@ def _cached(self, *, force: bool = False) -> DataFrame: No-op if the dataframe represents a trivial transformation of an existing materialization. Force=True is used for BQML integration where need to copy data rather than use snapshot. 
""" + if self._disable_cache_override: + return self self._block.cached(force=force) return self diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index dfec83a56a..98cba867f2 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -107,6 +107,7 @@ import bigframes.core.indexes import bigframes.dataframe as dataframe import bigframes.series + import bigframes.streaming.dataframe as streaming_dataframe _BIGFRAMES_DEFAULT_CONNECTION_ID = "bigframes-default-connection" @@ -749,6 +750,38 @@ def read_gbq_table( filters=filters, ) + def read_gbq_table_streaming( + self, table: str + ) -> streaming_dataframe.StreamingDataFrame: + """Turn a BigQuery table into a StreamingDataFrame. + + Note: The bigframes.streaming module is a preview feature, and subject to change. + + **Examples:** + + >>> import bigframes.streaming as bst + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> sdf = bst.read_gbq_table("bigquery-public-data.ml_datasets.penguins") + """ + warnings.warn( + "The bigframes.streaming module is a preview feature, and subject to change.", + stacklevel=1, + category=bigframes.exceptions.PreviewWarning, + ) + + import bigframes.streaming.dataframe as streaming_dataframe + + df = self._read_gbq_table( + table, + api_name="read_gbq_table_steaming", + enable_snapshot=False, + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + + return streaming_dataframe.StreamingDataFrame._from_table_df(df) + def _read_gbq_table( self, query: str, @@ -759,6 +792,7 @@ def _read_gbq_table( api_name: str, use_cache: bool = True, filters: third_party_pandas_gbq.FiltersType = (), + enable_snapshot: bool = True, ) -> dataframe.DataFrame: import bigframes.dataframe as dataframe @@ -877,7 +911,7 @@ def _read_gbq_table( else (*columns, *[col for col in index_cols if col not in columns]) ) - supports_snapshot = bf_read_gbq_table.validate_table( + enable_snapshot = enable_snapshot and bf_read_gbq_table.validate_table( self.bqclient, table_ref, all_columns, time_travel_timestamp, filter_str ) @@ -905,7 +939,7 @@ def _read_gbq_table( table, schema=schema, predicate=filter_str, - at_time=time_travel_timestamp if supports_snapshot else None, + at_time=time_travel_timestamp if enable_snapshot else None, primary_key=index_cols if is_index_unique else (), session=self, ) @@ -2056,17 +2090,20 @@ def _to_sql( offset_column: typing.Optional[str] = None, col_id_overrides: typing.Mapping[str, str] = {}, ordered: bool = False, + enable_cache: bool = True, ) -> str: if offset_column: array_value = array_value.promote_offsets(offset_column) - node_w_cached = self._with_cached_executions(array_value.node) + node = ( + self._with_cached_executions(array_value.node) + if enable_cache + else array_value.node + ) if ordered: return self._compiler.compile_ordered( - node_w_cached, col_id_overrides=col_id_overrides + node, col_id_overrides=col_id_overrides ) - return self._compiler.compile_unordered( - node_w_cached, col_id_overrides=col_id_overrides - ) + return self._compiler.compile_unordered(node, col_id_overrides=col_id_overrides) def _get_table_size(self, destination_table): table = self.bqclient.get_table(destination_table) diff --git a/bigframes/streaming/__init__.py b/bigframes/streaming/__init__.py index 0b6fd18561..66f345f0ab 100644 --- a/bigframes/streaming/__init__.py +++ b/bigframes/streaming/__init__.py @@ -12,253 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Module for bigquery continuous queries""" +import inspect -import json -from typing import Optional -import warnings +import bigframes.core.global_session as global_session +import bigframes.pandas as bpd +import bigframes.session +import bigframes.streaming.dataframe as streaming_dataframe -from google.cloud import bigquery -import bigframes - - -def to_bigtable( - query: str, - *, - instance: str, - table: str, - service_account_email: Optional[str] = None, - session: Optional[bigframes.Session] = None, - app_profile: Optional[str] = None, - truncate: bool = False, - overwrite: bool = False, - auto_create_column_families: bool = False, - bigtable_options: Optional[dict] = None, - job_id: Optional[str] = None, - job_id_prefix: Optional[str] = None, -) -> bigquery.QueryJob: - """Launches a BigQuery continuous query and returns a - QueryJob object for some management functionality. - - This method requires an existing bigtable preconfigured to - accept the continuous query export statement. For instructions - on export to bigtable, see - https://cloud.google.com/bigquery/docs/export-to-bigtable. - - Args: - query (str): - The sql statement to execute as a continuous function. - For example: "SELECT * FROM dataset.table" - This will be wrapped in an EXPORT DATA statement to - launch a continuous query writing to bigtable. - instance (str): - The name of the bigtable instance to export to. - table (str): - The name of the bigtable table to export to. - service_account_email (str): - Full name of the service account to run the continuous query. - Example: accountname@projectname.gserviceaccounts.com - If not provided, the user account will be used, but this - limits the lifetime of the continuous query. - session (bigframes.Session, default None): - The session object to use for the query. This determines - the project id and location of the query. If None, will - default to the bigframes global session. - app_profile (str, default None): - The bigtable app profile to export to. If None, no app - profile will be used. - truncate (bool, default False): - The export truncate option, see - https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option - overwrite (bool, default False): - The export overwrite option, see - https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option - auto_create_column_families (bool, default False): - The auto_create_column_families option, see - https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option - bigtable_options (dict, default None): - The bigtable options dict, which will be converted to JSON - using json.dumps, see - https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option - If None, no bigtable_options parameter will be passed. 
- job_id (str, default None): - If specified, replace the default job id for the query, - see job_id parameter of - https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query - job_id_prefix (str, default None): - If specified, a job id prefix for the query, see - job_id_prefix parameter of - https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query - - Returns: - google.cloud.bigquery.QueryJob: - See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob - The ongoing query job can be managed using this object. - For example, the job can be cancelled or its error status - can be examined. - """ - warnings.warn( - "The bigframes.streaming module is a preview feature, and subject to change.", - stacklevel=1, - category=bigframes.exceptions.PreviewWarning, - ) - - # get default client if not passed - if session is None: - session = bigframes.get_global_session() - bq_client = session.bqclient - - # build export string from parameters - project = bq_client.project - - app_profile_url_string = "" - if app_profile is not None: - app_profile_url_string = f"appProfiles/{app_profile}/" - - bigtable_options_parameter_string = "" - if bigtable_options is not None: - bigtable_options_parameter_string = ( - 'bigtable_options = """' + json.dumps(bigtable_options) + '""",\n' - ) - - sql = ( - "EXPORT DATA\n" - "OPTIONS (\n" - "format = 'CLOUD_BIGTABLE',\n" - f"{bigtable_options_parameter_string}" - f"truncate = {str(truncate)},\n" - f"overwrite = {str(overwrite)},\n" - f"auto_create_column_families = {str(auto_create_column_families)},\n" - f'uri = "https://bigtable.googleapis.com/projects/{project}/instances/{instance}/{app_profile_url_string}tables/{table}"\n' - ")\n" - "AS (\n" - f"{query});" - ) - - # override continuous http parameter - job_config = bigquery.job.QueryJobConfig() - - job_config_dict: dict = {"query": {"continuous": True}} - if service_account_email is not None: - job_config_dict["query"]["connectionProperties"] = { - "key": "service_account", - "value": service_account_email, - } - job_config_filled = job_config.from_api_repr(job_config_dict) - job_config_filled.labels = {"bigframes-api": "streaming_to_bigtable"} - - # begin the query job - query_job = bq_client.query( - sql, - job_config=job_config_filled, # type:ignore - # typing error above is in bq client library - # (should accept abstract job_config, only takes concrete) - job_id=job_id, - job_id_prefix=job_id_prefix, - ) - - # return the query job to the user for lifetime management - return query_job - - -def to_pubsub( - query: str, - *, - topic: str, - service_account_email: str, - session: Optional[bigframes.Session] = None, - job_id: Optional[str] = None, - job_id_prefix: Optional[str] = None, -) -> bigquery.QueryJob: - """Launches a BigQuery continuous query and returns a - QueryJob object for some management functionality. - - This method requires an existing pubsub topic. For instructions - on creating a pubsub topic, see - https://cloud.google.com/pubsub/docs/samples/pubsub-quickstart-create-topic?hl=en - - Note that a service account is a requirement for continuous queries - exporting to pubsub. - - Args: - query (str): - The sql statement to execute as a continuous function. 
- For example: "SELECT * FROM dataset.table" - This will be wrapped in an EXPORT DATA statement to - launch a continuous query writing to pubsub. - topic (str): - The name of the pubsub topic to export to. - For example: "taxi-rides" - service_account_email (str): - Full name of the service account to run the continuous query. - Example: accountname@projectname.gserviceaccounts.com - session (bigframes.Session, default None): - The session object to use for the query. This determines - the project id and location of the query. If None, will - default to the bigframes global session. - job_id (str, default None): - If specified, replace the default job id for the query, - see job_id parameter of - https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query - job_id_prefix (str, default None): - If specified, a job id prefix for the query, see - job_id_prefix parameter of - https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query - - Returns: - google.cloud.bigquery.QueryJob: - See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob - The ongoing query job can be managed using this object. - For example, the job can be cancelled or its error status - can be examined. - """ - warnings.warn( - "The bigframes.streaming module is a preview feature, and subject to change.", - stacklevel=1, - category=bigframes.exceptions.PreviewWarning, +def read_gbq_table(table: str) -> streaming_dataframe.StreamingDataFrame: + bpd._set_default_session_location_if_possible(table) + return global_session.with_default_session( + bigframes.session.Session.read_gbq_table_streaming, table ) - # get default client if not passed - if session is None: - session = bigframes.get_global_session() - bq_client = session.bqclient - - # build export string from parameters - sql = ( - "EXPORT DATA\n" - "OPTIONS (\n" - "format = 'CLOUD_PUBSUB',\n" - f'uri = "https://pubsub.googleapis.com/projects/{bq_client.project}/topics/{topic}"\n' - ")\n" - "AS (\n" - f"{query});" - ) - # override continuous http parameter - job_config = bigquery.job.QueryJobConfig() - job_config_filled = job_config.from_api_repr( - { - "query": { - "continuous": True, - "connectionProperties": { - "key": "service_account", - "value": service_account_email, - }, - } - } - ) - job_config_filled.labels = {"bigframes-api": "streaming_to_pubsub"} - - # begin the query job - query_job = bq_client.query( - sql, - job_config=job_config_filled, # type:ignore - # typing error above is in bq client library - # (should accept abstract job_config, only takes concrete) - job_id=job_id, - job_id_prefix=job_id_prefix, - ) +read_gbq_table.__doc__ = inspect.getdoc( + bigframes.session.Session.read_gbq_table_streaming +) - # return the query job to the user for lifetime management - return query_job +StreamingDataFrame = streaming_dataframe.StreamingDataFrame diff --git a/bigframes/streaming/dataframe.py b/bigframes/streaming/dataframe.py new file mode 100644 index 0000000000..64a4898c57 --- /dev/null +++ b/bigframes/streaming/dataframe.py @@ -0,0 +1,504 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module for bigquery continuous queries""" +from __future__ import annotations + +import functools +import inspect +import json +from typing import Optional +import warnings + +from google.cloud import bigquery + +import bigframes +from bigframes import dataframe +from bigframes.core import log_adapter + + +def _return_type_wrapper(method, cls): + @functools.wraps(method) + def wrapper(*args, **kwargs): + return_value = method(*args, **kwargs) + if isinstance(return_value, dataframe.DataFrame): + return cls._from_table_df(return_value) + return return_value + + return wrapper + + +def _curate_df_doc(doc: Optional[str]): + if not doc: + return doc + + # Remove examples, some are not applicable to StreamingDataFrame + doc = doc[: doc.find("**Examples:**")] + doc[doc.find("Args:") :] + + doc = doc.replace("dataframe.DataFrame", "streaming.StreamingDataFrame") + doc = doc.replace(" DataFrame", " StreamingDataFrame") + + return doc + + +class StreamingBase: + sql: str + _session: bigframes.Session + + def to_bigtable( + self, + *, + instance: str, + table: str, + service_account_email: Optional[str] = None, + app_profile: Optional[str] = None, + truncate: bool = False, + overwrite: bool = False, + auto_create_column_families: bool = False, + bigtable_options: Optional[dict] = None, + job_id: Optional[str] = None, + job_id_prefix: Optional[str] = None, + ) -> bigquery.QueryJob: + """ + Export the StreamingDataFrame as a continue job and returns a + QueryJob object for some management functionality. + + This method requires an existing bigtable preconfigured to + accept the continuous query export statement. For instructions + on export to bigtable, see + https://cloud.google.com/bigquery/docs/export-to-bigtable. + + Args: + instance (str): + The name of the bigtable instance to export to. + table (str): + The name of the bigtable table to export to. + service_account_email (str): + Full name of the service account to run the continuous query. + Example: accountname@projectname.gserviceaccounts.com + If not provided, the user account will be used, but this + limits the lifetime of the continuous query. + app_profile (str, default None): + The bigtable app profile to export to. If None, no app + profile will be used. + truncate (bool, default False): + The export truncate option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + overwrite (bool, default False): + The export overwrite option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + auto_create_column_families (bool, default False): + The auto_create_column_families option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + bigtable_options (dict, default None): + The bigtable options dict, which will be converted to JSON + using json.dumps, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + If None, no bigtable_options parameter will be passed. 
+ job_id (str, default None): + If specified, replace the default job id for the query, + see job_id parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + job_id_prefix (str, default None): + If specified, a job id prefix for the query, see + job_id_prefix parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + + Returns: + google.cloud.bigquery.QueryJob: + See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob + The ongoing query job can be managed using this object. + For example, the job can be cancelled or its error status + can be examined. + """ + return _to_bigtable( + self.sql, + instance=instance, + table=table, + service_account_email=service_account_email, + session=self._session, + app_profile=app_profile, + truncate=truncate, + overwrite=overwrite, + auto_create_column_families=auto_create_column_families, + bigtable_options=bigtable_options, + job_id=job_id, + job_id_prefix=job_id_prefix, + ) + + def to_pubsub( + self, + *, + topic: str, + service_account_email: str, + job_id: Optional[str] = None, + job_id_prefix: Optional[str] = None, + ) -> bigquery.QueryJob: + """ + Export the StreamingDataFrame as a continue job and returns a + QueryJob object for some management functionality. + + This method requires an existing pubsub topic. For instructions + on creating a pubsub topic, see + https://cloud.google.com/pubsub/docs/samples/pubsub-quickstart-create-topic?hl=en + + Note that a service account is a requirement for continuous queries + exporting to pubsub. + + Args: + topic (str): + The name of the pubsub topic to export to. + For example: "taxi-rides" + service_account_email (str): + Full name of the service account to run the continuous query. + Example: accountname@projectname.gserviceaccounts.com + job_id (str, default None): + If specified, replace the default job id for the query, + see job_id parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + job_id_prefix (str, default None): + If specified, a job id prefix for the query, see + job_id_prefix parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + + Returns: + google.cloud.bigquery.QueryJob: + See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob + The ongoing query job can be managed using this object. + For example, the job can be cancelled or its error status + can be examined. + """ + return _to_pubsub( + self.sql, + topic=topic, + service_account_email=service_account_email, + session=self._session, + job_id=job_id, + job_id_prefix=job_id_prefix, + ) + + +@log_adapter.class_logger +class StreamingDataFrame(StreamingBase): + __doc__ = _curate_df_doc(dataframe.DataFrame.__doc__) + + # Private constructor + _create_key = object() + + def __init__(self, df: dataframe.DataFrame, *, create_key=0): + if create_key is not StreamingDataFrame._create_key: + raise ValueError( + "StreamingDataFrame class shouldn't be created through constructor. Call bigframes.Session.read_gbq_table_streaming method to create." 
+ ) + self._df = df + self._df._disable_cache_override = True + + @classmethod + def _from_table_df(cls, df: dataframe.DataFrame) -> StreamingDataFrame: + return cls(df, create_key=cls._create_key) + + def __getitem__(self, *args, **kwargs): + return _return_type_wrapper(self._df.__getitem__, StreamingDataFrame)( + *args, **kwargs + ) + + __getitem__.__doc__ = _curate_df_doc( + inspect.getdoc(dataframe.DataFrame.__getitem__) + ) + + def __setitem__(self, *args, **kwargs): + return _return_type_wrapper(self._df.__setitem__, StreamingDataFrame)( + *args, **kwargs + ) + + __setitem__.__doc__ = _curate_df_doc( + inspect.getdoc(dataframe.DataFrame.__setitem__) + ) + + def rename(self, *args, **kwargs): + return _return_type_wrapper(self._df.rename, StreamingDataFrame)( + *args, **kwargs + ) + + rename.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.rename)) + + def __repr__(self, *args, **kwargs): + return _return_type_wrapper(self._df.__repr__, StreamingDataFrame)( + *args, **kwargs + ) + + __repr__.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.__repr__)) + + def _repr_html_(self, *args, **kwargs): + return _return_type_wrapper(self._df._repr_html_, StreamingDataFrame)( + *args, **kwargs + ) + + _repr_html_.__doc__ = _curate_df_doc( + inspect.getdoc(dataframe.DataFrame._repr_html_) + ) + + @property + def sql(self): + sql_str, _, _ = self._df._to_sql_query(include_index=False, enable_cache=False) + return sql_str + + sql.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.sql)) + + @property + def _session(self): + return self._df._session + + _session.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame._session)) + + +def _to_bigtable( + query: str, + *, + instance: str, + table: str, + service_account_email: Optional[str] = None, + session: Optional[bigframes.Session] = None, + app_profile: Optional[str] = None, + truncate: bool = False, + overwrite: bool = False, + auto_create_column_families: bool = False, + bigtable_options: Optional[dict] = None, + job_id: Optional[str] = None, + job_id_prefix: Optional[str] = None, +) -> bigquery.QueryJob: + """Launches a BigQuery continuous query and returns a + QueryJob object for some management functionality. + + This method requires an existing bigtable preconfigured to + accept the continuous query export statement. For instructions + on export to bigtable, see + https://cloud.google.com/bigquery/docs/export-to-bigtable. + + Args: + query (str): + The sql statement to execute as a continuous function. + For example: "SELECT * FROM dataset.table" + This will be wrapped in an EXPORT DATA statement to + launch a continuous query writing to bigtable. + instance (str): + The name of the bigtable instance to export to. + table (str): + The name of the bigtable table to export to. + service_account_email (str): + Full name of the service account to run the continuous query. + Example: accountname@projectname.gserviceaccounts.com + If not provided, the user account will be used, but this + limits the lifetime of the continuous query. + session (bigframes.Session, default None): + The session object to use for the query. This determines + the project id and location of the query. If None, will + default to the bigframes global session. + app_profile (str, default None): + The bigtable app profile to export to. If None, no app + profile will be used. 
+ truncate (bool, default False): + The export truncate option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + overwrite (bool, default False): + The export overwrite option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + auto_create_column_families (bool, default False): + The auto_create_column_families option, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + bigtable_options (dict, default None): + The bigtable options dict, which will be converted to JSON + using json.dumps, see + https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements#bigtable_export_option + If None, no bigtable_options parameter will be passed. + job_id (str, default None): + If specified, replace the default job id for the query, + see job_id parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + job_id_prefix (str, default None): + If specified, a job id prefix for the query, see + job_id_prefix parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + + Returns: + google.cloud.bigquery.QueryJob: + See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob + The ongoing query job can be managed using this object. + For example, the job can be cancelled or its error status + can be examined. + """ + warnings.warn( + "The bigframes.streaming module is a preview feature, and subject to change.", + stacklevel=1, + category=bigframes.exceptions.PreviewWarning, + ) + + # get default client if not passed + if session is None: + session = bigframes.get_global_session() + bq_client = session.bqclient + + # build export string from parameters + project = bq_client.project + + app_profile_url_string = "" + if app_profile is not None: + app_profile_url_string = f"appProfiles/{app_profile}/" + + bigtable_options_parameter_string = "" + if bigtable_options is not None: + bigtable_options_parameter_string = ( + 'bigtable_options = """' + json.dumps(bigtable_options) + '""",\n' + ) + + sql = ( + "EXPORT DATA\n" + "OPTIONS (\n" + "format = 'CLOUD_BIGTABLE',\n" + f"{bigtable_options_parameter_string}" + f"truncate = {str(truncate)},\n" + f"overwrite = {str(overwrite)},\n" + f"auto_create_column_families = {str(auto_create_column_families)},\n" + f'uri = "https://bigtable.googleapis.com/projects/{project}/instances/{instance}/{app_profile_url_string}tables/{table}"\n' + ")\n" + "AS (\n" + f"{query});" + ) + + # override continuous http parameter + job_config = bigquery.job.QueryJobConfig() + + job_config_dict: dict = {"query": {"continuous": True}} + if service_account_email is not None: + job_config_dict["query"]["connectionProperties"] = { + "key": "service_account", + "value": service_account_email, + } + job_config_filled = job_config.from_api_repr(job_config_dict) + job_config_filled.labels = {"bigframes-api": "streaming_to_bigtable"} + + # begin the query job + query_job = bq_client.query( + sql, + job_config=job_config_filled, # type:ignore + # typing error above is in bq client library + # (should accept abstract job_config, only takes concrete) + job_id=job_id, + job_id_prefix=job_id_prefix, + ) + + # return the query job to the user for lifetime management + return query_job 
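+
+# Typical use is through StreamingDataFrame.to_bigtable, which forwards here.
+# A minimal illustrative sketch (it mirrors the system test in this change and
+# assumes the referenced Bigtable instance and table already exist):
+#
+#   job = _to_bigtable(
+#       "SELECT body_mass_g, island AS rowkey FROM birds.penguins_bigtable_streaming",
+#       instance="streaming-testing-instance",
+#       table="table-testing",
+#       job_id_prefix="test_streaming_",
+#   )
+#   assert job.running()
+#   job.cancel()  # a continuous query runs until cancelled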
+ + +def _to_pubsub( + query: str, + *, + topic: str, + service_account_email: str, + session: Optional[bigframes.Session] = None, + job_id: Optional[str] = None, + job_id_prefix: Optional[str] = None, +) -> bigquery.QueryJob: + """Launches a BigQuery continuous query and returns a + QueryJob object for some management functionality. + + This method requires an existing pubsub topic. For instructions + on creating a pubsub topic, see + https://cloud.google.com/pubsub/docs/samples/pubsub-quickstart-create-topic?hl=en + + Note that a service account is a requirement for continuous queries + exporting to pubsub. + + Args: + query (str): + The sql statement to execute as a continuous function. + For example: "SELECT * FROM dataset.table" + This will be wrapped in an EXPORT DATA statement to + launch a continuous query writing to pubsub. + topic (str): + The name of the pubsub topic to export to. + For example: "taxi-rides" + service_account_email (str): + Full name of the service account to run the continuous query. + Example: accountname@projectname.gserviceaccounts.com + session (bigframes.Session, default None): + The session object to use for the query. This determines + the project id and location of the query. If None, will + default to the bigframes global session. + job_id (str, default None): + If specified, replace the default job id for the query, + see job_id parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + job_id_prefix (str, default None): + If specified, a job id prefix for the query, see + job_id_prefix parameter of + https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_query + + Returns: + google.cloud.bigquery.QueryJob: + See https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob + The ongoing query job can be managed using this object. + For example, the job can be cancelled or its error status + can be examined. 
+ """ + warnings.warn( + "The bigframes.streaming module is a preview feature, and subject to change.", + stacklevel=1, + category=bigframes.exceptions.PreviewWarning, + ) + + # get default client if not passed + if session is None: + session = bigframes.get_global_session() + bq_client = session.bqclient + + # build export string from parameters + sql = ( + "EXPORT DATA\n" + "OPTIONS (\n" + "format = 'CLOUD_PUBSUB',\n" + f'uri = "https://pubsub.googleapis.com/projects/{bq_client.project}/topics/{topic}"\n' + ")\n" + "AS (\n" + f"{query});" + ) + + # override continuous http parameter + job_config = bigquery.job.QueryJobConfig() + job_config_filled = job_config.from_api_repr( + { + "query": { + "continuous": True, + "connectionProperties": { + "key": "service_account", + "value": service_account_email, + }, + } + } + ) + job_config_filled.labels = {"bigframes-api": "streaming_to_pubsub"} + + # begin the query job + query_job = bq_client.query( + sql, + job_config=job_config_filled, # type:ignore + # typing error above is in bq client library + # (should accept abstract job_config, only takes concrete) + job_id=job_id, + job_id_prefix=job_id_prefix, + ) + + # return the query job to the user for lifetime management + return query_job diff --git a/notebooks/streaming/streaming_dataframe.ipynb b/notebooks/streaming/streaming_dataframe.ipynb new file mode 100644 index 0000000000..a2da30720d --- /dev/null +++ b/notebooks/streaming/streaming_dataframe.ipynb @@ -0,0 +1,535 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BigFrames StreamingDataFrame\n", + "bigframes.streaming.StreamingDataFrame is a special DataFrame type that allows simple operations and can create steaming jobs to BigTable and PubSub.\n", + "\n", + "In this notebook, we will:\n", + "* Create a StreamingDataFrame from a BigQuery table\n", + "* Do some opeartions like select, filter and preview the content\n", + "* Create and manage streaming jobs to both BigTable and Pubsub" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes\n", + "import bigframes.streaming as bst" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "bigframes.options._bigquery_options.project = \"bigframes-load-testing\"\n", + "job_id_prefix = \"test_streaming_\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create, select, filter and preview\n", + "Create the StreamingDataFrame from a BigQuery table, select certain columns, filter rows and preview the output" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. Null Index is a preview feature.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "sdf = bst.read_gbq_table(\"birds.penguins_bigtable_streaming\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. 
Null Index is a preview feature.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "Query job d57200dd-e6f1-42c7-876b-7f4a54994ae6 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. Null Index is a preview feature.\n", + " warnings.warn(\n", + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. Null Index is a preview feature.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 1decce4a-eb32-49f4-8e47-7bda0220037a is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
speciesrowkeybody_mass_g
0Adelie Penguin (Pygoscelis adeliae)Torgersen3875
1Adelie Penguin (Pygoscelis adeliae)Torgersen2900
2Adelie Penguin (Pygoscelis adeliae)Biscoe3725
3Adelie Penguin (Pygoscelis adeliae)Dream2975
4Adelie Penguin (Pygoscelis adeliae)Torgersen3050
5Chinstrap penguin (Pygoscelis antarctica)Dream2700
6Adelie Penguin (Pygoscelis adeliae)Dream3900
7Adelie Penguin (Pygoscelis adeliae)Biscoe3825
8Chinstrap penguin (Pygoscelis antarctica)Dream3775
9Adelie Penguin (Pygoscelis adeliae)Dream3350
10Adelie Penguin (Pygoscelis adeliae)Biscoe3900
11Adelie Penguin (Pygoscelis adeliae)Torgersen3650
12Adelie Penguin (Pygoscelis adeliae)Biscoe3200
13Chinstrap penguin (Pygoscelis antarctica)Dream3650
14Adelie Penguin (Pygoscelis adeliae)Dream3700
15Chinstrap penguin (Pygoscelis antarctica)Dream3800
16Chinstrap penguin (Pygoscelis antarctica)Dream3950
17Chinstrap penguin (Pygoscelis antarctica)Dream3350
18Adelie Penguin (Pygoscelis adeliae)Dream3100
19Chinstrap penguin (Pygoscelis antarctica)Dream3750
20Adelie Penguin (Pygoscelis adeliae)Biscoe3550
21Chinstrap penguin (Pygoscelis antarctica)Dream3400
22Adelie Penguin (Pygoscelis adeliae)Torgersen3450
23Adelie Penguin (Pygoscelis adeliae)Torgersen3600
24Chinstrap penguin (Pygoscelis antarctica)Dream3650
\n", + "

25 rows × 3 columns

\n", + "
[165 rows x 3 columns in total]" + ], + "text/plain": [ + " species rowkey body_mass_g\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3875\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 2900\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3725\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 2975\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3050\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 2700\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3900\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3825\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3775\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3350\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3900\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3650\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3200\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3650\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3700\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3800\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3950\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3350\n", + " Adelie Penguin (Pygoscelis adeliae) Dream 3100\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3750\n", + " Adelie Penguin (Pygoscelis adeliae) Biscoe 3550\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3400\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3450\n", + " Adelie Penguin (Pygoscelis adeliae) Torgersen 3600\n", + "Chinstrap penguin (Pygoscelis antarctica) Dream 3650\n", + "...\n", + "\n", + "[165 rows x 3 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf = sdf[[\"species\", \"island\", \"body_mass_g\"]]\n", + "sdf = sdf[sdf[\"body_mass_g\"] < 4000]\n", + "# BigTable needs a rowkey column\n", + "sdf = sdf.rename(columns={\"island\": \"rowkey\"})\n", + "print(type(sdf))\n", + "sdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BigTable\n", + "Create BigTable streaming job" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:338: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "job = sdf.to_bigtable(instance=\"streaming-testing-instance\",\n", + " table=\"garrettwu-no-col-family\",\n", + " service_account_email=\"streaming-testing-admin@bigframes-load-testing.iam.gserviceaccount.com\",\n", + " app_profile=None,\n", + " truncate=True,\n", + " overwrite=True,\n", + " auto_create_column_families=True,\n", + " bigtable_options={},\n", + " job_id=None,\n", + " job_id_prefix=job_id_prefix,)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "None\n" + ] + } + ], + "source": [ + "print(job.running())\n", + "print(job.error_result)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "job.cancel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PubSub\n", + "Create Pubsub streaming job" + ] + }, + { + "cell_type": "code", + 
"execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/blocks.py:126: NullIndexPreviewWarning: Creating object with Null Index. Null Index is a preview feature.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "sdf = sdf[[\"rowkey\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/streaming/dataframe.py:453: PreviewWarning: The bigframes.streaming module is a preview feature, and subject to change.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "job = sdf.to_pubsub(\n", + " topic=\"penguins\",\n", + " service_account_email=\"streaming-testing@bigframes-load-testing.iam.gserviceaccount.com\",\n", + " job_id=None,\n", + " job_id_prefix=job_id_prefix,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "None\n" + ] + } + ], + "source": [ + "print(job.running())\n", + "print(job.error_result)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "job.cancel()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 3acae0e75b..6bd7bf9348 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -143,6 +143,14 @@ def session() -> Generator[bigframes.Session, None, None]: session.close() # close generated session at cleanup time +@pytest.fixture(scope="session") +def session_load() -> Generator[bigframes.Session, None, None]: + context = bigframes.BigQueryOptions(location="US", project="bigframes-load-testing") + session = bigframes.Session(context=context) + yield session + session.close() # close generated session at cleanup time + + @pytest.fixture(scope="session", params=["ordered", "unordered"]) def maybe_ordered_session(request) -> Generator[bigframes.Session, None, None]: context = bigframes.BigQueryOptions(location="US", ordering_mode="partial") diff --git a/tests/system/large/test_streaming.py b/tests/system/large/test_streaming.py index 2debc09994..391aec8533 100644 --- a/tests/system/large/test_streaming.py +++ b/tests/system/large/test_streaming.py @@ -14,20 +14,20 @@ import time -import pytest - +import bigframes import bigframes.streaming -@pytest.mark.skip(reason="b/354024943. 
Concurrency error need to be fixed.") -def test_streaming_to_bigtable(): +def test_streaming_df_to_bigtable(session_load: bigframes.Session): # launch a continuous query job_id_prefix = "test_streaming_" - sql = """SELECT - body_mass_g, island as rowkey - FROM birds.penguins_bigtable_streaming""" - query_job = bigframes.streaming.to_bigtable( - sql, + sdf = session_load.read_gbq_table_streaming("birds.penguins_bigtable_streaming") + + sdf = sdf[["species", "island", "body_mass_g"]] + sdf = sdf[sdf["body_mass_g"] < 4000] + sdf = sdf.rename(columns={"island": "rowkey"}) + + query_job = sdf.to_bigtable( instance="streaming-testing-instance", table="table-testing", service_account_email="streaming-testing@bigframes-load-testing.iam.gserviceaccount.com", @@ -44,23 +44,22 @@ def test_streaming_to_bigtable(): # wait 100 seconds in order to ensure the query doesn't stop # (i.e. it is continuous) time.sleep(100) - assert query_job.error_result is None - assert query_job.errors is None assert query_job.running() + assert query_job.error_result is None assert str(query_job.job_id).startswith(job_id_prefix) finally: query_job.cancel() -@pytest.mark.skip(reason="b/354024943. Concurrency error need to be fixed.") -def test_streaming_to_pubsub(): +def test_streaming_df_to_pubsub(session_load: bigframes.Session): # launch a continuous query job_id_prefix = "test_streaming_pubsub_" - sql = """SELECT - island - FROM birds.penguins_pubsub_streaming""" - query_job = bigframes.streaming.to_pubsub( - sql, + sdf = session_load.read_gbq_table_streaming("birds.penguins_bigtable_streaming") + + sdf = sdf[sdf["body_mass_g"] < 4000] + sdf = sdf[["island"]] + + query_job = sdf.to_pubsub( topic="penguins", service_account_email="streaming-testing@bigframes-load-testing.iam.gserviceaccount.com", job_id=None, @@ -71,9 +70,8 @@ def test_streaming_to_pubsub(): # wait 100 seconds in order to ensure the query doesn't stop # (i.e. 
it is continuous) time.sleep(100) - assert query_job.error_result is None - assert query_job.errors is None assert query_job.running() + assert query_job.error_result is None assert str(query_job.job_id).startswith(job_id_prefix) finally: query_job.cancel() diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 408590d4bb..1ee52c08a1 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -35,6 +35,8 @@ def all_session_methods(): if not attribute.startswith("_") ) session_attributes.remove("close") + # streaming isn't in pandas + session_attributes.remove("read_gbq_table_streaming") for attribute in sorted(session_attributes): session_method = getattr(bigframes.session.Session, attribute) From 8e00fe2878aceeb222b4d67bf6520dd4544807c6 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Wed, 31 Jul 2024 15:11:09 -0700 Subject: [PATCH 36/36] chore(main): release 1.12.0 (#834) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 27 +++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8249515719..354c356c7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,33 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.12.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.11.1...v1.12.0) (2024-07-31) + + +### Features + +* Add bigframes-mode label to query jobs ([#832](https://github.com/googleapis/python-bigquery-dataframes/issues/832)) ([c9eaff0](https://github.com/googleapis/python-bigquery-dataframes/commit/c9eaff0a1a0731b28f4c67bca5606db12a47c8c0)) +* Add config option to set partial ordering mode ([#855](https://github.com/googleapis/python-bigquery-dataframes/issues/855)) ([823c0ce](https://github.com/googleapis/python-bigquery-dataframes/commit/823c0ce57611c0918a9e9999638d7393337fe9af)) +* Add stratify param support to ml.model_selection.train_test_split method ([#815](https://github.com/googleapis/python-bigquery-dataframes/issues/815)) ([27f8631](https://github.com/googleapis/python-bigquery-dataframes/commit/27f8631be81a3e136cfeb8904558bb4f3f5caa05)) +* Add streaming.StreamingDataFrame class ([#864](https://github.com/googleapis/python-bigquery-dataframes/issues/864)) ([a7d7197](https://github.com/googleapis/python-bigquery-dataframes/commit/a7d7197a32c55b989ae4ea8f6cf6e1c0f7184cd4)) +* Allow DataFrame.join for self-join on Null index ([#860](https://github.com/googleapis/python-bigquery-dataframes/issues/860)) ([e950533](https://github.com/googleapis/python-bigquery-dataframes/commit/e95053372c36ea5a91a2d7295c1a3a3671181670)) +* Support remote function cleanup with `session.close` ([#818](https://github.com/googleapis/python-bigquery-dataframes/issues/818)) ([ed06436](https://github.com/googleapis/python-bigquery-dataframes/commit/ed06436612c0d46f190f79721416d473bde7e2f4)) +* Support to_csv/parquet/json to local files/objects ([#858](https://github.com/googleapis/python-bigquery-dataframes/issues/858)) ([d0ab9cc](https://github.com/googleapis/python-bigquery-dataframes/commit/d0ab9cc47298bdde638299baecac9dffd7841ede)) + + +### Bug Fixes + +* Fewer relation joins from df self-operations ([#823](https://github.com/googleapis/python-bigquery-dataframes/issues/823)) ([0d24f73](https://github.com/googleapis/python-bigquery-dataframes/commit/0d24f737041c7dd70253ebb4baa8d8ef67bd4f1d)) +* Fix 'sql' property for null 
index ([#844](https://github.com/googleapis/python-bigquery-dataframes/issues/844)) ([1b6a556](https://github.com/googleapis/python-bigquery-dataframes/commit/1b6a556206a7a66283339d827ab12db2753521e2)) +* Fix unordered mode using ordered path to print frame ([#839](https://github.com/googleapis/python-bigquery-dataframes/issues/839)) ([93785cb](https://github.com/googleapis/python-bigquery-dataframes/commit/93785cb48be4a2eb8770129148bd0b897fed4ee7)) +* Reduce redundant `remote_function` deployments ([#856](https://github.com/googleapis/python-bigquery-dataframes/issues/856)) ([cbf2d42](https://github.com/googleapis/python-bigquery-dataframes/commit/cbf2d42e4d961a7537381a9c3b28a8b463ad8f74)) + + +### Documentation + +* Add partner attribution steps to integrations sample notebook ([#835](https://github.com/googleapis/python-bigquery-dataframes/issues/835)) ([d7b333f](https://github.com/googleapis/python-bigquery-dataframes/commit/d7b333fa26acddaeb5ccca4f81b1d624dff03ba2)) +* Make `get_global_session`/`close_session`/`reset_session` appears in the docs ([#847](https://github.com/googleapis/python-bigquery-dataframes/issues/847)) ([01d6bbb](https://github.com/googleapis/python-bigquery-dataframes/commit/01d6bbb7479da706dc62bb5e7d51dc28a4042812)) + ## [1.11.1](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.11.0...v1.11.1) (2024-07-08) diff --git a/bigframes/version.py b/bigframes/version.py index 1186811c97..29cf036f42 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.11.1" +__version__ = "1.12.0"
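
A minimal end-to-end sketch of the streaming workflow added in this series (the
dataset, Bigtable resource names and service account below are the placeholders
used by the new notebook and system tests, not requirements, and the billing
project is assumed to be configured as in the notebook):

    import bigframes.streaming as bst

    # Build a StreamingDataFrame from a BigQuery table (preview feature).
    sdf = bst.read_gbq_table("birds.penguins_bigtable_streaming")

    # Simple operations are supported; Bigtable export expects a "rowkey" column.
    sdf = sdf[["species", "island", "body_mass_g"]]
    sdf = sdf[sdf["body_mass_g"] < 4000]
    sdf = sdf.rename(columns={"island": "rowkey"})

    # Launch the continuous query and manage it through the returned QueryJob.
    job = sdf.to_bigtable(
        instance="streaming-testing-instance",
        table="table-testing",
        service_account_email="streaming-testing@bigframes-load-testing.iam.gserviceaccount.com",
        truncate=True,
        overwrite=True,
        auto_create_column_families=True,
        bigtable_options={},
        job_id_prefix="test_streaming_",
    )
    print(job.running(), job.error_result)
    job.cancel()  # continuous queries run until cancelled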