diff --git a/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb new file mode 100644 index 0000000000..0efac1eee3 --- /dev/null +++ b/notebooks/generative_ai/bq_dataframes_llm_output_schema.ipynb @@ -0,0 +1,770 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigFrames LLM Output Schema\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This Notebook introduces BigFrames LLM with output schema to generate structured output dataframes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT = \"bigframes-dev\" # replace with your project\n", + "\n", + "import bigframes\n", + "# Setup project\n", + "bigframes.options.bigquery.project = PROJECT\n", + "bigframes.options.display.progress_bar = None\n", + "\n", + "import bigframes.pandas as bpd\n", + "from bigframes.ml import llm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Create a BigFrames DataFrame and a Gemini model\n", + "Starting from creating a simple dataframe of several cities and a Gemini model in BigFrames" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city
0Seattle
1New York
2Shanghai
\n", + "

3 rows × 1 columns

\n", + "
[3 rows x 1 columns in total]" + ], + "text/plain": [ + " city\n", + "0 Seattle\n", + "1 New York\n", + "2 Shanghai\n", + "\n", + "[3 rows x 1 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({\"city\": [\"Seattle\", \"New York\", \"Shanghai\"]})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/log_adapter.py:175: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "default model will be removed in BigFrames 3.0. Please supply an\n", + "explicit model to avoid this message.\n", + " return method(*args, **kwargs)\n" + ] + } + ], + "source": [ + "gemini = llm.GeminiTextGenerator()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Generate structured output data\n", + "Before, llm models can only generate text output. Saying if you want to know whether the city is a US city, for example:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityml_generate_text_llm_result
0SeattleYes, Seattle is a city in the United States. I...
1New YorkYes, New York City is a city in the United Sta...
2ShanghaiNo, Shanghai is not a US city. It is a major c...
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city ml_generate_text_llm_result\n", + "0 Seattle Yes, Seattle is a city in the United States. I...\n", + "1 New York Yes, New York City is a city in the United Sta...\n", + "2 Shanghai No, Shanghai is not a US city. It is a major c...\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"], \"is a US city?\"])\n", + "result[[\"city\", \"ml_generate_text_llm_result\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The outputs are text results that human can read. But if want the output data to be more useful for analysis, it is better to transfer to structured data like boolean, int or float values. Usually the process wasn't easy.\n", + "\n", + "Now you can get structured output out-of-the-box by specifying the output_schema parameter in Gemini model predict method. In below example, the outputs are only boolean values." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityis_us_city
0SeattleTrue
1New YorkTrue
2ShanghaiFalse
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city is_us_city\n", + "0 Seattle True\n", + "1 New York True\n", + "2 Shanghai False\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"], \"is a US city?\"], output_schema={\"is_us_city\": \"bool\"})\n", + "result[[\"city\", \"is_us_city\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also get float or int values, for example, to get polulations in millions:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
citypolulation_million
0Seattle0.75
1New York19.68
2Shanghai26.32
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city polulation_million\n", + "0 Seattle 0.75\n", + "1 New York 19.68\n", + "2 Shanghai 26.32\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[\"what is the population in millions of\", df[\"city\"]], output_schema={\"polulation_million\": \"float64\"})\n", + "result[[\"city\", \"polulation_million\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And yearly rainy days:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityrainy_days
0Seattle152
1New York123
2Shanghai123
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " city rainy_days\n", + "0 Seattle 152\n", + "1 New York 123\n", + "2 Shanghai 123\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[\"how many rainy days per year in\", df[\"city\"]], output_schema={\"rainy_days\": \"int64\"})\n", + "result[[\"city\", \"rainy_days\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Generate all types of data in one prediction\n", + "You can get the different output columns and types in one prediction. \n", + "\n", + "Note it doesn't require dedicated prompts, as long as the output column names are informative to the model." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityis_US_citypolulation_in_millionsrainy_days_per_year
0SeattleTrue0.75152
1New YorkTrue8.8121
2ShanghaiFalse26.32115
\n", + "

3 rows × 4 columns

\n", + "
[3 rows x 4 columns in total]" + ], + "text/plain": [ + " city is_US_city polulation_in_millions rainy_days_per_year\n", + "0 Seattle True 0.75 152\n", + "1 New York True 8.8 121\n", + "2 Shanghai False 26.32 115\n", + "\n", + "[3 rows x 4 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"]], output_schema={\"is_US_city\": \"bool\", \"polulation_in_millions\": \"float64\", \"rainy_days_per_year\": \"int64\"})\n", + "result[[\"city\", \"is_US_city\", \"polulation_in_millions\", \"rainy_days_per_year\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Generate composite data types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Composite datatypes like array and struct can also be generated. Here the example generates a places_to_visit column as array of strings and a gps_coordinates as struct of floats. Along with previous fields, all in one prediction." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/src/bigframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cityis_US_citypolulation_in_millionsrainy_days_per_yearplaces_to_visitgps_coordinates
0SeattleTrue0.74150['Space Needle' 'Pike Place Market' 'Museum of...{'latitude': 47.6062, 'longitude': -122.3321}
1New YorkTrue8.4121['Times Square' 'Central Park' 'Statue of Libe...{'latitude': 40.7128, 'longitude': -74.006}
2ShanghaiFalse26.32115['The Bund' 'Yu Garden' 'Shanghai Museum' 'Ori...{'latitude': 31.2304, 'longitude': 121.4737}
\n", + "

3 rows × 6 columns

\n", + "
[3 rows x 6 columns in total]" + ], + "text/plain": [ + " city is_US_city polulation_in_millions rainy_days_per_year \\\n", + "0 Seattle True 0.74 150 \n", + "1 New York True 8.4 121 \n", + "2 Shanghai False 26.32 115 \n", + "\n", + " places_to_visit \\\n", + "0 ['Space Needle' 'Pike Place Market' 'Museum of... \n", + "1 ['Times Square' 'Central Park' 'Statue of Libe... \n", + "2 ['The Bund' 'Yu Garden' 'Shanghai Museum' 'Ori... \n", + "\n", + " gps_coordinates \n", + "0 {'latitude': 47.6062, 'longitude': -122.3321} \n", + "1 {'latitude': 40.7128, 'longitude': -74.006} \n", + "2 {'latitude': 31.2304, 'longitude': 121.4737} \n", + "\n", + "[3 rows x 6 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = gemini.predict(df, prompt=[df[\"city\"]], output_schema={\"is_US_city\": \"bool\", \"polulation_in_millions\": \"float64\", \"rainy_days_per_year\": \"int64\", \"places_to_visit\": \"array\", \"gps_coordinates\": \"struct\"})\n", + "result[[\"city\", \"is_US_city\", \"polulation_in_millions\", \"rainy_days_per_year\", \"places_to_visit\", \"gps_coordinates\"]]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index ce3f10b881..b7d713c342 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -55,7 +55,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook is introducing BigFrames experimental Multimodal features:\n", + "This notebook is introducing BigFrames Multimodal features:\n", "1. Create Multimodal DataFrame\n", "2. Combine unstructured data with structured data\n", "3. Conduct image transformations\n",