From 726b4b84a4394aff2ca58f39950c1285852f734a Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Thu, 28 Sep 2023 21:17:16 +0000 Subject: [PATCH] chore: add vertex sdkhand bigframes integration notebooks --- .../vertex_sdk/sdk2_bigframes_pytorch.ipynb | 723 +++++++++++++++++ .../vertex_sdk/sdk2_bigframes_sklearn.ipynb | 727 ++++++++++++++++++ .../sdk2_bigframes_tensorflow.ipynb | 646 ++++++++++++++++ noxfile.py | 3 + 4 files changed, 2099 insertions(+) create mode 100644 notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb create mode 100644 notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb create mode 100644 notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb diff --git a/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb new file mode 100644 index 0000000000..598d958f0c --- /dev/null +++ b/notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb @@ -0,0 +1,723 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Train a pytorch model with Vertex AI SDK 2.0 and Bigframes\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"VertexOpen in Vertex AI Workbench\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "This tutorial demonstrates how to train a pytorch model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", + "\n", + "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", + "\n", + "\n", + "This tutorial uses the following Google Cloud ML services:\n", + "\n", + "- `Vertex AI Training`\n", + "- `Vertex AI Remote Training`\n", + "\n", + "\n", + "The steps performed include:\n", + "\n", + "- Initialize a dataframe from a BigQuery table and split the dataset\n", + "- Perform transformations as a Vertex AI remote training.\n", + "- Train the model remotely and evaluate the model locally\n", + "\n", + "**Local-to-remote training**\n", + "\n", + "```\n", + "import vertexai\n", + "from my_module import MyModelClass\n", + "\n", + "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", + "\n", + "# Wrap the model class with `vertex_ai.preview.remote`\n", + "MyModelClass = vertexai.preview.remote(MyModelClass)\n", + "\n", + "# Instantiate the class\n", + "model = MyModelClass(...)\n", + "\n", + "# Optional set remote config\n", + "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", + "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", + "\n", + "# This `fit` call will be executed remotely\n", + "model.fit(...)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "This tutorial uses the IRIS dataset, which predicts the iris species." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* Vertex AI\n", + "* BigQuery\n", + "* Cloud Storage\n", + "\n", + "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", + "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", + "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "Install the following packages required to execute this notebook. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2b4ef9b72d43" + }, + "outputs": [], + "source": [ + "# Install the packages\n", + "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", + "! pip3 install --upgrade --quiet bigframes\n", + "! pip3 install --upgrade --quiet torch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "58707a750154" + }, + "source": [ + "### Colab only: Uncomment the following cell to restart the kernel." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f200f10a1da3" + }, + "outputs": [], + "source": [ + "# Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "**If you don't know your project ID**, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "region" + }, + "source": [ + "#### Region\n", + "\n", + "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "region" + }, + "outputs": [], + "source": [ + "REGION = \"us-central1\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**1. Vertex AI Workbench**\n", + "* Do nothing as you are already authenticated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**2. Local JupyterLab instance, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**3. Colab, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f6b2ccc891ed" + }, + "source": [ + "**4. Service account or other**\n", + "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zgPO1eR3CYjk" + }, + "source": [ + "### Create a Cloud Storage bucket\n", + "\n", + "Create a storage bucket to store intermediate artifacts such as datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MzGDU7TWdts_" + }, + "outputs": [], + "source": [ + "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-EcIXiGsCePi" + }, + "source": [ + "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NIq7R4HZCfIc" + }, + "outputs": [], + "source": [ + "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries and define constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf\n", + "import torch\n", + "import vertexai\n", + "from vertexai.preview import VertexModel\n", + "\n", + "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", + "bf.options.bigquery.project = PROJECT_ID\n", + "\n", + "from bigframes.ml.model_selection import \\\n", + " train_test_split as bf_train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "## Initialize Vertex AI SDK for Python\n", + "\n", + "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "outputs": [], + "source": [ + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=REGION,\n", + " staging_bucket=BUCKET_URI,\n", + ")\n", + "\n", + "REMOTE_JOB_NAME = \"sdk2-bigframes-pytorch\"\n", + "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "105334524e96" + }, + "source": [ + "## Prepare the dataset\n", + "\n", + "Now load the Iris dataset and split the data into train and test sets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b44cdc4e03f1" + }, + "outputs": [], + "source": [ + "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", + "\n", + "species_categories = {\n", + " \"versicolor\": 0,\n", + " \"virginica\": 1,\n", + " \"setosa\": 2,\n", + "}\n", + "df[\"species\"] = df[\"species\"].map(species_categories)\n", + "\n", + "# Assign an index column name\n", + "index_col = \"index\"\n", + "df.index.name = index_col" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9cb8616b1997" + }, + "outputs": [], + "source": [ + "feature_columns = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\n", + "label_columns = df[[\"species\"]]\n", + "train_X, test_X, train_y, test_y = bf_train_test_split(\n", + " feature_columns, label_columns, test_size=0.2\n", + ")\n", + "\n", + "print(\"X_train size: \", train_X.size)\n", + "print(\"X_test size: \", test_X.size)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "23fe7b734b08" + }, + "outputs": [], + "source": [ + "# Switch to remote mode for training\n", + "vertexai.preview.init(remote=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5904a0f1bb03" + }, + "source": [ + "## PyTorch remote training with CPU (Custom PyTorch model)\n", + "\n", + "First, train a PyTorch model as a remote training job:\n", + "\n", + "- Reinitialize Vertex AI for remote training.\n", + "- Set TorchLogisticRegression for the remote training job.\n", + "- Invoke TorchLogisticRegression locally which will launch the remote training job." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2a1b85195a17" + }, + "outputs": [], + "source": [ + "# define the custom model\n", + "class TorchLogisticRegression(VertexModel, torch.nn.Module):\n", + " def __init__(self, input_size: int, output_size: int):\n", + " torch.nn.Module.__init__(self)\n", + " VertexModel.__init__(self)\n", + " self.linear = torch.nn.Linear(input_size, output_size)\n", + " self.softmax = torch.nn.Softmax(dim=1)\n", + "\n", + " def forward(self, x):\n", + " return self.softmax(self.linear(x))\n", + "\n", + " @vertexai.preview.developer.mark.train()\n", + " def train(self, X, y, num_epochs, lr):\n", + " X = X.to(torch.float32)\n", + " y = torch.flatten(y) # necessary to get 1D tensor\n", + " dataloader = torch.utils.data.DataLoader(\n", + " torch.utils.data.TensorDataset(X, y),\n", + " batch_size=10,\n", + " shuffle=True,\n", + " generator=torch.Generator(device=X.device),\n", + " )\n", + "\n", + " criterion = torch.nn.CrossEntropyLoss()\n", + " optimizer = torch.optim.SGD(self.parameters(), lr=lr)\n", + "\n", + " for t in range(num_epochs):\n", + " for batch, (X, y) in enumerate(dataloader):\n", + " optimizer.zero_grad()\n", + " pred = self(X)\n", + " loss = criterion(pred, y)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " @vertexai.preview.developer.mark.predict()\n", + " def predict(self, X):\n", + " X = torch.tensor(X).to(torch.float32)\n", + " with torch.no_grad():\n", + " pred = torch.argmax(self(X), dim=1)\n", + " return pred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4e35593f520a" + }, + "outputs": [], + "source": [ + "# Switch to remote mode for training\n", + "vertexai.preview.init(remote=True)\n", + "\n", + "# Instantiate model\n", + "model = TorchLogisticRegression(4, 3)\n", + "\n", + "# Set training config\n", + "model.train.vertex.remote_config.custom_commands = [\n", + " \"pip install torchdata\",\n", + " \"pip install torcharrow\",\n", + "]\n", + "model.train.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-torch-model\"\n", + "model.train.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "\n", + "# Train model on Vertex\n", + "model.train(train_X, train_y, num_epochs=200, lr=0.05)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "edf4d0708f02" + }, + "source": [ + "## Remote prediction\n", + "\n", + "Obtain predictions from the trained model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "42dfbff0ca15" + }, + "outputs": [], + "source": [ + "vertexai.preview.init(remote=True)\n", + "\n", + "# Set remote config\n", + "model.predict.vertex.remote_config.custom_commands = [\n", + " \"pip install torchdata\",\n", + " \"pip install torcharrow\",\n", + "]\n", + "model.predict.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-torch-predict\"\n", + "model.predict.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "\n", + "predictions = model.predict(test_X)\n", + "\n", + "print(f\"Remote predictions: {predictions}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4340ed8316cd" + }, + "source": [ + "## Local evaluation\n", + "\n", + "Evaluate model results locally." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eb27a31cec6f" + }, + "outputs": [], + "source": [ + "# User must convert bigframes to torch tensor for local evaluation\n", + "train_X_tensor = torch.from_numpy(\n", + " train_X.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", + ")\n", + "train_y_tensor = torch.from_numpy(\n", + " train_y.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", + ")\n", + "\n", + "test_X_tensor = torch.from_numpy(\n", + " test_X.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", + ")\n", + "test_y_tensor = torch.from_numpy(\n", + " test_y.to_pandas().reset_index().drop(columns=[\"index\"]).values.astype(float)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7db44ad81389" + }, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "\n", + "# Switch to local mode for evaluation\n", + "vertexai.preview.init(remote=False)\n", + "\n", + "# Evaluate model's accuracy score\n", + "print(\n", + " f\"Train accuracy: {accuracy_score(train_y_tensor, model.predict(train_X_tensor))}\"\n", + ")\n", + "\n", + "print(f\"Test accuracy: {accuracy_score(test_y_tensor, model.predict(test_X_tensor))}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TpV-iwP9qw9c" + }, + "source": [ + "## Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Delete Cloud Storage objects that were created\n", + "delete_bucket = False\n", + "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", + " ! gsutil -m rm -r $BUCKET_URI" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "sdk2_bigframes_pytorch.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb new file mode 100644 index 0000000000..021c070753 --- /dev/null +++ b/notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb @@ -0,0 +1,727 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Train a scikit-learn model with Vertex AI SDK 2.0 and Bigframes\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"VertexOpen in Vertex AI Workbench\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "This tutorial demonstrates how to train a scikit-learn model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", + "\n", + "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", + "\n", + "\n", + "This tutorial uses the following Google Cloud ML services:\n", + "\n", + "- `Vertex AI Training`\n", + "- `Vertex AI Remote Training`\n", + "\n", + "\n", + "The steps performed include:\n", + "\n", + "- Initialize a dataframe from a BigQuery table and split the dataset\n", + "- Perform transformations as a Vertex AI remote training.\n", + "- Train the model remotely and evaluate the model locally\n", + "\n", + "**Local-to-remote training**\n", + "\n", + "```\n", + "import vertexai\n", + "from my_module import MyModelClass\n", + "\n", + "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", + "\n", + "# Wrap the model class with `vertex_ai.preview.remote`\n", + "MyModelClass = vertexai.preview.remote(MyModelClass)\n", + "\n", + "# Instantiate the class\n", + "model = MyModelClass(...)\n", + "\n", + "# Optional set remote config\n", + "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", + "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", + "\n", + "# This `fit` call will be executed remotely\n", + "model.fit(...)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "This tutorial uses the IRIS dataset, which predicts the iris species." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* Vertex AI\n", + "* BigQuery\n", + "* Cloud Storage\n", + "\n", + "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", + "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", + "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "Install the following packages required to execute this notebook. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2b4ef9b72d43" + }, + "outputs": [], + "source": [ + "# Install the packages\n", + "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", + "! pip3 install --upgrade --quiet bigframes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "58707a750154" + }, + "source": [ + "### Colab only: Uncomment the following cell to restart the kernel." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f200f10a1da3" + }, + "outputs": [], + "source": [ + "# Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "**If you don't know your project ID**, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "region" + }, + "source": [ + "#### Region\n", + "\n", + "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "region" + }, + "outputs": [], + "source": [ + "REGION = \"us-central1\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**1. Vertex AI Workbench**\n", + "* Do nothing as you are already authenticated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**2. Local JupyterLab instance, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**3. Colab, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f6b2ccc891ed" + }, + "source": [ + "**4. Service account or other**\n", + "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zgPO1eR3CYjk" + }, + "source": [ + "### Create a Cloud Storage bucket\n", + "\n", + "Create a storage bucket to store intermediate artifacts such as datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MzGDU7TWdts_" + }, + "outputs": [], + "source": [ + "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-EcIXiGsCePi" + }, + "source": [ + "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NIq7R4HZCfIc" + }, + "outputs": [], + "source": [ + "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries and define constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf\n", + "import vertexai\n", + "\n", + "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", + "bf.options.bigquery.project = PROJECT_ID\n", + "\n", + "from bigframes.ml.model_selection import \\\n", + " train_test_split as bf_train_test_split\n", + "\n", + "REMOTE_JOB_NAME = \"sdk2-bigframes-sklearn\"\n", + "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "## Initialize Vertex AI SDK for Python\n", + "\n", + "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "outputs": [], + "source": [ + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=REGION,\n", + " staging_bucket=BUCKET_URI,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "105334524e96" + }, + "source": [ + "## Prepare the dataset\n", + "\n", + "Now load the Iris dataset and split the data into train and test sets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b44cdc4e03f1" + }, + "outputs": [], + "source": [ + "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", + "\n", + "species_categories = {\n", + " \"versicolor\": 0,\n", + " \"virginica\": 1,\n", + " \"setosa\": 2,\n", + "}\n", + "df[\"species\"] = df[\"species\"].map(species_categories)\n", + "\n", + "# Assign an index column name\n", + "index_col = \"index\"\n", + "df.index.name = index_col" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9cb8616b1997" + }, + "outputs": [], + "source": [ + "feature_columns = df[[\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]]\n", + "label_columns = df[[\"species\"]]\n", + "train_X, test_X, train_y, test_y = bf_train_test_split(\n", + " feature_columns, label_columns, test_size=0.2\n", + ")\n", + "\n", + "print(\"X_train size: \", train_X.size)\n", + "print(\"X_test size: \", test_X.size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8306545fcc57" + }, + "source": [ + "## Feature transformation\n", + "\n", + "Next, you do feature transformations on the data using the Vertex AI remote training service.\n", + "\n", + "First, you re-initialize Vertex AI to enable remote training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "55e701c31036" + }, + "outputs": [], + "source": [ + "# Switch to remote mode for training\n", + "vertexai.preview.init(remote=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4a0e9d59b273" + }, + "source": [ + "### Execute remote job for fit_transform() on training data\n", + "\n", + "Next, indicate that the `StandardScalar` class is to be executed remotely. Then set up the data transform and call the `fit_transform()` method is executed remotely." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "90333089d362" + }, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "# Wrap classes to enable Vertex remote execution\n", + "StandardScaler = vertexai.preview.remote(StandardScaler)\n", + "\n", + "# Instantiate transformer\n", + "transformer = StandardScaler()\n", + "\n", + "# Set training config\n", + "transformer.fit_transform.vertex.remote_config.display_name = (\n", + " f\"{REMOTE_JOB_NAME}-fit-transformer-bigframes\"\n", + ")\n", + "transformer.fit_transform.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "\n", + "# Execute transformer on Vertex (train_X is bigframes.dataframe.DataFrame, X_train is np.array)\n", + "X_train = transformer.fit_transform(train_X)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6bf95574c907" + }, + "source": [ + "### Remote transform on test data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "da6eea22a89a" + }, + "outputs": [], + "source": [ + "# Transform test dataset before calculate test score\n", + "transformer.transform.vertex.remote_config.display_name = (\n", + " REMOTE_JOB_NAME + \"-transformer\"\n", + ")\n", + "transformer.transform.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "\n", + "# Execute transformer on Vertex (test_X is bigframes.dataframe.DataFrame, X_test is np.array)\n", + "X_test = transformer.transform(test_X)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ddf906c886e4" + }, + "source": [ + "## Remote training\n", + "\n", + "First, train the scikit-learn model as a remote training job:\n", + "\n", + "- Set LogisticRegression for the remote training job.\n", + "- Invoke LogisticRegression locally which will launch the remote training job." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c7b0116fa60c" + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "# Wrap classes to enable Vertex remote execution\n", + "LogisticRegression = vertexai.preview.remote(LogisticRegression)\n", + "\n", + "# Instantiate model, warm_start=True for uptraining\n", + "model = LogisticRegression(warm_start=True)\n", + "\n", + "# Set training config\n", + "model.fit.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-sklearn-model\"\n", + "model.fit.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "\n", + "# Train model on Vertex\n", + "model.fit(train_X, train_y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ffe1d5903bcb" + }, + "source": [ + "## Remote prediction\n", + "\n", + "Obtain predictions from the trained model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d00ce35920fa" + }, + "outputs": [], + "source": [ + "# Remote evaluation\n", + "vertexai.preview.init(remote=True)\n", + "\n", + "# Evaluate model's accuracy score\n", + "predictions = model.predict(test_X)\n", + "\n", + "print(f\"Remote predictions: {predictions}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a8cd6cbd4403" + }, + "source": [ + "## Local evaluation\n", + "\n", + "Score model results locally." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dc105dafdfb9" + }, + "outputs": [], + "source": [ + "# User must convert bigframes to pandas dataframe for local evaluation\n", + "train_X_pd = train_X.to_pandas().reset_index(drop=True)\n", + "train_y_pd = train_y.to_pandas().reset_index(drop=True)\n", + "\n", + "test_X_pd = test_X.to_pandas().reset_index(drop=True)\n", + "test_y_pd = test_y.to_pandas().reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "25fec549de69" + }, + "outputs": [], + "source": [ + "# Switch to local mode for testing\n", + "vertexai.preview.init(remote=False)\n", + "\n", + "# Evaluate model's accuracy score\n", + "print(f\"Train accuracy: {model.score(train_X_pd, train_y_pd)}\")\n", + "\n", + "print(f\"Test accuracy: {model.score(test_X_pd, test_y_pd)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TpV-iwP9qw9c" + }, + "source": [ + "## Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Delete Cloud Storage objects that were created\n", + "delete_bucket = False\n", + "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", + " ! gsutil -m rm -r $BUCKET_URI" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "sdk2_bigframes_sklearn.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb b/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb new file mode 100644 index 0000000000..e6843b66b5 --- /dev/null +++ b/notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb @@ -0,0 +1,646 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Train a Tensorflow Keras model with Vertex AI SDK 2.0 and Bigframes \n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"VertexOpen in Vertex AI Workbench\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "This tutorial demonstrates how to train a tensorflow keras model using Vertex AI local-to-remote training with Vertex AI SDK 2.0 and BigQuery Bigframes as the data source.\n", + "\n", + "Learn more about [bigframes](https://cloud.google.com/bigquery/docs/)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, you learn to use `Vertex AI SDK 2.0` with Bigframes as input data source.\n", + "\n", + "\n", + "This tutorial uses the following Google Cloud ML services:\n", + "\n", + "- `Vertex AI Training`\n", + "- `Vertex AI Remote Training`\n", + "\n", + "\n", + "The steps performed include:\n", + "\n", + "- Initialize a dataframe from a BigQuery table and split the dataset\n", + "- Perform transformations as a Vertex AI remote training.\n", + "- Train the model remotely and evaluate the model locally\n", + "\n", + "**Local-to-remote training**\n", + "\n", + "```\n", + "import vertexai\n", + "from my_module import MyModelClass\n", + "\n", + "vertexai.preview.init(remote=True, project=\"my-project\", location=\"my-location\", staging_bucket=\"gs://my-bucket\")\n", + "\n", + "# Wrap the model class with `vertex_ai.preview.remote`\n", + "MyModelClass = vertexai.preview.remote(MyModelClass)\n", + "\n", + "# Instantiate the class\n", + "model = MyModelClass(...)\n", + "\n", + "# Optional set remote config\n", + "model.fit.vertex.remote_config.display_name = \"MyModelClass-remote-training\"\n", + "model.fit.vertex.remote_config.staging_bucket = \"gs://my-bucket\"\n", + "\n", + "# This `fit` call will be executed remotely\n", + "model.fit(...)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "This tutorial uses the IRIS dataset, which predicts the iris species." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* Vertex AI\n", + "* BigQuery\n", + "* Cloud Storage\n", + "\n", + "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),\n", + "[BigQuery pricing](https://cloud.google.com/bigquery/pricing),\n", + "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "Install the following packages required to execute this notebook. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2b4ef9b72d43" + }, + "outputs": [], + "source": [ + "# Install the packages\n", + "! pip3 install --upgrade --quiet google-cloud-aiplatform[preview]\n", + "! pip3 install --upgrade --quiet bigframes\n", + "! pip3 install --upgrade --quiet tensorflow==2.12.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "58707a750154" + }, + "source": [ + "### Colab only: Uncomment the following cell to restart the kernel." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f200f10a1da3" + }, + "outputs": [], + "source": [ + "# Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "**If you don't know your project ID**, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "region" + }, + "source": [ + "#### Region\n", + "\n", + "You can also change the `REGION` variable used by Vertex AI. Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "region" + }, + "outputs": [], + "source": [ + "REGION = \"us-central1\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**1. Vertex AI Workbench**\n", + "* Do nothing as you are already authenticated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**2. Local JupyterLab instance, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**3. Colab, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f6b2ccc891ed" + }, + "source": [ + "**4. Service account or other**\n", + "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zgPO1eR3CYjk" + }, + "source": [ + "### Create a Cloud Storage bucket\n", + "\n", + "Create a storage bucket to store intermediate artifacts such as datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MzGDU7TWdts_" + }, + "outputs": [], + "source": [ + "BUCKET_URI = f\"gs://your-bucket-name-{PROJECT_ID}-unique\" # @param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-EcIXiGsCePi" + }, + "source": [ + "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NIq7R4HZCfIc" + }, + "outputs": [], + "source": [ + "! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries and define constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf\n", + "import tensorflow as tf\n", + "import vertexai\n", + "from tensorflow import keras\n", + "\n", + "bf.options.bigquery.location = \"us\" # Dataset is in 'us' not 'us-central1'\n", + "bf.options.bigquery.project = PROJECT_ID\n", + "\n", + "from bigframes.ml.model_selection import \\\n", + " train_test_split as bf_train_test_split" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "## Initialize Vertex AI SDK for Python\n", + "\n", + "Initialize the Vertex AI SDK for Python for your project and corresponding bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "outputs": [], + "source": [ + "vertexai.init(\n", + " project=PROJECT_ID,\n", + " location=REGION,\n", + " staging_bucket=BUCKET_URI,\n", + ")\n", + "\n", + "REMOTE_JOB_NAME = \"sdk2-bigframes-tensorflow\"\n", + "REMOTE_JOB_BUCKET = f\"{BUCKET_URI}/{REMOTE_JOB_NAME}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "105334524e96" + }, + "source": [ + "## Prepare the dataset\n", + "\n", + "Now load the Iris dataset and split the data into train and test sets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "94576deccd8c" + }, + "outputs": [], + "source": [ + "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.iris\")\n", + "\n", + "species_categories = {\n", + " \"versicolor\": 0,\n", + " \"virginica\": 1,\n", + " \"setosa\": 2,\n", + "}\n", + "df[\"target\"] = df[\"species\"].map(species_categories)\n", + "df = df.drop(columns=[\"species\"])\n", + "\n", + "train, test = bf_train_test_split(df, test_size=0.2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cfcbce726efa" + }, + "source": [ + "## Remote training with GPU\n", + "\n", + "First, train a TensorFlow model as a remote training job:\n", + "\n", + "- Reinitialize Vertex AI for remote training.\n", + "- Instantiate the tensorflow keras model for the remote training job.\n", + "- Invoke the tensorflow keras model.fit() locally which will launch the remote training job." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fd865b0c4e8b" + }, + "outputs": [], + "source": [ + "# Switch to remote mode for training\n", + "vertexai.preview.init(remote=True)\n", + "\n", + "keras.Sequential = vertexai.preview.remote(keras.Sequential)\n", + "\n", + "# Instantiate model\n", + "model = keras.Sequential(\n", + " [keras.layers.Dense(5, input_shape=(4,)), keras.layers.Softmax()]\n", + ")\n", + "\n", + "# Specify optimizer and loss function\n", + "model.compile(optimizer=\"adam\", loss=\"mean_squared_error\")\n", + "\n", + "# Set training config\n", + "model.fit.vertex.remote_config.enable_cuda = True\n", + "model.fit.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-keras-model-gpu\"\n", + "model.fit.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "model.fit.vertex.remote_config.custom_commands = [\"pip install tensorflow-io==0.32.0\"]\n", + "\n", + "# Manually set compute resources this time\n", + "model.fit.vertex.remote_config.machine_type = \"n1-highmem-4\"\n", + "model.fit.vertex.remote_config.accelerator_type = \"NVIDIA_TESLA_K80\"\n", + "model.fit.vertex.remote_config.accelerator_count = 4\n", + "\n", + "# Train model on Vertex\n", + "model.fit(train, epochs=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f1af94ac1477" + }, + "source": [ + "## Remote prediction\n", + "\n", + "Obtain predictions from the trained model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1d75879948b5" + }, + "outputs": [], + "source": [ + "vertexai.preview.init(remote=True)\n", + "\n", + "# Set remote config\n", + "model.predict.vertex.remote_config.enable_cuda = False\n", + "model.predict.vertex.remote_config.display_name = REMOTE_JOB_NAME + \"-keras-predict-cpu\"\n", + "model.predict.vertex.remote_config.staging_bucket = REMOTE_JOB_BUCKET\n", + "model.predict.vertex.remote_config.custom_commands = [\n", + " \"pip install tensorflow-io==0.32.0\"\n", + "]\n", + "\n", + "predictions = model.predict(train)\n", + "\n", + "print(f\"Remote predictions: {predictions}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "798b77c95067" + }, + "source": [ + "## Local evaluation\n", + "\n", + "Evaluate model results locally." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "88e734e30791" + }, + "outputs": [], + "source": [ + "# User must convert bigframes to pandas dataframe for local evaluation\n", + "feature_columns = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n", + "label_columns = [\"target\"]\n", + "\n", + "train_X_np = train[feature_columns].to_pandas().values.astype(float)\n", + "train_y_np = train[label_columns].to_pandas().values.astype(float)\n", + "train_ds = tf.data.Dataset.from_tensor_slices((train_X_np, train_y_np))\n", + "\n", + "test_X_np = test[feature_columns].to_pandas().values.astype(float)\n", + "test_y_np = test[label_columns].to_pandas().values.astype(float)\n", + "test_ds = tf.data.Dataset.from_tensor_slices((test_X_np, test_y_np))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cb8637f783ad" + }, + "outputs": [], + "source": [ + "# Switch to local mode for evaluation\n", + "vertexai.preview.init(remote=False)\n", + "\n", + "# Evaluate model's mean square errors\n", + "print(f\"Train loss: {model.evaluate(train_ds.batch(32))}\")\n", + "\n", + "print(f\"Test loss: {model.evaluate(test_ds.batch(32))}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TpV-iwP9qw9c" + }, + "source": [ + "## Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Delete Cloud Storage objects that were created\n", + "delete_bucket = False\n", + "if delete_bucket or os.getenv(\"IS_TESTING\"):\n", + " ! gsutil -m rm -r $BUCKET_URI" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "sdk2_bigframes_tensorflow.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/noxfile.py b/noxfile.py index 033bbfefe4..81ed9c2b2c 100644 --- a/noxfile.py +++ b/noxfile.py @@ -610,6 +610,9 @@ def notebook(session): "notebooks/getting_started/bq_dataframes_llm_code_generation.ipynb", "notebooks/getting_started/bq_dataframes_ml_linear_regression.ipynb", "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", + "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb", + "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb", + "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb", # The experimental notebooks imagine features that don't yet # exist or only exist as temporary prototypes. "notebooks/experimental/longer_ml_demo.ipynb",