initial commit · renxiaoming/python-docs-samples@302a695 · GitHub

Commit 302a695
initial commit
1 parent c4a0e6d commit 302a695

9 files changed: +1218 -0 lines changed
tables/automl/pipeline/README.md

Lines changed: 108 additions & 0 deletions
# AutoML Tables Pipeline
- Launch training and prediction jobs for AutoML Tables with a single command.
- Define your pipelines with YAML configuration files for easy reuse.
- Log parameters, operations, and results.

## Before you begin
Install the most recent Google Cloud packages and the additional requirements.
```
pip install --upgrade google-cloud
pip install --upgrade google-cloud-automl
pip install -r requirements.txt
```
Set up service account authentication with an environment variable, or at run
time with the `--service_account_filename` arg.
```
export GOOGLE_APPLICATION_CREDENTIALS=path/to/json_key
```

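To confirm that authentication is picked up before launching jobs, a minimal
sanity check against the AutoML API might look like the following (a sketch
only, not part of the pipeline; "my_project" and the location are placeholders):
```
from google.cloud import automl_v1beta1

# Uses GOOGLE_APPLICATION_CREDENTIALS from the environment.
client = automl_v1beta1.AutoMlClient()
parent = client.location_path('my_project', 'us-central1')

# Listing datasets fails fast if credentials or permissions are wrong.
for dataset in client.list_datasets(parent):
    print(dataset.display_name)
```
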
## Defining a pipeline
YAML configuration files are used to manage an inventory of previously trained
models, or to act as a template for repeated jobs with shared parameters. As a
minimal example, consider a config file "my_config.yaml" with parameters:
```
dataset_display_name: my_dataset
dataset_input_path: bq://project.dataset.table
model_display_name: my_model
label_column: my_label
```
See the provided "example.yaml" and Configuration section below for more
details.

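The merge of config-file values and command-line values happens in
tables_config.TablesConfig, which is not shown in this excerpt; a minimal
sketch of the precedence described here (the load_params helper is
hypothetical), assuming PyYAML, might look like:
```
import yaml

def load_params(config_filename, cli_params):
    """Sketch: start from the YAML values, let command-line values win."""
    with open(config_filename) as f:
        params = yaml.safe_load(f) or {}
    params.update({k: v for k, v in cli_params.items() if v is not None})
    return params

# Reuse the config but override the model name from the command line.
params = load_params('config/my_config.yaml', {'model_display_name': 'my_model_v2'})
```
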
## Running a pipeline
Parameters can also be provided on the command line, and take priority over
parameters in the config. Together they support a number of usage patterns;
the two most common repeated jobs are shown below.

#### Training job
Import a new dataset "my_dataset", then train a new model "my_model" with
`--build_dataset` and `--build_model`:
```
python run_pipeline.py \
  --project=my_project \
  --config_filename=config/my_config.yaml \
  --build_dataset \
  --build_model
```

#### Batch prediction job
Load "my_dataset" and "my_model" (default behavior), then make a batch
prediction with `--make_prediction`:
```
python run_pipeline.py \
  --project=my_project \
  --config_filename=config/my_config.yaml \
  --predict_input_path=bq://project.dataset.table \
  --predict_output_path=bq://project \
  --make_prediction
```

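Under the hood, a batch prediction like this comes down to a single
long-running call on the AutoML v1beta1 prediction client. The pipeline's own
wrapper (tables_client) is not shown in this excerpt; a rough sketch with a
placeholder project and model ID:
```
from google.cloud import automl_v1beta1

prediction_client = automl_v1beta1.PredictionServiceClient()

# Placeholder project and model ID, for illustration only.
model_name = prediction_client.model_path(
    'my_project', 'us-central1', 'TBL0000000000000000000')
input_config = {'bigquery_source': {'input_uri': 'bq://project.dataset.table'}}
output_config = {'bigquery_destination': {'output_uri': 'bq://project'}}

# batch_predict returns a long-running operation; result() blocks until done.
response = prediction_client.batch_predict(model_name, input_config, output_config)
response.result()
```
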
## Project Structure
```
.
├── run_pipeline     # Script to run the Tables pipeline from the command line.
├── tables_config    # TablesConfig reads parameters from YAML and command line.
├── tables_client    # TablesClient adds helper functions for the AutoML client.
├── tables_pipeline  # TablesPipeline queues/executes operations with logging.
├── config/          # Directory to read YAML parameter config files from.
└── log/             # Directory to write logging files to.
```

## Configuration
YAML configuration files may be created in the config/ directory; an
example.yaml is provided as a basis, and detailed descriptions of all
parameters are given below (X denotes required).

| Parameter              | Default        | Type   | Comments                                                                        |
|------------------------|----------------|--------|---------------------------------------------------------------------------------|
| project                | X              | String | Recommend setting through command line.                                         |
| location               | us-central1    | String | Location of compute resources.                                                  |
| build_dataset          | false          | Bool   | true builds a new dataset, false loads an old one.                              |
| build_model            | false          | Bool   | true builds a new model, false loads an old one.                                |
| make_prediction        | false          | Bool   | Make a batch prediction after loads/builds.                                     |
| dataset_display_name   | X              | String | A unique and informative < 32 char name.                                        |
| dataset_input_path     | X              | String | bq://project.dataset.table or gs://path/to/train/data                           |
| label_column           | X              | String | Label dtype determines if regression or classification.                         |
| weight_column          | null           | String | Weights loss and evaluation metrics.                                            |
| split_column           | null           | String | Manually split data; time column is preferred.                                  |
| time_column            | null           | String | TIMESTAMP type column; data is automatically split on it.                       |
| columns_nullable       | null           | Dict   | Only modify columns detected differently than intended (display name to bool).  |
| columns_dtype          | null           | Dict   | Only modify columns detected differently than intended (display name to str).   |
| model_display_name     | X              | String | A unique and informative < 32 char name.                                        |
| train_hours            | 1.0            | Float  | Maximum time for training, must be >= 1.                                        |
| optimization_objective | null           | String | Recommend using defaults.                                                       |
| ignore_columns         | null           | List   | Columns (other than label/split/weight) to exclude in training.                 |
| predict_input_path     | X (if predict) | String | bq://project.dataset.table or gs://path/to/predict/data                         |
| predict_output_path    | X (if predict) | String | bq://project or gs://path/to/basedir (dataset.table or subdir generated).       |

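A sketch of how the required and conditionally required parameters above could
be checked before any operations run (illustrative only; the actual validation
presumably lives in tables_config, which is not shown here):
```
REQUIRED = ['project', 'dataset_display_name', 'dataset_input_path',
            'label_column', 'model_display_name']
REQUIRED_IF_PREDICT = ['predict_input_path', 'predict_output_path']

def check_required(params):
    """Raises if any required (or predict-required) parameter is missing."""
    missing = [key for key in REQUIRED if not params.get(key)]
    if params.get('make_prediction'):
        missing += [key for key in REQUIRED_IF_PREDICT if not params.get(key)]
    if missing:
        raise ValueError('Missing required parameters: {}'.format(missing))
```
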
## Logging
Log files are written to the log/ directory by default, but the directory can
be set explicitly with the `--log_dir` arg. Log levels can be set with the
`--console_log_level` and `--file_log_level` args; a sketch of this setup
follows the list below.

- Parameters are logged (in YAML format) at run time for reproducibility.
- Evaluation metrics and feature importance are logged during model load/build.
- The full output path is logged during prediction.
- Operation names are logged at INFO level, full responses at DEBUG.
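
A sketch of the console/file split described above, using the standard library
logging module (the logger name and log filename are illustrative, not taken
from the pipeline code):
```
import logging

logger = logging.getLogger('tables_pipeline')
logger.setLevel(logging.DEBUG)  # Handlers filter from here.

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARN)  # --console_log_level default

file_handler = logging.FileHandler('log/session.log')
file_handler.setLevel(logging.INFO)  # --file_log_level; DEBUG adds full responses

formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
for handler in (console_handler, file_handler):
    handler.setFormatter(formatter)
    logger.addHandler(handler)
```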

tables/automl/pipeline/__init__.py

Whitespace-only changes.
tables/automl/pipeline/config/example.yaml

Lines changed: 33 additions & 0 deletions
# This YAML config is an example containing all possible parameters.
# Parameters can be set in this config, or through the command line.

# Required parameters.
project: my_project
dataset_display_name: my_dataset
dataset_input_path: bq://project.dataset.table
label_column: my_label_column
model_display_name: my_model

# Optional parameters.
build_dataset: true
build_model: true
make_prediction: true
location: us-central1
weight_column: my_weight_column
split_column: my_split_column
time_column: my_time_column
columns_nullable:
  my_non_nullable_column: false
  my_nullable_column: true
columns_dtype:
  my_categorical_column: CATEGORY
  my_numerical_column: FLOAT64
train_hours: 1.5
optimization_objective: MINIMIZE_RMSE
ignore_columns:
  - my_id_column_1
  - my_id_column_2

# Required if make_prediction is true.
predict_input_path: bq://project.dataset.table
predict_output_path: bq://project
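
As a quick check that the example parses the way the nested parameters suggest
(a sketch; the path assumes the config/ layout described in the README):
```
import yaml

with open('config/example.yaml') as f:
    config = yaml.safe_load(f)

print(config['columns_dtype'])   # {'my_categorical_column': 'CATEGORY', ...}
print(config['ignore_columns'])  # ['my_id_column_1', 'my_id_column_2']
```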

tables/automl/pipeline/log/.gitkeep

Whitespace-only changes.
tables/automl/pipeline/requirements.txt

Lines changed: 3 additions & 0 deletions
PyYAML>=5.1
futures>=3.1.0
DateTime>=4.3
tables/automl/pipeline/run_pipeline.py

Lines changed: 178 additions & 0 deletions
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import logging
import argparse

import tables_config
import tables_client
import tables_pipeline


def parse_arguments(argv):
    """Parses command line arguments."""

    parser = argparse.ArgumentParser(description='Args for Tables pipeline.')
    parser.add_argument(
        '--config_filename',
        required=True,
        type=str,
        help='The filepath for the YAML configuration file.')
    parser.add_argument(
        '--log_dir',
        required=False,
        type=str,
        help='The directory to generate a session log file in.')
    parser.add_argument(
        '--service_account_filename',
        required=False,
        type=str,
        help='The filepath for the JSON key used for OAuth.')
    parser.add_argument(
        '--console_log_level',
        required=False,
        type=str,
        default=logging.WARN,
        help='Controls the log level for the console display.')
    parser.add_argument(
        '--file_log_level',
        required=False,
        type=str,
        default=logging.INFO,
        help='Controls the log level to write to file. Set to logging.DEBUG '
             'to write out the full AutoML service responses (very verbose).')
    args, _ = parser.parse_known_args(args=argv[1:])

    # Parser for parameters to pass to TablesConfig
    param_parser = argparse.ArgumentParser(description='Args for config params.')

    # Resource parameters
    param_parser.add_argument(
        '--project',
        required=False,
        type=str,
        help='GCP project ID to run AutoML Tables on.')
    param_parser.add_argument(
        '--location',
        required=False,
        default='us-central1',
        type=str,
        help='GCP location to run AutoML Tables in.')

    # Runtime parameters
    param_parser.add_argument(
        '--build_dataset',
        action='store_const',
        const=True,
        help='Builds a new dataset, loads an old dataset otherwise.')
    param_parser.add_argument(
        '--build_model',
        action='store_const',
        const=True,
        help='Builds a new model, loads an old model otherwise.')
    param_parser.add_argument(
        '--make_prediction',
        action='store_const',
        const=True,
        help='Makes a batch prediction.')

    # Dataset parameters
    # Note that columns_dtype and columns_nullable must be set in YAML config.
    param_parser.add_argument(
        '--dataset_display_name',
        required=False,
        type=str,
        help='Name of the Tables Dataset (32 character max).')
    param_parser.add_argument(
        '--dataset_input_path',
        required=False,
        type=str,
        help=('Path to import the training data from, one of '
              'bq://project.dataset.table or gs://path/to/csv'))
    param_parser.add_argument(
        '--label_column',
        required=False,
        type=str,
        help='Label to train the model on, for regression or classification.')
    param_parser.add_argument(
        '--split_column',
        required=False,
        type=str,
        help='Explicitly defines "TRAIN"/"VALIDATION"/"TEST" split.')
    param_parser.add_argument(
        '--weight_column',
        required=False,
        type=str,
        help='Weights loss and metrics.')
    param_parser.add_argument(
        '--time_column',
        required=False,
        type=str,
        help='Date/timestamp to automatically split data on.')

    # Model parameters
    # Note that ignore_columns must be set in YAML config.
    param_parser.add_argument(
        '--model_display_name',
        required=False,
        type=str,
        help='Name of the Tables Model (32 character max).')
    param_parser.add_argument(
        '--train_hours',
        required=False,
        type=float,
        help='The number of hours to train the model for.')
    param_parser.add_argument(
        '--optimization_objective',
        required=False,
        type=str,
        help='Metric to optimize for in training.')

    # Predict parameters
    param_parser.add_argument(
        '--predict_input_path',
        required=False,
        type=str,
        help=('Path to import the batch prediction data from, one of '
              'bq://project.dataset.table or gs://path/to/csv'))
    param_parser.add_argument(
        '--predict_output_path',
        required=False,
        type=str,
        help=('Path to export batch predictions to, one of '
              'bq://project or gs://path'))
    params, _ = param_parser.parse_known_args(args=argv[1:])
    return args, params


def main():
    args, params = parse_arguments(sys.argv)
    config = tables_config.TablesConfig(args.config_filename, vars(params))
    client = tables_client.TablesClient(args.service_account_filename)
    pipeline = tables_pipeline.TablesPipeline(
        tables_config=config,
        tables_client=client,
        log_dir=args.log_dir,
        console_log_level=args.console_log_level,
        file_log_level=args.file_log_level)
    pipeline.run()


if __name__ == '__main__':
    main()
