From 92cf76335d74e89324d0a7b3a3f979db5f50c50e Mon Sep 17 00:00:00 2001 From: vuppalli Date: Fri, 5 Jun 2020 15:35:13 -0400 Subject: [PATCH 01/59] add data ingestion code --- .../data-ingestion/requirements-test.txt | 1 + .../data-ingestion/requirements.txt | 6 + .../data-ingestion/setup-test.py | 210 ++++++++++++++++++ data-science-onramp/data-ingestion/setup.py | 149 +++++++++++++ data-science-onramp/data-ingestion/setup.sh | 6 + 5 files changed, 372 insertions(+) create mode 100644 data-science-onramp/data-ingestion/requirements-test.txt create mode 100644 data-science-onramp/data-ingestion/requirements.txt create mode 100644 data-science-onramp/data-ingestion/setup-test.py create mode 100644 data-science-onramp/data-ingestion/setup.py create mode 100644 data-science-onramp/data-ingestion/setup.sh diff --git a/data-science-onramp/data-ingestion/requirements-test.txt b/data-science-onramp/data-ingestion/requirements-test.txt new file mode 100644 index 00000000000..781d4326c94 --- /dev/null +++ b/data-science-onramp/data-ingestion/requirements-test.txt @@ -0,0 +1 @@ +pytest==5.3.2 diff --git a/data-science-onramp/data-ingestion/requirements.txt b/data-science-onramp/data-ingestion/requirements.txt new file mode 100644 index 00000000000..f435423c623 --- /dev/null +++ b/data-science-onramp/data-ingestion/requirements.txt @@ -0,0 +1,6 @@ +grpcio==1.29.0 +google-auth==1.16.0 +google-auth-httplib2==0.0.3 +google-cloud==0.34.0 +google-cloud-storage==1.28.1 +google-cloud-dataproc==0.8.0 \ No newline at end of file diff --git a/data-science-onramp/data-ingestion/setup-test.py b/data-science-onramp/data-ingestion/setup-test.py new file mode 100644 index 00000000000..d827c805818 --- /dev/null +++ b/data-science-onramp/data-ingestion/setup-test.py @@ -0,0 +1,210 @@ +import os +import re + +import uuid + +from google.api_core.exceptions import GoogleAPICallError + +from google.cloud import dataproc_v1 as dataproc +from google.cloud import storage +from google.cloud.exceptions import NotFound + +import pytest + +waiting_cluster_callback = False + +# Set global variables +project = os.environ['GCLOUD_PROJECT'] +region = "us-central1" +zone = "us-central1-a" +cluster_name = 'setup-test-{}'.format(str(uuid.uuid4())) +bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4())) + + +@pytest.fixture(autouse=True) +def teardown(): + yield + + # Delete cluster + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': f'{region}-dataproc.googleapis.com:443' + }) + + try: + operation = cluster_client.delete_cluster(project, region, + cluster_name) + operation.result() + except GoogleAPICallError: + pass + + # Delete GCS bucket + storage_client = storage.Client() + try: + bucket = storage_client.get_bucket(bucket_name) + bucket.delete(force=True) + except NotFound: + pass + + +def test_setup(capsys): + '''Tests setup.py by submitting it to a dataproc cluster''' + + # Create GCS Bucket + storage_client = storage.Client() + bucket = storage_client.create_bucket(bucket_name) + + # Upload file + destination_blob_name = "setup.py" + blob = bucket.blob(destination_blob_name) + blob.upload_from_filename("setup.py") + + job_file_name = "gs://" + bucket_name + "/setup.py" + + # Create cluster configuration + zone_uri = \ + 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( + project, zone) + cluster_data = { + 'project_id': project, + 'cluster_name': cluster_name, + 'config': { + 'gce_cluster_config': { + 'zone_uri': zone_uri, + "metadata": { + "PIP_PACKAGES": 
"google-cloud-storage" + }, + }, + 'master_config': { + 'num_instances': 1, + 'machine_type_uri': 'n1-standard-8' + }, + 'worker_config': { + 'num_instances': 6, + 'machine_type_uri': 'n1-standard-8' + }, + "initialization_actions": [ + { + "executable_file": ("gs://dataproc-initialization-actions/" + "python/pip-install.sh"), + } + ], + "software_config": { + "image_version": "1.5.4-debian10", + "optional_components": [ + "ANACONDA" + ], + } + } + } + + # Create cluster using cluster client + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + }) + + cluster = cluster_client.create_cluster(project, region, cluster_data) + cluster.add_done_callback(callback) + + # Wait for cluster to provision + global waiting_cluster_callback + waiting_cluster_callback = True + + wait_for_cluster_creation() + + # Create job configuration + job_details = { + 'placement': { + 'cluster_name': cluster_name + }, + 'pyspark_job': { + 'main_python_file_uri': job_file_name, + 'args': [ + bucket_name, + "--test", + ], + "jar_file_uris": [ + "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" + ], + }, + } + + # Submit job to dataproc cluster + job_client = dataproc.JobControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + }) + + result = job_client.submit_job(project_id=project, region=region, + job=job_details) + + job_id = result.reference.job_id + print('Submitted job \"{}\".'.format(job_id)) + + # Wait for job to complete + wait_for_job(job_client, job_id) + + # Get job output + cluster_info = cluster_client.get_cluster(project, region, cluster_name) + bucket = storage_client.get_bucket(cluster_info.config.config_bucket) + output_blob = ( + 'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000' + .format(cluster_info.cluster_uuid, job_id)) + out = bucket.blob(output_blob).download_as_string().decode("utf-8") + + # tripDuration + assert re.search("[0-9] s", out) + assert re.search("[0-9] m", out) + assert re.search("[0-9] h", out) + + # station latitude & longitude + assert re.search(u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + + # birth_year + assert re.search("19[0-9][0-9]\\|", out) + assert re.search("20[0-9][0-9]\\|", out) + + # gender + assert "M" in out + assert "male" in out + assert "MALE" in out + assert "F" in out + assert "female" in out + assert "FEMALE" in out + assert "u" in out + assert "unknown" in out + assert "UNKNOWN" in out + + # customer_plan + assert "Subscriber" in out + assert "subscriber" in out + assert "SUBSCRIBER" in out + assert "sub" in out + assert "Customer" in out + assert "customer" in out + assert "CUSTOMER" in out + assert "cust" in out + + # Missing data + assert "null" in out + + +def callback(operation_future): + '''Sets a flag to stop waiting''' + global waiting_cluster_callback + waiting_cluster_callback = False + + +def wait_for_cluster_creation(): + '''Waits for cluster to create''' + while True: + if not waiting_cluster_callback: + break + + +def wait_for_job(job_client, job_id): + '''Waits for job to finish''' + while True: + job = job_client.get_job(project, region, job_id) + assert job.status.State.Name(job.status.state) != "ERROR" + + if job.status.State.Name(job.status.state) == "DONE": + return diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py new file mode 100644 index 00000000000..91a740b34d0 --- /dev/null +++ b/data-science-onramp/data-ingestion/setup.py @@ -0,0 +1,149 
@@ +from random import choice, choices, randint, seed +import sys + +from time import time_ns + +from google.cloud import bigquery + +from py4j.protocol import Py4JJavaError +from pyspark.sql import SparkSession + +from pyspark.sql.functions import UserDefinedFunction +from pyspark.sql.types import IntegerType, StringType + + +# Create a SparkSession under the name "setup". Viewable via the Spark UI +spark = SparkSession.builder.appName("setup").getOrCreate() + +bucket_name = sys.argv[1] +upload = True # Whether to upload data to BigQuery + +# Check whether or not results should be uploaded +try: + sys.argv[2] + upload = False +except IndexError: + print("Results will be uploaded to BigQuery") + +table = "bigquery-public-data.new_york_citibike.citibike_trips" + +# Check if table exists +try: + df = spark.read.format('bigquery').option('table', table).load() +except Py4JJavaError: + print(f"{table} does not exist. ") + sys.exit(0) + +# START MAKING DATA DIRTY + + +def random_select(items, cum_weights): + '''Picks an item according to the cumulative weights''' + return choices(items, cum_weights=cum_weights, k=1)[0] + + +def tripduration(duration): + '''Converts trip duration to other units''' + seconds = str(duration) + " s" + minutes = str(float(duration) / 60) + " min" + hours = str(float(duration) / 3600) + " h" + return random_select([seconds, minutes, hours, str(randint(-1000, -1))], + [0.3, 0.6, 0.9, 1]) + + +def station_name(name): + '''Replaces '&' with '/' with a 50% chance''' + return choice([name, name.replace("&", "/")]) + + +def usertype(user): + '''Manipulates the user type string''' + return choice([user, user.upper(), user.lower(), + "sub" if user == "Subscriber" else user, + "cust" if user == "Customer" else user]) + + +def gender(s): + '''Manipulates the gender string''' + return choice([s, s.upper(), s.lower(), + s[0] if len(s) > 0 else "", + s[0].lower() if len(s) > 0 else ""]) + + +def convertAngle(angle): + '''Converts long and lat to DMS notation''' + degrees = int(angle) + minutes = int((angle - degrees) * 60) + seconds = int((angle - degrees - minutes/60) * 3600) + new_angle = str(degrees) + u"\u00B0" + \ + str(minutes) + "'" + str(seconds) + '"' + return random_select([str(angle), new_angle], cum_weights=[0.55, 1]) + + +def dirty_data(proc_func, allow_none): + '''Master function returns a user defined function + that transforms the column data''' + def udf(col_value): + seed(hash(col_value) + time_ns()) + if col_value is None: + return col_value + elif allow_none: + return random_select([None, proc_func(col_value)], + cum_weights=[0.05, 1]) + else: + return proc_func(col_value) + return udf + + +def id(x): + return x + + +# Declare data transformations for each column in dataframe +udfs = [ + (dirty_data(tripduration, True), StringType()), # tripduration + (dirty_data(id, True), StringType()), # starttime + (dirty_data(id, True), StringType()), # stoptime + (id, IntegerType()), # start_station_id + (dirty_data(station_name, False), StringType()), # start_station_name + (dirty_data(convertAngle, True), StringType()), # start_station_latitude + (dirty_data(convertAngle, True), StringType()), # start_station_longitude + (id, IntegerType()), # end_station_id + (dirty_data(station_name, False), StringType()), # end_station_name + (dirty_data(convertAngle, True), StringType()), # end_station_latitude + (dirty_data(convertAngle, True), StringType()), # end_station_longitude + (id, IntegerType()), # bikeid + (dirty_data(usertype, False), StringType()), # usertype + (id, 
IntegerType()), # birth_year + (dirty_data(gender, False), StringType()), # gender + (id, StringType()), # customer_plan +] + +# Apply dirty transformations to df +names = df.schema.names +new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) + for udf, column, name in zip(udfs, df.columns, names)]) + +# Duplicate about 0.01% of the rows +dup_df = new_df.sample(False, 0.0001, seed=42) + +# Create final dirty dataframe +df = new_df.union(dup_df) +df.sample(False, 0.0001, seed=50).show(n=200) +print("Dataframe sample printed") + +# Write to BigQuery +if upload: + # Create BigQuery Dataset + client = bigquery.Client() + dataset_id = '{}.new_york_citibike_trips'.format(client.project) + dataset = bigquery.Dataset(dataset_id) + dataset.location = "US" + dataset = client.create_dataset(dataset) + + # Saving the data to BigQuery + spark.conf.set('temporaryGcsBucket', bucket_name) + + df.write.format('bigquery') \ + .option('table', dataset_id + ".RAW_DATA") \ + .save() diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh new file mode 100644 index 00000000000..12730a3a6fe --- /dev/null +++ b/data-science-onramp/data-ingestion/setup.sh @@ -0,0 +1,6 @@ +# Submit a PySpark job via the Cloud Dataproc Jobs API +gcloud dataproc jobs submit pyspark \ + --cluster ${CLUSTER_NAME} \ + --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ + --driver-log-levels root=FATAL \ + setup.py -- ${BUCKET_NAME} From 739114a595c9698737bc853a06f52dae7ca63291 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 8 Jun 2020 10:24:56 -0400 Subject: [PATCH 02/59] begin addressing comments --- data-science-onramp/data-ingestion/setup.py | 57 +++---- data-science-onramp/data-ingestion/setup.sh | 3 + .../{setup-test.py => setup_test.py} | 145 +++++++----------- 3 files changed, 90 insertions(+), 115 deletions(-) rename data-science-onramp/data-ingestion/{setup-test.py => setup_test.py} (59%) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 91a740b34d0..dc869903c84 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,4 +1,4 @@ -from random import choice, choices, randint, seed +import random import sys from time import time_ns @@ -19,10 +19,10 @@ upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded -try: - sys.argv[2] +if len(sys.arv) > 1: upload = False -except IndexError: + print("Not uploading results to BigQuery") +else: print("Results will be uploaded to BigQuery") table = "bigquery-public-data.new_york_citibike.citibike_trips" @@ -37,59 +37,60 @@ # START MAKING DATA DIRTY -def random_select(items, cum_weights): +def random_select(items, weights): '''Picks an item according to the cumulative weights''' - return choices(items, cum_weights=cum_weights, k=1)[0] + return random.choices(items, weights=weights, k=1)[0] -def tripduration(duration): +def trip_duration(duration): '''Converts trip duration to other units''' seconds = str(duration) + " s" minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" - return random_select([seconds, minutes, hours, str(randint(-1000, -1))], - [0.3, 0.6, 0.9, 1]) + return random_select([seconds, minutes, hours, + str(random.randint(-1000, -1))], + [0.3, 0.3, 0.3, 0.1]) def station_name(name): '''Replaces '&' with '/' with a 50% chance''' - return choice([name, name.replace("&", "/")]) + return random.choice([name, 
name.replace("&", "/")]) -def usertype(user): +def user_type(user): '''Manipulates the user type string''' - return choice([user, user.upper(), user.lower(), - "sub" if user == "Subscriber" else user, - "cust" if user == "Customer" else user]) + return random.choice([user, user.upper(), user.lower(), + "sub" if user == "Subscriber" else user, + "cust" if user == "Customer" else user]) def gender(s): '''Manipulates the gender string''' - return choice([s, s.upper(), s.lower(), - s[0] if len(s) > 0 else "", - s[0].lower() if len(s) > 0 else ""]) + return random.choice([s, s.upper(), s.lower(), + s[0] if len(s) > 0 else "", + s[0].lower() if len(s) > 0 else ""]) -def convertAngle(angle): +def convert_angle(angle): '''Converts long and lat to DMS notation''' degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) new_angle = str(degrees) + u"\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' - return random_select([str(angle), new_angle], cum_weights=[0.55, 1]) + return random_select([str(angle), new_angle], [0.55, 0.45]) def dirty_data(proc_func, allow_none): '''Master function returns a user defined function that transforms the column data''' def udf(col_value): - seed(hash(col_value) + time_ns()) + random.seed(hash(col_value) + time_ns()) if col_value is None: return col_value elif allow_none: return random_select([None, proc_func(col_value)], - cum_weights=[0.05, 1]) + [0.05, 0.95]) else: return proc_func(col_value) return udf @@ -101,19 +102,19 @@ def id(x): # Declare data transformations for each column in dataframe udfs = [ - (dirty_data(tripduration, True), StringType()), # tripduration + (dirty_data(trip_duration, True), StringType()), # tripduration (dirty_data(id, True), StringType()), # starttime (dirty_data(id, True), StringType()), # stoptime (id, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convertAngle, True), StringType()), # start_station_latitude - (dirty_data(convertAngle, True), StringType()), # start_station_longitude + (dirty_data(convert_angle, True), StringType()), # start_station_latitude + (dirty_data(convert_angle, True), StringType()), # start_station_longitude (id, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convertAngle, True), StringType()), # end_station_latitude - (dirty_data(convertAngle, True), StringType()), # end_station_longitude + (dirty_data(convert_angle, True), StringType()), # end_station_latitude + (dirty_data(convert_angle, True), StringType()), # end_station_longitude (id, IntegerType()), # bikeid - (dirty_data(usertype, False), StringType()), # usertype + (dirty_data(user_type, False), StringType()), # usertype (id, IntegerType()), # birth_year (dirty_data(gender, False), StringType()), # gender (id, StringType()), # customer_plan @@ -136,7 +137,7 @@ def id(x): if upload: # Create BigQuery Dataset client = bigquery.Client() - dataset_id = '{}.new_york_citibike_trips'.format(client.project) + dataset_id = f'{client.project}.new_york_citibike_trips' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index 12730a3a6fe..f78c8cd120b 100644 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -1,4 +1,7 @@ # Submit a PySpark job via the Cloud Dataproc 
Jobs API +# Requires having CLUSTER_NAME and BUCKET_NAME set as +# environment variables + gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ diff --git a/data-science-onramp/data-ingestion/setup-test.py b/data-science-onramp/data-ingestion/setup_test.py similarity index 59% rename from data-science-onramp/data-ingestion/setup-test.py rename to data-science-onramp/data-ingestion/setup_test.py index d827c805818..54f3d20e902 100644 --- a/data-science-onramp/data-ingestion/setup-test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -11,62 +11,25 @@ import pytest -waiting_cluster_callback = False # Set global variables -project = os.environ['GCLOUD_PROJECT'] -region = "us-central1" -zone = "us-central1-a" -cluster_name = 'setup-test-{}'.format(str(uuid.uuid4())) -bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4())) +PROJECT = os.environ['GCLOUD_PROJECT'] +REGION = "us-central1" +ZONE = "us-central1-a" +CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' +BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' +BUCKET = None -@pytest.fixture(autouse=True) -def teardown(): - yield - - # Delete cluster - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{region}-dataproc.googleapis.com:443' - }) - - try: - operation = cluster_client.delete_cluster(project, region, - cluster_name) - operation.result() - except GoogleAPICallError: - pass - - # Delete GCS bucket - storage_client = storage.Client() - try: - bucket = storage_client.get_bucket(bucket_name) - bucket.delete(force=True) - except NotFound: - pass - - -def test_setup(capsys): - '''Tests setup.py by submitting it to a dataproc cluster''' - - # Create GCS Bucket - storage_client = storage.Client() - bucket = storage_client.create_bucket(bucket_name) - - # Upload file - destination_blob_name = "setup.py" - blob = bucket.blob(destination_blob_name) - blob.upload_from_filename("setup.py") - - job_file_name = "gs://" + bucket_name + "/setup.py" +@pytest.fixture(autouse=True) +def setup_and_teardown_cluster(): # Create cluster configuration zone_uri = \ - 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( - project, zone) + f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}' cluster_data = { - 'project_id': project, - 'cluster_name': cluster_name, + 'project_id': PROJECT, + 'cluster_name': CLUSTER_NAME, 'config': { 'gce_cluster_config': { 'zone_uri': zone_uri, @@ -99,27 +62,59 @@ def test_setup(capsys): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - cluster = cluster_client.create_cluster(project, region, cluster_data) - cluster.add_done_callback(callback) + operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) # Wait for cluster to provision - global waiting_cluster_callback - waiting_cluster_callback = True + operation.result() - wait_for_cluster_creation() + yield + + # Delete cluster + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' + }) + + operation = cluster_client.delete_cluster(PROJECT, REGION, + CLUSTER_NAME) + operation.result() + + +@pytest.fixture(autouse=True) +def setup_and_teardown_bucket(): + global BUCKET + # Create GCS Bucket + storage_client = storage.Client() + BUCKET = 
storage_client.create_bucket(BUCKET_NAME) + + yield + + # Delete GCS bucket + storage_client = storage.Client() + bucket = storage_client.get_bucket(BUCKET_NAME) + bucket.delete(force=True) + +def test_setup(capsys): + '''Tests setup.py by submitting it to a dataproc cluster''' + + # Upload file + destination_blob_name = "setup.py" + blob = BUCKET.blob(destination_blob_name) + blob.upload_from_filename("setup.py") + + job_file_name = "gs://" + BUCKET_NAME + "/setup.py" # Create job configuration job_details = { 'placement': { - 'cluster_name': cluster_name + 'cluster_name': CLUSTER_NAME }, 'pyspark_job': { 'main_python_file_uri': job_file_name, 'args': [ - bucket_name, + BUCKET_NAME, "--test", ], "jar_file_uris": [ @@ -130,25 +125,21 @@ def test_setup(capsys): # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - result = job_client.submit_job(project_id=project, region=region, + response = job_client.submit_job(project_id=PROJECT, region=REGION, job=job_details) - job_id = result.reference.job_id + job_id = response.reference.job_id print('Submitted job \"{}\".'.format(job_id)) # Wait for job to complete - wait_for_job(job_client, job_id) + result = response.add_done_callback(callback) # Get job output - cluster_info = cluster_client.get_cluster(project, region, cluster_name) - bucket = storage_client.get_bucket(cluster_info.config.config_bucket) - output_blob = ( - 'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000' - .format(cluster_info.cluster_uuid, job_id)) - out = bucket.blob(output_blob).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri() + ".000000000" + output = BUCKET.blob(output_location).download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -186,25 +177,5 @@ def test_setup(capsys): # Missing data assert "null" in out - def callback(operation_future): - '''Sets a flag to stop waiting''' - global waiting_cluster_callback - waiting_cluster_callback = False - - -def wait_for_cluster_creation(): - '''Waits for cluster to create''' - while True: - if not waiting_cluster_callback: - break - - -def wait_for_job(job_client, job_id): - '''Waits for job to finish''' - while True: - job = job_client.get_job(project, region, job_id) - assert job.status.State.Name(job.status.state) != "ERROR" - - if job.status.State.Name(job.status.state) == "DONE": - return + return operation_future.result() From 681eaf3c85c3e159796d12cbcdf6c68c03a906f0 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Mon, 8 Jun 2020 11:26:33 -0400 Subject: [PATCH 03/59] change submit job --- data-science-onramp/data-ingestion/noxfile.py | 225 ++++++++++++++++++ .../data-ingestion/setup_test.py | 9 +- 2 files changed, 228 insertions(+), 6 deletions(-) create mode 100644 data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/noxfile.py b/data-science-onramp/data-ingestion/noxfile.py new file mode 100644 index 00000000000..b23055f14a6 --- /dev/null +++ b/data-science-onramp/data-ingestion/noxfile.py @@ -0,0 +1,225 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +from pathlib import Path +import sys + +import nox + + +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING +# DO NOT EDIT THIS FILE EVER! +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING + +# Copy `noxfile_config.py` to your directory and modify it instead. + + +# `TEST_CONFIG` dict is a configuration hook that allows users to +# modify the test configurations. The values here should be in sync +# with `noxfile_config.py`. Users will copy `noxfile_config.py` into +# their directory and modify it. + +TEST_CONFIG = { + # You can opt out from the test for specific Python versions. + 'ignored_versions': ["2.7"], + + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + 'gcloud_project_env': 'GCLOUD_PROJECT', + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + 'envs': {}, +} + + +try: + # Ensure we can import noxfile_config in the project's directory. + sys.path.append('.') + from noxfile_config import TEST_CONFIG_OVERRIDE +except ImportError as e: + print("No user noxfile_config found: detail: {}".format(e)) + TEST_CONFIG_OVERRIDE = {} + +# Update the TEST_CONFIG with the user supplied values. +TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) + + +def get_pytest_env_vars(): + """Returns a dict for pytest invocation.""" + ret = {} + + # Override the GCLOUD_PROJECT and the alias. + env_key = TEST_CONFIG['gcloud_project_env'] + # This should error out if not set. + ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] + ret['GCLOUD_PROJECT'] = os.environ[env_key] + + # Apply user supplied envs. + ret.update(TEST_CONFIG['envs']) + return ret + + +# DO NOT EDIT - automatically generated. +# All versions used to tested samples. +ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] + +# Any default versions that should be ignored. +IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] + +TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) + +INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) +# +# Style Checks +# + + +def _determine_local_import_names(start_dir): + """Determines all import names that should be considered "local". + + This is used when running the linter to insure that import order is + properly checked. + """ + file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] + return [ + basename + for basename, extension in file_ext_pairs + if extension == ".py" + or os.path.isdir(os.path.join(start_dir, basename)) + and basename not in ("__pycache__") + ] + + +# Linting with flake8. 
+# +# We ignore the following rules: +# E203: whitespace before ‘:’ +# E266: too many leading ‘#’ for block comment +# E501: line too long +# I202: Additional newline in a section of imports +# +# We also need to specify the rules which are ignored by default: +# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] +FLAKE8_COMMON_ARGS = [ + "--show-source", + "--builtin=gettext", + "--max-complexity=20", + "--import-order-style=google", + "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", + "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", + "--max-line-length=88", +] + + +@nox.session +def lint(session): + session.install("flake8", "flake8-import-order") + + local_names = _determine_local_import_names(".") + args = FLAKE8_COMMON_ARGS + [ + "--application-import-names", + ",".join(local_names), + "." + ] + session.run("flake8", *args) + + +# +# Sample Tests +# + + +PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] + + +def _session_tests(session, post_install=None): + """Runs py.test for a particular project.""" + if os.path.exists("requirements.txt"): + session.install("-r", "requirements.txt") + + if os.path.exists("requirements-test.txt"): + session.install("-r", "requirements-test.txt") + + if INSTALL_LIBRARY_FROM_SOURCE: + session.install("-e", _get_repo_root()) + + if post_install: + post_install(session) + + session.run( + "pytest", + *(PYTEST_COMMON_ARGS + session.posargs), + # Pytest will return 5 when no tests are collected. This can happen + # on travis where slow and flaky tests are excluded. + # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html + success_codes=[0, 5], + env=get_pytest_env_vars() + ) + + +@nox.session(python=ALL_VERSIONS) +def py(session): + """Runs py.test for a sample using the specified version of Python.""" + if session.python in TESTED_VERSIONS: + _session_tests(session) + else: + session.skip("SKIPPED: {} tests are disabled for this sample.".format( + session.python + )) + + +# +# Readmegen +# + + +def _get_repo_root(): + """ Returns the root folder of the project. """ + # Get root of this repository. Assume we don't have directories nested deeper than 10 items. 
+ p = Path(os.getcwd()) + for i in range(10): + if p is None: + break + if Path(p / ".git").exists(): + return str(p) + p = p.parent + raise Exception("Unable to detect repository root.") + + +GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) + + +@nox.session +@nox.parametrize("path", GENERATED_READMES) +def readmegen(session, path): + """(Re-)generates the readme for a sample.""" + session.install("jinja2", "pyyaml") + dir_ = os.path.dirname(path) + + if os.path.exists(os.path.join(dir_, "requirements.txt")): + session.install("-r", os.path.join(dir_, "requirements.txt")) + + in_file = os.path.join(dir_, "README.rst.in") + session.run( + "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file + ) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 54f3d20e902..919dcc4f35c 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -128,17 +128,14 @@ def test_setup(capsys): 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - response = job_client.submit_job(project_id=PROJECT, region=REGION, + response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, job=job_details) - job_id = response.reference.job_id - print('Submitted job \"{}\".'.format(job_id)) - # Wait for job to complete - result = response.add_done_callback(callback) + result = response.result() # Get job output - output_location = result.driver_output_resource_uri() + ".000000000" + output_location = result.driver_output_resource_uri + ".000000000" output = BUCKET.blob(output_location).download_as_string().decode("utf-8") # tripDuration From 4afbf1c5935e71882524a29511266b8dc2bf2a3a Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 8 Jun 2020 14:43:48 -0400 Subject: [PATCH 04/59] address code structure and global variable issues --- data-science-onramp/data-ingestion/noxfile.py | 225 ------------------ data-science-onramp/data-ingestion/setup.py | 125 +++++----- .../data-ingestion/setup_test.py | 17 +- 3 files changed, 78 insertions(+), 289 deletions(-) delete mode 100644 data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/noxfile.py b/data-science-onramp/data-ingestion/noxfile.py deleted file mode 100644 index b23055f14a6..00000000000 --- a/data-science-onramp/data-ingestion/noxfile.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -from pathlib import Path -import sys - -import nox - - -# WARNING - WARNING - WARNING - WARNING - WARNING -# WARNING - WARNING - WARNING - WARNING - WARNING -# DO NOT EDIT THIS FILE EVER! -# WARNING - WARNING - WARNING - WARNING - WARNING -# WARNING - WARNING - WARNING - WARNING - WARNING - -# Copy `noxfile_config.py` to your directory and modify it instead. 
- - -# `TEST_CONFIG` dict is a configuration hook that allows users to -# modify the test configurations. The values here should be in sync -# with `noxfile_config.py`. Users will copy `noxfile_config.py` into -# their directory and modify it. - -TEST_CONFIG = { - # You can opt out from the test for specific Python versions. - 'ignored_versions': ["2.7"], - - # An envvar key for determining the project id to use. Change it - # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a - # build specific Cloud project. You can also use your own string - # to use your own Cloud project. - 'gcloud_project_env': 'GCLOUD_PROJECT', - # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', - - # A dictionary you want to inject into your test. Don't put any - # secrets here. These values will override predefined values. - 'envs': {}, -} - - -try: - # Ensure we can import noxfile_config in the project's directory. - sys.path.append('.') - from noxfile_config import TEST_CONFIG_OVERRIDE -except ImportError as e: - print("No user noxfile_config found: detail: {}".format(e)) - TEST_CONFIG_OVERRIDE = {} - -# Update the TEST_CONFIG with the user supplied values. -TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) - - -def get_pytest_env_vars(): - """Returns a dict for pytest invocation.""" - ret = {} - - # Override the GCLOUD_PROJECT and the alias. - env_key = TEST_CONFIG['gcloud_project_env'] - # This should error out if not set. - ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] - ret['GCLOUD_PROJECT'] = os.environ[env_key] - - # Apply user supplied envs. - ret.update(TEST_CONFIG['envs']) - return ret - - -# DO NOT EDIT - automatically generated. -# All versions used to tested samples. -ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] - -# Any default versions that should be ignored. -IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] - -TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) - -INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) -# -# Style Checks -# - - -def _determine_local_import_names(start_dir): - """Determines all import names that should be considered "local". - - This is used when running the linter to insure that import order is - properly checked. - """ - file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] - return [ - basename - for basename, extension in file_ext_pairs - if extension == ".py" - or os.path.isdir(os.path.join(start_dir, basename)) - and basename not in ("__pycache__") - ] - - -# Linting with flake8. -# -# We ignore the following rules: -# E203: whitespace before ‘:’ -# E266: too many leading ‘#’ for block comment -# E501: line too long -# I202: Additional newline in a section of imports -# -# We also need to specify the rules which are ignored by default: -# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] -FLAKE8_COMMON_ARGS = [ - "--show-source", - "--builtin=gettext", - "--max-complexity=20", - "--import-order-style=google", - "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", - "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", - "--max-line-length=88", -] - - -@nox.session -def lint(session): - session.install("flake8", "flake8-import-order") - - local_names = _determine_local_import_names(".") - args = FLAKE8_COMMON_ARGS + [ - "--application-import-names", - ",".join(local_names), - "." 
- ] - session.run("flake8", *args) - - -# -# Sample Tests -# - - -PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] - - -def _session_tests(session, post_install=None): - """Runs py.test for a particular project.""" - if os.path.exists("requirements.txt"): - session.install("-r", "requirements.txt") - - if os.path.exists("requirements-test.txt"): - session.install("-r", "requirements-test.txt") - - if INSTALL_LIBRARY_FROM_SOURCE: - session.install("-e", _get_repo_root()) - - if post_install: - post_install(session) - - session.run( - "pytest", - *(PYTEST_COMMON_ARGS + session.posargs), - # Pytest will return 5 when no tests are collected. This can happen - # on travis where slow and flaky tests are excluded. - # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html - success_codes=[0, 5], - env=get_pytest_env_vars() - ) - - -@nox.session(python=ALL_VERSIONS) -def py(session): - """Runs py.test for a sample using the specified version of Python.""" - if session.python in TESTED_VERSIONS: - _session_tests(session) - else: - session.skip("SKIPPED: {} tests are disabled for this sample.".format( - session.python - )) - - -# -# Readmegen -# - - -def _get_repo_root(): - """ Returns the root folder of the project. """ - # Get root of this repository. Assume we don't have directories nested deeper than 10 items. - p = Path(os.getcwd()) - for i in range(10): - if p is None: - break - if Path(p / ".git").exists(): - return str(p) - p = p.parent - raise Exception("Unable to detect repository root.") - - -GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) - - -@nox.session -@nox.parametrize("path", GENERATED_READMES) -def readmegen(session, path): - """(Re-)generates the readme for a sample.""" - session.install("jinja2", "pyyaml") - dir_ = os.path.dirname(path) - - if os.path.exists(os.path.join(dir_, "requirements.txt")): - session.install("-r", os.path.join(dir_, "requirements.txt")) - - in_file = os.path.join(dir_, "README.rst.in") - session.run( - "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file - ) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index dc869903c84..da162e1c91d 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -12,31 +12,11 @@ from pyspark.sql.types import IntegerType, StringType -# Create a SparkSession under the name "setup". Viewable via the Spark UI -spark = SparkSession.builder.appName("setup").getOrCreate() +BUCKET_NAME = sys.argv[1] +TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -bucket_name = sys.argv[1] -upload = True # Whether to upload data to BigQuery - -# Check whether or not results should be uploaded -if len(sys.arv) > 1: - upload = False - print("Not uploading results to BigQuery") -else: - print("Results will be uploaded to BigQuery") - -table = "bigquery-public-data.new_york_citibike.citibike_trips" - -# Check if table exists -try: - df = spark.read.format('bigquery').option('table', table).load() -except Py4JJavaError: - print(f"{table} does not exist. 
") - sys.exit(0) # START MAKING DATA DIRTY - - def random_select(items, weights): '''Picks an item according to the cumulative weights''' return random.choices(items, weights=weights, k=1)[0] @@ -81,6 +61,8 @@ def convert_angle(angle): return random_select([str(angle), new_angle], [0.55, 0.45]) +# This function is nested since a UserDefinedFunction is +# expected to take a single argument def dirty_data(proc_func, allow_none): '''Master function returns a user defined function that transforms the column data''' @@ -99,42 +81,9 @@ def udf(col_value): def id(x): return x +def write_to_bigquery(df): + '''Write a dataframe to BigQuery''' -# Declare data transformations for each column in dataframe -udfs = [ - (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(id, True), StringType()), # starttime - (dirty_data(id, True), StringType()), # stoptime - (id, IntegerType()), # start_station_id - (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convert_angle, True), StringType()), # start_station_latitude - (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (id, IntegerType()), # end_station_id - (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convert_angle, True), StringType()), # end_station_latitude - (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (id, IntegerType()), # bikeid - (dirty_data(user_type, False), StringType()), # usertype - (id, IntegerType()), # birth_year - (dirty_data(gender, False), StringType()), # gender - (id, StringType()), # customer_plan -] - -# Apply dirty transformations to df -names = df.schema.names -new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) - for udf, column, name in zip(udfs, df.columns, names)]) - -# Duplicate about 0.01% of the rows -dup_df = new_df.sample(False, 0.0001, seed=42) - -# Create final dirty dataframe -df = new_df.union(dup_df) -df.sample(False, 0.0001, seed=50).show(n=200) -print("Dataframe sample printed") - -# Write to BigQuery -if upload: # Create BigQuery Dataset client = bigquery.Client() dataset_id = f'{client.project}.new_york_citibike_trips' @@ -143,8 +92,68 @@ def id(x): dataset = client.create_dataset(dataset) # Saving the data to BigQuery - spark.conf.set('temporaryGcsBucket', bucket_name) + spark.conf.set('temporaryGcsBucket', BUCKET_NAME) df.write.format('bigquery') \ .option('table', dataset_id + ".RAW_DATA") \ .save() + +def main(): + # Create a SparkSession under the name "setup". Viewable via the Spark UI + spark = SparkSession.builder.appName("setup").getOrCreate() + + upload = True # Whether to upload data to BigQuery + + # Check whether or not results should be uploaded + if len(sys.argv) > 1: + upload = False + print("Not uploading results to BigQuery") + else: + print("Results will be uploaded to BigQuery") + + # Check if table exists + try: + df = spark.read.format('bigquery').option('table', TABLE).load() + except Py4JJavaError: + print(f"{TABLE} does not exist. 
") + sys.exit(0) + + # Declare data transformations for each column in dataframe + udfs = [ + (dirty_data(trip_duration, True), StringType()), # tripduration + (dirty_data(id, True), StringType()), # starttime + (dirty_data(id, True), StringType()), # stoptime + (id, IntegerType()), # start_station_id + (dirty_data(station_name, False), StringType()), # start_station_name + (dirty_data(convert_angle, True), StringType()), # start_station_latitude + (dirty_data(convert_angle, True), StringType()), # start_station_longitude + (id, IntegerType()), # end_station_id + (dirty_data(station_name, False), StringType()), # end_station_name + (dirty_data(convert_angle, True), StringType()), # end_station_latitude + (dirty_data(convert_angle, True), StringType()), # end_station_longitude + (id, IntegerType()), # bikeid + (dirty_data(user_type, False), StringType()), # usertype + (id, IntegerType()), # birth_year + (dirty_data(gender, False), StringType()), # gender + (id, StringType()), # customer_plan + ] + + # Apply dirty transformations to df + names = df.schema.names + new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) + for udf, column, name in zip(udfs, df.columns, names)]) + + # Duplicate about 0.01% of the rows + dup_df = new_df.sample(False, 0.0001, seed=42) + + # Create final dirty dataframe + df = new_df.union(dup_df) + df.sample(False, 0.0001, seed=50).show(n=200) + print("Dataframe sample printed") + + if upload: + write_to_bigquery(df) + + +if __name__ == '__main__': + main() diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 919dcc4f35c..e62d2cc1355 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -52,7 +52,7 @@ def setup_and_teardown_cluster(): } ], "software_config": { - "image_version": "1.5.4-debian10", + "image_version": "1.4-debian10", "optional_components": [ "ANACONDA" ], @@ -134,9 +134,17 @@ def test_setup(capsys): # Wait for job to complete result = response.result() + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + }) + + cluster_info = cluster_client.get_cluster(PROJECT, REGION, CLUSTER_NAME) + # Get job output - output_location = result.driver_output_resource_uri + ".000000000" - output = BUCKET.blob(output_location).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri + "000000000" # + "driveroutput.000000000" + storage_client = storage.Client() + bucket = storage_client.get_bucket(cluster_info.config.config_bucket) + output = bucket.blob(output_location).download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -173,6 +181,3 @@ def test_setup(capsys): # Missing data assert "null" in out - -def callback(operation_future): - return operation_future.result() From 744f80c805160fc6b72b6c26673f17202168b92a Mon Sep 17 00:00:00 2001 From: vuppalli Date: Mon, 8 Jun 2020 19:12:09 -0400 Subject: [PATCH 05/59] get dataproc job output and fix linting --- .gitignore | 1 + data-science-onramp/data-ingestion/setup.py | 14 ++++++---- .../data-ingestion/setup_test.py | 28 +++++++++---------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index c827e035649..369e7983b52 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ credentials.dat .DS_store env/ .idea +data-science-onramp/data-ingestion/noxfile.py diff --git 
a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index da162e1c91d..09046b75879 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -12,7 +12,7 @@ from pyspark.sql.types import IntegerType, StringType -BUCKET_NAME = sys.argv[1] +BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" @@ -81,7 +81,8 @@ def udf(col_value): def id(x): return x -def write_to_bigquery(df): + +def write_to_bigquery(spark, df): '''Write a dataframe to BigQuery''' # Create BigQuery Dataset @@ -98,6 +99,7 @@ def write_to_bigquery(df): .option('table', dataset_id + ".RAW_DATA") \ .save() + def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() @@ -143,16 +145,16 @@ def main(): new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) for udf, column, name in zip(udfs, df.columns, names)]) + new_df.sample(False, 0.0001, seed=50).show(n=100) + # Duplicate about 0.01% of the rows - dup_df = new_df.sample(False, 0.0001, seed=42) + dup_df = new_df.sample(True, 0.0001, seed=42) # Create final dirty dataframe df = new_df.union(dup_df) - df.sample(False, 0.0001, seed=50).show(n=200) - print("Dataframe sample printed") if upload: - write_to_bigquery(df) + write_to_bigquery(spark, df) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index e62d2cc1355..f55a155bc75 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -3,11 +3,8 @@ import uuid -from google.api_core.exceptions import GoogleAPICallError - from google.cloud import dataproc_v1 as dataproc from google.cloud import storage -from google.cloud.exceptions import NotFound import pytest @@ -52,7 +49,7 @@ def setup_and_teardown_cluster(): } ], "software_config": { - "image_version": "1.4-debian10", + "image_version": "1.5.4-debian10", "optional_components": [ "ANACONDA" ], @@ -96,6 +93,7 @@ def setup_and_teardown_bucket(): bucket = storage_client.get_bucket(BUCKET_NAME) bucket.delete(force=True) + def test_setup(capsys): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -129,22 +127,15 @@ def test_setup(capsys): }) response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, - job=job_details) + job=job_details) # Wait for job to complete result = response.result() - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) - }) - - cluster_info = cluster_client.get_cluster(PROJECT, REGION, CLUSTER_NAME) - # Get job output - output_location = result.driver_output_resource_uri + "000000000" # + "driveroutput.000000000" - storage_client = storage.Client() - bucket = storage_client.get_bucket(cluster_info.config.config_bucket) - output = bucket.blob(output_location).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri + ".000000000" + blob = get_blob_from_path(output_location) + out = blob.download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -181,3 +172,10 @@ def test_setup(capsys): # Missing data assert "null" in out + + +def get_blob_from_path(path): + bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] + bucket = storage.Client().get_bucket(bucket_name) + output_location = 
re.search("google-cloud-dataproc.+", path).group(0) + return bucket.blob(output_location) From 8cd7dc611e877b71b778ba91878e91b439f6c334 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 9 Jun 2020 15:32:02 -0400 Subject: [PATCH 06/59] fix PR comments --- .gitignore | 30 ----- data-science-onramp/data-ingestion/setup.py | 28 ++-- .../data-ingestion/setup_test.py | 125 ++++++++---------- 3 files changed, 73 insertions(+), 110 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 369e7983b52..00000000000 --- a/.gitignore +++ /dev/null @@ -1,30 +0,0 @@ -.coveralls.yml -*.pyc -.coverage -.tox -.pytest_cache -.ipynb_checkpoints -.executed_notebooks -coverage.xml -python-docs-samples.json -service-account.json -client-secrets.json -__pycache__ -*db\.sqlite3 -managed_vms/django_tutorial/static/* -**/migrations/* -lib -testing/resources/test-env.sh -testing/resources/service-account.json -testing/resources/client-secrets.json -secrets.tar -.cache -junit.xml -credentials.dat -.nox -.vscode/ -*sponge_log.xml -.DS_store -env/ -.idea -data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 09046b75879..cf61f81562a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -46,8 +46,8 @@ def user_type(user): def gender(s): '''Manipulates the gender string''' - return random.choice([s, s.upper(), s.lower(), - s[0] if len(s) > 0 else "", + return random.choice([s.upper(), s.lower(), + s[0].upper() if len(s) > 0 else "", s[0].lower() if len(s) > 0 else ""]) @@ -78,7 +78,9 @@ def udf(col_value): return udf -def id(x): +# This function is required because we need to apply a +# function for every column and some columns do not change +def identity(x): return x @@ -118,26 +120,26 @@ def main(): df = spark.read.format('bigquery').option('table', TABLE).load() except Py4JJavaError: print(f"{TABLE} does not exist. 
") - sys.exit(0) + return # Declare data transformations for each column in dataframe udfs = [ (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(id, True), StringType()), # starttime - (dirty_data(id, True), StringType()), # stoptime - (id, IntegerType()), # start_station_id + (dirty_data(identity, True), StringType()), # starttime + (dirty_data(identity, True), StringType()), # stoptime + (identity, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name (dirty_data(convert_angle, True), StringType()), # start_station_latitude (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (id, IntegerType()), # end_station_id + (identity, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name (dirty_data(convert_angle, True), StringType()), # end_station_latitude (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (id, IntegerType()), # bikeid + (identity, IntegerType()), # bikeid (dirty_data(user_type, False), StringType()), # usertype - (id, IntegerType()), # birth_year + (identity, IntegerType()), # birth_year (dirty_data(gender, False), StringType()), # gender - (id, StringType()), # customer_plan + (identity, StringType()), # customer_plan ] # Apply dirty transformations to df @@ -145,10 +147,10 @@ def main(): new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) for udf, column, name in zip(udfs, df.columns, names)]) - new_df.sample(False, 0.0001, seed=50).show(n=100) + new_df.sample(False, 0.0001).show(n=100) # Duplicate about 0.01% of the rows - dup_df = new_df.sample(True, 0.0001, seed=42) + dup_df = new_df.sample(True, 0.0001) # Create final dirty dataframe df = new_df.union(dup_df) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index f55a155bc75..aab08230028 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -12,24 +12,36 @@ # Set global variables PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" -ZONE = "us-central1-a" CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' - -BUCKET = None +DESTINATION_BLOB_NAME = "setup.py" +JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' +JOB_DETAILS = { # Job configuration + 'placement': { + 'cluster_name': CLUSTER_NAME + }, + 'pyspark_job': { + 'main_python_file_uri': JOB_FILE_NAME, + 'args': [ + BUCKET_NAME, + "--test", + ], + "jar_file_uris": [ + "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" + ], + }, +} @pytest.fixture(autouse=True) def setup_and_teardown_cluster(): # Create cluster configuration - zone_uri = \ - f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}' cluster_data = { 'project_id': PROJECT, 'cluster_name': CLUSTER_NAME, 'config': { 'gce_cluster_config': { - 'zone_uri': zone_uri, + 'zone_uri': '', "metadata": { "PIP_PACKAGES": "google-cloud-storage" }, @@ -59,9 +71,8 @@ def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) # Wait for cluster to provision @@ -70,10 +81,6 @@ def setup_and_teardown_cluster(): yield # Delete cluster - cluster_client = 
dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' - }) - operation = cluster_client.delete_cluster(PROJECT, REGION, CLUSTER_NAME) operation.result() @@ -81,53 +88,41 @@ def setup_and_teardown_cluster(): @pytest.fixture(autouse=True) def setup_and_teardown_bucket(): - global BUCKET # Create GCS Bucket storage_client = storage.Client() - BUCKET = storage_client.create_bucket(BUCKET_NAME) + bucket = storage_client.create_bucket(BUCKET_NAME) + + # Upload file + blob = bucket.blob(DESTINATION_BLOB_NAME) + blob.upload_from_filename("setup.py") yield # Delete GCS bucket - storage_client = storage.Client() bucket = storage_client.get_bucket(BUCKET_NAME) bucket.delete(force=True) -def test_setup(capsys): - '''Tests setup.py by submitting it to a dataproc cluster''' +def get_blob_from_path(path): + bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] + bucket = storage.Client().get_bucket(bucket_name) + output_location = re.search("google-cloud-dataproc.+", path).group(0) + return bucket.blob(output_location) - # Upload file - destination_blob_name = "setup.py" - blob = BUCKET.blob(destination_blob_name) - blob.upload_from_filename("setup.py") - job_file_name = "gs://" + BUCKET_NAME + "/setup.py" - - # Create job configuration - job_details = { - 'placement': { - 'cluster_name': CLUSTER_NAME - }, - 'pyspark_job': { - 'main_python_file_uri': job_file_name, - 'args': [ - BUCKET_NAME, - "--test", - ], - "jar_file_uris": [ - "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" - ], - }, - } +def is_in_table(value, out): + return re.search(f"\| *{value}\|", out) + + +def test_setup(): + '''Tests setup.py by submitting it to a dataproc cluster''' # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, - job=job_details) + job=JOB_DETAILS) # Wait for job to complete result = response.result() @@ -150,32 +145,28 @@ def test_setup(capsys): assert re.search("20[0-9][0-9]\\|", out) # gender - assert "M" in out - assert "male" in out - assert "MALE" in out - assert "F" in out - assert "female" in out - assert "FEMALE" in out - assert "u" in out - assert "unknown" in out - assert "UNKNOWN" in out + assert is_in_table("M", out) + assert is_in_table("m", out) + assert is_in_table("male", out) + assert is_in_table("MALE", out) + assert is_in_table("F", out) + assert is_in_table("f", out) + assert is_in_table("female", out) + assert is_in_table("FEMALE", out) + assert is_in_table("U", out) + assert is_in_table("u", out) + assert is_in_table("unknown", out) + assert is_in_table("UNKNOWN", out) # customer_plan - assert "Subscriber" in out - assert "subscriber" in out - assert "SUBSCRIBER" in out - assert "sub" in out - assert "Customer" in out - assert "customer" in out - assert "CUSTOMER" in out - assert "cust" in out + assert is_in_table("Subscriber", out) + assert is_in_table("subscriber", out) + assert is_in_table("SUBSCRIBER", out) + assert is_in_table("sub", out) + assert is_in_table("Customer", out) + assert is_in_table("customer", out) + assert is_in_table("CUSTOMER", out) + assert is_in_table("cust", out) # Missing data - assert "null" in out - - -def get_blob_from_path(path): - bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] - bucket = storage.Client().get_bucket(bucket_name) - 
output_location = re.search("google-cloud-dataproc.+", path).group(0) - return bucket.blob(output_location) + assert is_in_table("null", out) From 81265d29c7b7e613cdb247871c89899833d89694 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 9 Jun 2020 16:01:52 -0400 Subject: [PATCH 07/59] linting and global vars --- .../data-ingestion/setup_test.py | 68 +++++++++---------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index aab08230028..e9358de912c 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -31,49 +31,47 @@ ], }, } - - -@pytest.fixture(autouse=True) -def setup_and_teardown_cluster(): - # Create cluster configuration - cluster_data = { - 'project_id': PROJECT, - 'cluster_name': CLUSTER_NAME, - 'config': { - 'gce_cluster_config': { - 'zone_uri': '', - "metadata": { - "PIP_PACKAGES": "google-cloud-storage" - }, +CLUSTER_DATA = { # Create cluster configuration + 'project_id': PROJECT, + 'cluster_name': CLUSTER_NAME, + 'config': { + 'gce_cluster_config': { + 'zone_uri': '', + "metadata": { + "PIP_PACKAGES": "google-cloud-storage" }, - 'master_config': { - 'num_instances': 1, - 'machine_type_uri': 'n1-standard-8' - }, - 'worker_config': { - 'num_instances': 6, - 'machine_type_uri': 'n1-standard-8' - }, - "initialization_actions": [ - { - "executable_file": ("gs://dataproc-initialization-actions/" - "python/pip-install.sh"), - } - ], - "software_config": { - "image_version": "1.5.4-debian10", - "optional_components": [ - "ANACONDA" - ], + }, + 'master_config': { + 'num_instances': 1, + 'machine_type_uri': 'n1-standard-8' + }, + 'worker_config': { + 'num_instances': 6, + 'machine_type_uri': 'n1-standard-8' + }, + "initialization_actions": [ + { + "executable_file": ("gs://dataproc-initialization-actions/" + "python/pip-install.sh"), } + ], + "software_config": { + "image_version": "1.5.4-debian10", + "optional_components": [ + "ANACONDA" + ], } } +} + +@pytest.fixture(autouse=True) +def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) + operation = cluster_client.create_cluster(PROJECT, REGION, CLUSTER_DATA) # Wait for cluster to provision operation.result() @@ -111,7 +109,7 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\| *{value}\|", out) + return re.search(f"\\| *{value}\\|", out) def test_setup(): From 3e86bda7c1da37c4201792b792003baa2147559c Mon Sep 17 00:00:00 2001 From: vuppalli Date: Wed, 10 Jun 2020 11:27:11 -0400 Subject: [PATCH 08/59] address Brad PR comments --- data-science-onramp/data-ingestion/setup.py | 34 +++++++------------ .../data-ingestion/setup_test.py | 3 -- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index cf61f81562a..7f5efa28e0a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,13 +1,10 @@ import random import sys - from time import time_ns from google.cloud import bigquery - from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession - from pyspark.sql.functions import UserDefinedFunction from pyspark.sql.types import IntegerType, StringType @@ 
-56,7 +53,7 @@ def convert_angle(angle): degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) - new_angle = str(degrees) + u"\u00B0" + \ + new_angle = str(degrees) + "\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' return random_select([str(angle), new_angle], [0.55, 0.45]) @@ -78,13 +75,7 @@ def udf(col_value): return udf -# This function is required because we need to apply a -# function for every column and some columns do not change -def identity(x): - return x - - -def write_to_bigquery(spark, df): +def write_to_bigquery(df): '''Write a dataframe to BigQuery''' # Create BigQuery Dataset @@ -95,10 +86,9 @@ def write_to_bigquery(spark, df): dataset = client.create_dataset(dataset) # Saving the data to BigQuery - spark.conf.set('temporaryGcsBucket', BUCKET_NAME) - df.write.format('bigquery') \ .option('table', dataset_id + ".RAW_DATA") \ + .option("temporaryGcsBucket", BUCKET_NAME) \ .save() @@ -109,7 +99,7 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if len(sys.argv) > 1: + if len(sys.argv) > 2: upload = False print("Not uploading results to BigQuery") else: @@ -125,21 +115,21 @@ def main(): # Declare data transformations for each column in dataframe udfs = [ (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(identity, True), StringType()), # starttime - (dirty_data(identity, True), StringType()), # stoptime - (identity, IntegerType()), # start_station_id + (dirty_data(lambda x: x, True), StringType()), # starttime + (dirty_data(lambda x: x, True), StringType()), # stoptime + (lambda x: x, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name (dirty_data(convert_angle, True), StringType()), # start_station_latitude (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (identity, IntegerType()), # end_station_id + (lambda x: x, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name (dirty_data(convert_angle, True), StringType()), # end_station_latitude (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (identity, IntegerType()), # bikeid + (lambda x: x, IntegerType()), # bikeid (dirty_data(user_type, False), StringType()), # usertype - (identity, IntegerType()), # birth_year + (lambda x: x, IntegerType()), # birth_year (dirty_data(gender, False), StringType()), # gender - (identity, StringType()), # customer_plan + (lambda x: x, StringType()), # customer_plan ] # Apply dirty transformations to df @@ -156,7 +146,7 @@ def main(): df = new_df.union(dup_df) if upload: - write_to_bigquery(spark, df) + write_to_bigquery(df) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index e9358de912c..2aa82535d79 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -1,14 +1,11 @@ import os import re - import uuid from google.cloud import dataproc_v1 as dataproc from google.cloud import storage - import pytest - # Set global variables PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" From 580c8e1078e9480ef30d3083522fd2c467c4f1b1 Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Thu, 11 Jun 2020 11:45:10 -0400 Subject: [PATCH 09/59] broken clean.py --- data-science-onramp/data-processing/clean.py | 44 ++++++++++++++++++++ 1 file changed, 
44 insertions(+) create mode 100644 data-science-onramp/data-processing/clean.py diff --git a/data-science-onramp/data-processing/clean.py b/data-science-onramp/data-processing/clean.py new file mode 100644 index 00000000000..0bca32d3299 --- /dev/null +++ b/data-science-onramp/data-processing/clean.py @@ -0,0 +1,44 @@ +import os +import sys + +from py4j.protocol import Py4JJavaError +from pyspark.sql import SparkSession +from pyspark.sql.functions import UserDefinedFunction, lit +from pyspark.sql.types import IntegerType, StringType + + +PROJECT_ID = sys.argv[1] +BUCKET_NAME = sys.argv[2] +TABLE = f'{PROJECT_ID}.new_york_citibike_trips.RAW_DATA' + +def station_name(name): + if name: + return name.replace('/', '&') + else: + return '' + +def main(): + '''...''' + # Create a SparkSession under the name 'clean'. Viewable via the Spark UI + spark = SparkSession.builder.appName('clean').getOrCreate() + + # Check if table exists + + try: + df = spark.read.format('bigquery').option('table', TABLE).load() + except Py4JJavaError: + print(f"{TABLE} does not exist. ") + return + + udf_map = { + 'start_station_name': (station_name, StringType()) + } + + for name, (func, col_type) in udf_map.items(): + df = df.withColumn(name, UserDefinedFunction(func, col_type)(name).alias(name)) + + df = spark.createDataframe + df.show(n=100) + +if __name__ == '__main__': + main() \ No newline at end of file From 4ed5a157a9b2492602c5e2925a8b3d30376215d8 Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Thu, 11 Jun 2020 11:49:49 -0400 Subject: [PATCH 10/59] Revert "broken clean.py" This reverts commit 580c8e1078e9480ef30d3083522fd2c467c4f1b1. --- data-science-onramp/data-processing/clean.py | 44 -------------------- 1 file changed, 44 deletions(-) delete mode 100644 data-science-onramp/data-processing/clean.py diff --git a/data-science-onramp/data-processing/clean.py b/data-science-onramp/data-processing/clean.py deleted file mode 100644 index 0bca32d3299..00000000000 --- a/data-science-onramp/data-processing/clean.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import sys - -from py4j.protocol import Py4JJavaError -from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction, lit -from pyspark.sql.types import IntegerType, StringType - - -PROJECT_ID = sys.argv[1] -BUCKET_NAME = sys.argv[2] -TABLE = f'{PROJECT_ID}.new_york_citibike_trips.RAW_DATA' - -def station_name(name): - if name: - return name.replace('/', '&') - else: - return '' - -def main(): - '''...''' - # Create a SparkSession under the name 'clean'. Viewable via the Spark UI - spark = SparkSession.builder.appName('clean').getOrCreate() - - # Check if table exists - - try: - df = spark.read.format('bigquery').option('table', TABLE).load() - except Py4JJavaError: - print(f"{TABLE} does not exist. 
") - return - - udf_map = { - 'start_station_name': (station_name, StringType()) - } - - for name, (func, col_type) in udf_map.items(): - df = df.withColumn(name, UserDefinedFunction(func, col_type)(name).alias(name)) - - df = spark.createDataframe - df.show(n=100) - -if __name__ == '__main__': - main() \ No newline at end of file From e6fe99dfbef9c2377335b52674ed41d41178696e Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 16 Jun 2020 11:29:46 -0400 Subject: [PATCH 11/59] optimize data ingestion --- data-science-onramp/data-ingestion/setup.py | 109 +++++++++--------- .../data-ingestion/setup_test.py | 14 +-- 2 files changed, 61 insertions(+), 62 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 7f5efa28e0a..33c7c728733 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,41 +1,43 @@ import random import sys -from time import time_ns from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction +from pyspark.sql.functions import UserDefinedFunction, when, expr from pyspark.sql.types import IntegerType, StringType BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" +RAW_DATASET_NAME = "new_york_citibike_trips5" +RAW_TABLE_NAME = "RAW_DATA" # START MAKING DATA DIRTY -def random_select(items, weights): - '''Picks an item according to the cumulative weights''' - return random.choices(items, weights=weights, k=1)[0] - - def trip_duration(duration): '''Converts trip duration to other units''' + if duration is None: + return None seconds = str(duration) + " s" minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" - return random_select([seconds, minutes, hours, + return random.choices([seconds, minutes, hours, str(random.randint(-1000, -1))], - [0.3, 0.3, 0.3, 0.1]) + weights=[0.3, 0.3, 0.3, 0.1])[0] def station_name(name): '''Replaces '&' with '/' with a 50% chance''' + if name is None: + return None return random.choice([name, name.replace("&", "/")]) def user_type(user): '''Manipulates the user type string''' + if user is None: + return None return random.choice([user, user.upper(), user.lower(), "sub" if user == "Subscriber" else user, "cust" if user == "Customer" else user]) @@ -43,6 +45,8 @@ def user_type(user): def gender(s): '''Manipulates the gender string''' + if s is None: + return None return random.choice([s.upper(), s.lower(), s[0].upper() if len(s) > 0 else "", s[0].lower() if len(s) > 0 else ""]) @@ -50,29 +54,15 @@ def gender(s): def convert_angle(angle): '''Converts long and lat to DMS notation''' + if angle is None: + return None degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) new_angle = str(degrees) + "\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' - return random_select([str(angle), new_angle], [0.55, 0.45]) - - -# This function is nested since a UserDefinedFunction is -# expected to take a single argument -def dirty_data(proc_func, allow_none): - '''Master function returns a user defined function - that transforms the column data''' - def udf(col_value): - random.seed(hash(col_value) + time_ns()) - if col_value is None: - return col_value - elif allow_none: - return random_select([None, proc_func(col_value)], - [0.05, 0.95]) - else: - return proc_func(col_value) - return udf + return 
random.choices([str(angle), new_angle], + weights=[0.55, 0.45])[0] def write_to_bigquery(df): @@ -80,17 +70,19 @@ def write_to_bigquery(df): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f'{client.project}.new_york_citibike_trips' + dataset_id = f'{client.project}.{RAW_DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', dataset_id + ".RAW_DATA") \ + .option('table', dataset_id + f".{RAW_TABLE_NAME}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() + print("Table successfully written to BigQuery") + def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI @@ -112,42 +104,49 @@ def main(): print(f"{TABLE} does not exist. ") return - # Declare data transformations for each column in dataframe - udfs = [ - (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(lambda x: x, True), StringType()), # starttime - (dirty_data(lambda x: x, True), StringType()), # stoptime - (lambda x: x, IntegerType()), # start_station_id - (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convert_angle, True), StringType()), # start_station_latitude - (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (lambda x: x, IntegerType()), # end_station_id - (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convert_angle, True), StringType()), # end_station_latitude - (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (lambda x: x, IntegerType()), # bikeid - (dirty_data(user_type, False), StringType()), # usertype - (lambda x: x, IntegerType()), # birth_year - (dirty_data(gender, False), StringType()), # gender - (lambda x: x, StringType()), # customer_plan + # Declare dictionary with keys column names and values user defined + # functions and return types + udf_map = { + 'tripduration': (trip_duration, StringType()), + 'start_station_name': (station_name, StringType()), + 'start_station_latitude': (convert_angle, StringType()), + 'start_station_longitude': (convert_angle, StringType()), + 'end_station_name': (station_name, StringType()), + 'end_station_latitude': (convert_angle, StringType()), + 'end_station_longitude': (convert_angle, StringType()), + 'usertype': (user_type, StringType()), + 'gender': (gender, StringType()), + } + + # Declare which columns to set some values to null randomly + null_columns = [ + 'tripduration', + 'starttime', + 'stoptime', + 'start_station_latitude', + 'start_station_longitude', + 'end_station_latitude', + 'end_station_longitude', ] - # Apply dirty transformations to df - names = df.schema.names - new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) - for udf, column, name in zip(udfs, df.columns, names)]) + # Dirty the columns + for name, udf in udf_map.items(): + df = df.withColumn(name, UserDefinedFunction(*udf)(name)) - new_df.sample(False, 0.0001).show(n=100) + # Randomly set about 5% of the values in some columns to null + for name in null_columns: + df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows - dup_df = new_df.sample(True, 0.0001) + dup_df = df.sample(True, 0.0001) # Create final dirty dataframe - df = new_df.union(dup_df) + df = df.union(dup_df) if upload: write_to_bigquery(df) - + else: + df.sample(True, 0.0001).show(n=500, truncate=False) if __name__ == 
'__main__': main() diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 2aa82535d79..8fb1938c843 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -106,7 +106,7 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\\| *{value}\\|", out) + return re.search(f"\\|{value} *\\|", out) def test_setup(): @@ -128,16 +128,16 @@ def test_setup(): out = blob.download_as_string().decode("utf-8") # tripDuration - assert re.search("[0-9] s", out) - assert re.search("[0-9] m", out) - assert re.search("[0-9] h", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) h", out) # station latitude & longitude - assert re.search(u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) # birth_year - assert re.search("19[0-9][0-9]\\|", out) - assert re.search("20[0-9][0-9]\\|", out) + assert is_in_table("19[0-9][0-9]", out) + assert is_in_table("20[0-9][0-9]", out) # gender assert is_in_table("M", out) From 540acaae2d61907de35fa31bce089c832f8603d4 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 16 Jun 2020 11:54:59 -0400 Subject: [PATCH 12/59] fix linting errors --- data-science-onramp/data-ingestion/setup.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 33c7c728733..7308d13a37e 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -4,8 +4,8 @@ from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction, when, expr -from pyspark.sql.types import IntegerType, StringType +from pyspark.sql.functions import expr, UserDefinedFunction, when +from pyspark.sql.types import StringType BUCKET_NAME = sys.argv[1] @@ -23,8 +23,8 @@ def trip_duration(duration): minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" return random.choices([seconds, minutes, hours, - str(random.randint(-1000, -1))], - weights=[0.3, 0.3, 0.3, 0.1])[0] + str(random.randint(-1000, -1))], + weights=[0.3, 0.3, 0.3, 0.1])[0] def station_name(name): @@ -117,7 +117,7 @@ def main(): 'usertype': (user_type, StringType()), 'gender': (gender, StringType()), } - + # Declare which columns to set some values to null randomly null_columns = [ 'tripduration', @@ -148,5 +148,6 @@ def main(): else: df.sample(True, 0.0001).show(n=500, truncate=False) + if __name__ == '__main__': main() From a7e29723f004b3cc3c4092b26eb90286b6ee7318 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 16 Jun 2020 18:21:20 -0400 Subject: [PATCH 13/59] fix minor style issues --- data-science-onramp/data-ingestion/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 7308d13a37e..b142aa8f37a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -10,7 +10,7 @@ BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -RAW_DATASET_NAME = "new_york_citibike_trips5" +RAW_DATASET_NAME = "new_york_citibike_trips" RAW_TABLE_NAME = "RAW_DATA" @@ -77,7 +77,7 @@ def 
write_to_bigquery(df): # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', dataset_id + f".{RAW_TABLE_NAME}") \ + .option('table', f"{dataset_id}.{RAW_TABLE_NAME}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() From 3e5ba3bb464a8aaa0bbd9d9edeb420261e01aebb Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 19 Jun 2020 17:47:23 -0400 Subject: [PATCH 14/59] remove pip from cluster config --- data-science-onramp/data-ingestion/setup_test.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 8fb1938c843..d8def350c8e 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -34,9 +34,6 @@ 'config': { 'gce_cluster_config': { 'zone_uri': '', - "metadata": { - "PIP_PACKAGES": "google-cloud-storage" - }, }, 'master_config': { 'num_instances': 1, @@ -46,12 +43,6 @@ 'num_instances': 6, 'machine_type_uri': 'n1-standard-8' }, - "initialization_actions": [ - { - "executable_file": ("gs://dataproc-initialization-actions/" - "python/pip-install.sh"), - } - ], "software_config": { "image_version": "1.5.4-debian10", "optional_components": [ From 21061531d2c710f05ebe9e17910724e18bfedbc9 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 26 Jun 2020 19:24:21 -0400 Subject: [PATCH 15/59] load external datasets from url --- data-science-onramp/data-ingestion/setup.py | 74 ++++++++++++++++--- .../data-ingestion/setup_test.py | 13 +++- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index b142aa8f37a..06b8ce00689 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,17 +1,37 @@ import random import sys +import pandas as pd from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession from pyspark.sql.functions import expr, UserDefinedFunction, when -from pyspark.sql.types import StringType +from pyspark.sql.types import FloatType, StringType, StructField, StructType BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -RAW_DATASET_NAME = "new_york_citibike_trips" -RAW_TABLE_NAME = "RAW_DATA" +DATASET_NAME = "data_science_onramp" +RAW_TABLE_NAME = "new_york_citibike_trips" +EXTERNAL_DATASETS = { + "gas_prices": { + "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", + "schema": StructType([ + StructField("Date", StringType(), True), + StructField("New_York_State_Average_USD_per_Gal", + FloatType(), True), + StructField("Albany_Average_USD_per_Gal", FloatType(), True), + StructField("Blinghamton_Average_USD_per_Gal", FloatType(), True), + StructField("Buffalo_Average_USD_per_Gal", FloatType(), True), + StructField("Nassau_Average_USD_per_Gal", FloatType(), True), + StructField("New_York_City_Average_USD_per_Gal", + FloatType(), True), + StructField("Rochester_Average_USD_per_Gal", FloatType(), True), + StructField("Syracuse_Average_USD_per_Gal", FloatType(), True), + StructField("Utica_Average_USD_per_Gal", FloatType(), True), + ]), + }, +} # START MAKING DATA DIRTY @@ -65,23 +85,39 @@ def convert_angle(angle): weights=[0.55, 0.45])[0] -def write_to_bigquery(df): - '''Write a dataframe to BigQuery''' - +def create_bigquery_dataset(): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f'{client.project}.{RAW_DATASET_NAME}' + 
dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) + +def write_to_bigquery(df, table_name): + '''Write a dataframe to BigQuery''' + client = bigquery.Client() + dataset_id = f'{client.project}.{DATASET_NAME}' + # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', f"{dataset_id}.{RAW_TABLE_NAME}") \ + .option('table', f"{dataset_id}.{table_name}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() - print("Table successfully written to BigQuery") + print(f"Table {table_name} successfully written to BigQuery") + + +def print_df(df, table_name): + '''Print 20 rows from dataframe and a random sample''' + # first 100 rows for smaller tables + df.show() + + # random sample for larger tables + # for small tables this will be empty + df.sample(True, 0.0001).show(n=500, truncate=False) + + print(f"Table {table_name} printed") def main(): @@ -91,12 +127,25 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if len(sys.argv) > 2: + if '--test' in sys.argv: upload = False print("Not uploading results to BigQuery") else: + create_bigquery_dataset() print("Results will be uploaded to BigQuery") + # Ingest External Datasets + + for table_name, data in EXTERNAL_DATASETS.items(): + print(f'Creating dataframe for {table_name}') + df = spark.createDataFrame(pd.read_csv(data["url"]), + schema=data["schema"]) + + if upload: + write_to_bigquery(df, table_name) + else: + print_df(df, table_name) + # Check if table exists try: df = spark.read.format('bigquery').option('table', TABLE).load() @@ -135,6 +184,7 @@ def main(): # Randomly set about 5% of the values in some columns to null for name in null_columns: + df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows @@ -144,9 +194,9 @@ def main(): df = df.union(dup_df) if upload: - write_to_bigquery(df) + write_to_bigquery(df, RAW_TABLE_NAME) else: - df.sample(True, 0.0001).show(n=500, truncate=False) + print_df(df, RAW_TABLE_NAME) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index d8def350c8e..baec10a79a5 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -13,6 +13,10 @@ BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' +TABLE_NAMES = [ + "new_york_citibike_trips", + "gas_prices", +] JOB_DETAILS = { # Job configuration 'placement': { 'cluster_name': CLUSTER_NAME @@ -97,9 +101,12 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\\|{value} *\\|", out) + return re.search(f"\\| *{value} *\\|", out) +def table_printed(table_name, out): + return re.search(f"Table {table_name} printed", out) + def test_setup(): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -118,6 +125,10 @@ def test_setup(): blob = get_blob_from_path(output_location) out = blob.download_as_string().decode("utf-8") + # check that tables were printed + for table_name in TABLE_NAMES: + assert table_printed(table_name, out) + # tripDuration assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) assert is_in_table("(\\d+(?:\\.\\d+)?) 
min", out) From 9febbad28873cfd3010b9a28dd037a71e4a36c4d Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Tue, 7 Jul 2020 12:54:45 -0400 Subject: [PATCH 16/59] added dry-run flag --- data-science-onramp/data-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 06b8ce00689..bfa22087c39 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -127,7 +127,7 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if '--test' in sys.argv: + if '--dry-run' in sys.argv: upload = False print("Not uploading results to BigQuery") else: From 5d56b9777771406bf73f7bcdf1579a4df732e9c5 Mon Sep 17 00:00:00 2001 From: Symmetries Date: Wed, 8 Jul 2020 12:29:46 -0400 Subject: [PATCH 17/59] dry-run flag --- data-science-onramp/data-ingestion/setup_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index baec10a79a5..a8fbe0d014d 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -25,7 +25,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, - "--test", + "--dry-run", ], "jar_file_uris": [ "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" From 22be5d3f8496da0f61f89107faa03e9536923e2b Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 9 Jul 2020 19:00:28 -0400 Subject: [PATCH 18/59] address some review comments --- data-science-onramp/data-ingestion/setup.py | 57 +++++++++---------- .../data-ingestion/setup_test.py | 17 ++++-- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index bfa22087c39..ecffd628b50 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,3 +1,12 @@ +"""Setup Dataproc job for Data Science Onramp Sample Application +This job ingests an external gas prices in NY dataset as well as +takes a New York Citibike dataset available on BigQuery and +"dirties" the dataset before uploading it back to BigQuery +It needs the following arguments +* the name of the Google Cloud Storage bucket to be used +* an optional --test flag to upload a subset of the dataset for testing +""" + import random import sys import pandas as pd @@ -37,11 +46,11 @@ # START MAKING DATA DIRTY def trip_duration(duration): '''Converts trip duration to other units''' - if duration is None: + if not duration: return None - seconds = str(duration) + " s" - minutes = str(float(duration) / 60) + " min" - hours = str(float(duration) / 3600) + " h" + seconds = f"{str(duration)} s" + minutes = f"{str(float(duration) / 60)} min" + hours = f"{str(float(duration) / 3600)} h" return random.choices([seconds, minutes, hours, str(random.randint(-1000, -1))], weights=[0.3, 0.3, 0.3, 0.1])[0] @@ -49,14 +58,14 @@ def trip_duration(duration): def station_name(name): '''Replaces '&' with '/' with a 50% chance''' - if name is None: + if not name: return None return random.choice([name, name.replace("&", "/")]) def user_type(user): '''Manipulates the user type string''' - if user is None: + if not user: return None return random.choice([user, user.upper(), user.lower(), "sub" if user == "Subscriber" else user, @@ -65,7 +74,7 @@ def 
user_type(user): def gender(s): '''Manipulates the gender string''' - if s is None: + if not s: return None return random.choice([s.upper(), s.lower(), s[0].upper() if len(s) > 0 else "", @@ -108,28 +117,16 @@ def write_to_bigquery(df, table_name): print(f"Table {table_name} successfully written to BigQuery") -def print_df(df, table_name): - '''Print 20 rows from dataframe and a random sample''' - # first 100 rows for smaller tables - df.show() - - # random sample for larger tables - # for small tables this will be empty - df.sample(True, 0.0001).show(n=500, truncate=False) - - print(f"Table {table_name} printed") - - def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() - upload = True # Whether to upload data to BigQuery + test = False # Whether we are running the job as a test - # Check whether or not results should be uploaded - if '--dry-run' in sys.argv: - upload = False - print("Not uploading results to BigQuery") + # Check whether or not the job is running as a test + if '--test' in sys.argv: + test = True + print("Subset of whole dataset will be uploaded to BigQuery") else: create_bigquery_dataset() print("Results will be uploaded to BigQuery") @@ -141,10 +138,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - if upload: - write_to_bigquery(df, table_name) - else: - print_df(df, table_name) + write_to_bigquery(df, table_name) # Check if table exists try: @@ -184,7 +178,6 @@ def main(): # Randomly set about 5% of the values in some columns to null for name in null_columns: - df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows @@ -193,10 +186,12 @@ def main(): # Create final dirty dataframe df = df.union(dup_df) - if upload: + if not test: write_to_bigquery(df, RAW_TABLE_NAME) else: - print_df(df, RAW_TABLE_NAME) + # df.sample(True, 0.0001).show(n=500, truncate=False) + # Upload 0.001% of the table (about 600 rows) + write_to_bigquery(df.sample(False, 0.00001)) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index a8fbe0d014d..a0ae6fb2814 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -1,3 +1,9 @@ +"""Test file for the setup job in the Data Science Onramp sample application +Creates a test Dataproc cluster and runs the job with a --test flag. +The job uploads a subset of the data to BigQuery. +Then, data is pulled from BigQuery and checks are made to see if the data is dirty. 
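For a rough sense of what "dirty" means here, an illustrative sketch only (the table reference and the pattern below are placeholders, not taken from this patch):

    import re
    from google.cloud import bigquery

    rows = bigquery.Client().query(
        "SELECT tripduration FROM `my-project.my_dataset.RAW_DATA`").result()
    # setup.py rewrites some durations into strings such as "42.5 min",
    # so at least one row is expected to match a pattern of this shape.
    assert any(row["tripduration"] and re.match(r"\A\d+(\.\d+)? min\Z", row["tripduration"])
               for row in rows)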
+""" + import os import re import uuid @@ -25,7 +31,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, - "--dry-run", + "--test", ], "jar_file_uris": [ "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" @@ -104,8 +110,9 @@ def is_in_table(value, out): return re.search(f"\\| *{value} *\\|", out) -def table_printed(table_name, out): - return re.search(f"Table {table_name} printed", out) +def table_uploaded(table_name, out): + return re.search(f"Table {table_name} successfully written to BigQuery", out) + def test_setup(): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -125,9 +132,9 @@ def test_setup(): blob = get_blob_from_path(output_location) out = blob.download_as_string().decode("utf-8") - # check that tables were printed + # Check if table upload success message was printed for table_name in TABLE_NAMES: - assert table_printed(table_name, out) + assert table_uploaded(table_name, out) # tripDuration assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) From f040542e5274af839924520ce65a01ec8443eab3 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 14 Jul 2020 17:22:26 -0400 Subject: [PATCH 19/59] optimize setup test --- .../data-ingestion/requirements.txt | 3 +- data-science-onramp/data-ingestion/setup.py | 34 ++--- data-science-onramp/data-ingestion/setup.sh | 2 +- .../data-ingestion/setup_test.py | 116 ++++++++++-------- 4 files changed, 86 insertions(+), 69 deletions(-) mode change 100644 => 100755 data-science-onramp/data-ingestion/setup.sh diff --git a/data-science-onramp/data-ingestion/requirements.txt b/data-science-onramp/data-ingestion/requirements.txt index f435423c623..e0328e4aec9 100644 --- a/data-science-onramp/data-ingestion/requirements.txt +++ b/data-science-onramp/data-ingestion/requirements.txt @@ -3,4 +3,5 @@ google-auth==1.16.0 google-auth-httplib2==0.0.3 google-cloud==0.34.0 google-cloud-storage==1.28.1 -google-cloud-dataproc==0.8.0 \ No newline at end of file +google-cloud-dataproc==0.8.0 +google-cloud-bigquery==1.25.0 diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index ecffd628b50..bdad93720d2 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -4,14 +4,15 @@ "dirties" the dataset before uploading it back to BigQuery It needs the following arguments * the name of the Google Cloud Storage bucket to be used +* the name of the BigQuery dataset to be created * an optional --test flag to upload a subset of the dataset for testing """ import random import sys -import pandas as pd from google.cloud import bigquery +import pandas as pd from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession from pyspark.sql.functions import expr, UserDefinedFunction, when @@ -19,10 +20,10 @@ BUCKET_NAME = sys.argv[1] +DATASET_NAME = sys.argv[2] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -DATASET_NAME = "data_science_onramp" -RAW_TABLE_NAME = "new_york_citibike_trips" -EXTERNAL_DATASETS = { +CITIBIKE_TABLE_NAME = "new_york_citibike_trips" +EXTERNAL_TABLES = { "gas_prices": { "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", "schema": StructType([ @@ -111,7 +112,6 @@ def write_to_bigquery(df, table_name): # Saving the data to BigQuery df.write.format('bigquery') \ .option('table', f"{dataset_id}.{table_name}") \ - .option("temporaryGcsBucket", BUCKET_NAME) \ .save() print(f"Table {table_name} successfully written to BigQuery") @@ -121,20 +121,22 @@ def main(): # Create a SparkSession under the 
name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() - test = False # Whether we are running the job as a test + spark.conf.set('temporaryGcsBucket', BUCKET_NAME) + + create_bigquery_dataset() + + # Whether we are running the job as a test + test = False # Check whether or not the job is running as a test if '--test' in sys.argv: test = True - print("Subset of whole dataset will be uploaded to BigQuery") + print("A subset of the whole dataset will be uploaded to BigQuery") else: - create_bigquery_dataset() print("Results will be uploaded to BigQuery") # Ingest External Datasets - - for table_name, data in EXTERNAL_DATASETS.items(): - print(f'Creating dataframe for {table_name}') + for table_name, data in EXTERNAL_TABLES.items(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) @@ -143,6 +145,8 @@ def main(): # Check if table exists try: df = spark.read.format('bigquery').option('table', TABLE).load() + if test: + df = df.sample(False, 0.00001) except Py4JJavaError: print(f"{TABLE} does not exist. ") return @@ -186,12 +190,8 @@ def main(): # Create final dirty dataframe df = df.union(dup_df) - if not test: - write_to_bigquery(df, RAW_TABLE_NAME) - else: - # df.sample(True, 0.0001).show(n=500, truncate=False) - # Upload 0.001% of the table (about 600 rows) - write_to_bigquery(df.sample(False, 0.00001)) + print('Uploading citibike dataset...') + write_to_bigquery(df, CITIBIKE_TABLE_NAME) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh old mode 100644 new mode 100755 index f78c8cd120b..336f3da729d --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} + setup.py -- ${BUCKET_NAME} data_science_onramp_test_six --test diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index a0ae6fb2814..7b0f0bc6be5 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -10,13 +10,17 @@ from google.cloud import dataproc_v1 as dataproc from google.cloud import storage +from google.cloud import bigquery import pytest # Set global variables +ID = uuid.uuid4() + PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" -CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' -BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' +CLUSTER_NAME = f'setup-test-{ID}' +BUCKET_NAME = f'setup-test-{ID}' +DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ @@ -31,6 +35,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, + DATASET_NAME, "--test", ], "jar_file_uris": [ @@ -99,6 +104,17 @@ def setup_and_teardown_bucket(): bucket.delete(force=True) +@pytest.fixture(autouse=True) +def setup_and_teardown_bq_dataset(): + # Dataset is created by the client + bq_client = bigquery.Client(project=PROJECT) + + yield + + # Delete Dataset + bq_client.delete_dataset(DATASET_NAME, delete_contents=True) + + def get_blob_from_path(path): bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] bucket = storage.Client().get_bucket(bucket_name) @@ -106,8 +122,14 @@ def get_blob_from_path(path): return 
bucket.blob(output_location) -def is_in_table(value, out): - return re.search(f"\\| *{value} *\\|", out) +def get_dataproc_job_output(result): + output_location = result.driver_output_resource_uri + ".000000000" + blob = get_blob_from_path(output_location) + return blob.download_as_string().decode("utf-8") + + +# def is_in_table(value, out): +# return re.search(f"\\| *{value} *\\|", out) def table_uploaded(table_name, out): @@ -128,49 +150,43 @@ def test_setup(): result = response.result() # Get job output - output_location = result.driver_output_resource_uri + ".000000000" - blob = get_blob_from_path(output_location) - out = blob.download_as_string().decode("utf-8") - - # Check if table upload success message was printed - for table_name in TABLE_NAMES: - assert table_uploaded(table_name, out) - - # tripDuration - assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) - assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) - assert is_in_table("(\\d+(?:\\.\\d+)?) h", out) - - # station latitude & longitude - assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) - - # birth_year - assert is_in_table("19[0-9][0-9]", out) - assert is_in_table("20[0-9][0-9]", out) - - # gender - assert is_in_table("M", out) - assert is_in_table("m", out) - assert is_in_table("male", out) - assert is_in_table("MALE", out) - assert is_in_table("F", out) - assert is_in_table("f", out) - assert is_in_table("female", out) - assert is_in_table("FEMALE", out) - assert is_in_table("U", out) - assert is_in_table("u", out) - assert is_in_table("unknown", out) - assert is_in_table("UNKNOWN", out) - - # customer_plan - assert is_in_table("Subscriber", out) - assert is_in_table("subscriber", out) - assert is_in_table("SUBSCRIBER", out) - assert is_in_table("sub", out) - assert is_in_table("Customer", out) - assert is_in_table("customer", out) - assert is_in_table("CUSTOMER", out) - assert is_in_table("cust", out) - - # Missing data - assert is_in_table("null", out) + out = get_dataproc_job_output(result) + + # # tripDuration + # assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) + # assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) + # assert is_in_table("(\\d+(?:\\.\\d+)?) 
h", out) + + # # station latitude & longitude + # assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + + # # birth_year + # assert is_in_table("19[0-9][0-9]", out) + # assert is_in_table("20[0-9][0-9]", out) + + # # gender + # assert is_in_table("M", out) + # assert is_in_table("m", out) + # assert is_in_table("male", out) + # assert is_in_table("MALE", out) + # assert is_in_table("F", out) + # assert is_in_table("f", out) + # assert is_in_table("female", out) + # assert is_in_table("FEMALE", out) + # assert is_in_table("U", out) + # assert is_in_table("u", out) + # assert is_in_table("unknown", out) + # assert is_in_table("UNKNOWN", out) + + # # customer_plan + # assert is_in_table("Subscriber", out) + # assert is_in_table("subscriber", out) + # assert is_in_table("SUBSCRIBER", out) + # assert is_in_table("sub", out) + # assert is_in_table("Customer", out) + # assert is_in_table("customer", out) + # assert is_in_table("CUSTOMER", out) + # assert is_in_table("cust", out) + + # # Missing data + # assert is_in_table("null", out) From 55354df6dd010e17198c9fe144b2c212f3b6c2e7 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 15 Jul 2020 18:40:18 -0400 Subject: [PATCH 20/59] query data in test --- data-science-onramp/data-ingestion/setup.sh | 2 +- .../data-ingestion/setup_test.py | 107 +++++++++++------- 2 files changed, 67 insertions(+), 42 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index 336f3da729d..a69cda6a134 100755 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} data_science_onramp_test_six --test + setup.py -- ${BUCKET_NAME} data_science_onramp diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 7b0f0bc6be5..ad9e756f8d1 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -21,6 +21,7 @@ CLUSTER_NAME = f'setup-test-{ID}' BUCKET_NAME = f'setup-test-{ID}' DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") +CITIBIKE_TABLE = "new_york_citibike_trips" DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ @@ -123,6 +124,7 @@ def get_blob_from_path(path): def get_dataproc_job_output(result): + """Get the dataproc job logs in plain text""" output_location = result.driver_output_resource_uri + ".000000000" blob = get_blob_from_path(output_location) return blob.download_as_string().decode("utf-8") @@ -132,12 +134,50 @@ def get_dataproc_job_output(result): # return re.search(f"\\| *{value} *\\|", out) -def table_uploaded(table_name, out): - return re.search(f"Table {table_name} successfully written to BigQuery", out) +def assert_table_success_message(table_name, out): + """Check table upload success message was printed in job logs.""" + assert re.search(f"Table {table_name} successfully written to BigQuery", out), \ + f"Table {table_name} sucess message not printed in job logs" + + + +def assert_regexes_in_table(regex_dict, query_result): + """Assert that at least one row satisfies each regex. + The arguments are + - regex_dict: a dictionary where the keys are column + names and values are lists of regexes; + - query_result: the bigquery query result of the whole table. 
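    A usage illustration (the column and patterns here are simplified stand-ins;
    test_setup below builds the real regex_dict and query from this file's constants):

        regex_dict = {"tripduration": [r"(\d+(?:\.\d+)?) s", r"(\d+(?:\.\d+)?) min"]}
        query_result = bigquery.Client().query(
            f"SELECT * FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`").result()
        assert_regexes_in_table(regex_dict, query_result)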
+ """ + + # Create dictionary with keys column names and values dictionaries + # The dictionaries stored have keys regexes and values booleans + # `regex_found_dict[column][regex]` hold the truth value of + # whether the there is at least one row of column with name `column` + # which satisfies the regular expression `regex`. + regex_found_dict = {} + for column, regexes in regex_dict.items(): + regex_found_dict[column] = {} + for regex in regexes: + regex_found_dict[column][regex] = False + + # Outer loop is over `query_result` since this is + # an iterator which can only iterate once + for row in query_result: + for column_name, regexes in regex_dict.items(): + for regex in regexes: + if row[column_name] and re.match(f"\\A{regex}\\Z", row[column_name]): + regex_found_dict[column_name][regex] = True + + # Assert that all entries in regex_found_dict are true + for column_name in regex_found_dict: + for regex, found in regex_found_dict[column_name].items(): + assert found, \ + f"No matches to regular expression \"{regex}\" found in column {column_name}" def test_setup(): - '''Tests setup.py by submitting it to a dataproc cluster''' + """Test setup.py by submitting it to a dataproc cluster + Check table upload success message as well as data in the table itself""" # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ @@ -151,42 +191,27 @@ def test_setup(): # Get job output out = get_dataproc_job_output(result) + + # Check logs to see if tables were uploaded + for table_name in TABLE_NAMES: + assert_table_success_message(table_name, out) + + # Query BigQuery Table + client = bigquery.Client() + query = f"SELECT * FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" + query_job = client.query(query) + + result = query_job.result() + + regex_dict = { + "tripduration": ["(\\d+(?:\\.\\d+)?) s", "(\\d+(?:\\.\\d+)?) min", "(\\d+(?:\\.\\d+)?) h"], + "gender": ['f', 'F', 'm', 'M', 'u', 'U', 'male', 'MALE', 'female', 'FEMALE', 'unknown', 'UNKNOWN'], + "start_station_latitude": ["[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\""], + "start_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "end_station_latitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "end_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "usertype": ["Subscriber", "subscriber", "SUBSCRIBER", "sub", "Customer", "customer", "CUSTOMER", "cust"], + } + + assert_regexes_in_table(regex_dict, result) - # # tripDuration - # assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) - # assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) - # assert is_in_table("(\\d+(?:\\.\\d+)?) 
h", out) - - # # station latitude & longitude - # assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) - - # # birth_year - # assert is_in_table("19[0-9][0-9]", out) - # assert is_in_table("20[0-9][0-9]", out) - - # # gender - # assert is_in_table("M", out) - # assert is_in_table("m", out) - # assert is_in_table("male", out) - # assert is_in_table("MALE", out) - # assert is_in_table("F", out) - # assert is_in_table("f", out) - # assert is_in_table("female", out) - # assert is_in_table("FEMALE", out) - # assert is_in_table("U", out) - # assert is_in_table("u", out) - # assert is_in_table("unknown", out) - # assert is_in_table("UNKNOWN", out) - - # # customer_plan - # assert is_in_table("Subscriber", out) - # assert is_in_table("subscriber", out) - # assert is_in_table("SUBSCRIBER", out) - # assert is_in_table("sub", out) - # assert is_in_table("Customer", out) - # assert is_in_table("customer", out) - # assert is_in_table("CUSTOMER", out) - # assert is_in_table("cust", out) - - # # Missing data - # assert is_in_table("null", out) From 5f80974d5407cf64021f1bae73b0b65c904566ea Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 17 Jul 2020 14:03:26 -0400 Subject: [PATCH 21/59] address live session comments --- .../data-ingestion/setup_test.py | 64 ++++++------------- 1 file changed, 18 insertions(+), 46 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index ad9e756f8d1..5ee77d5e1a3 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -8,9 +8,9 @@ import re import uuid +from google.cloud import bigquery from google.cloud import dataproc_v1 as dataproc from google.cloud import storage -from google.cloud import bigquery import pytest # Set global variables @@ -130,51 +130,12 @@ def get_dataproc_job_output(result): return blob.download_as_string().decode("utf-8") -# def is_in_table(value, out): -# return re.search(f"\\| *{value} *\\|", out) - - def assert_table_success_message(table_name, out): """Check table upload success message was printed in job logs.""" assert re.search(f"Table {table_name} successfully written to BigQuery", out), \ f"Table {table_name} sucess message not printed in job logs" - -def assert_regexes_in_table(regex_dict, query_result): - """Assert that at least one row satisfies each regex. - The arguments are - - regex_dict: a dictionary where the keys are column - names and values are lists of regexes; - - query_result: the bigquery query result of the whole table. - """ - - # Create dictionary with keys column names and values dictionaries - # The dictionaries stored have keys regexes and values booleans - # `regex_found_dict[column][regex]` hold the truth value of - # whether the there is at least one row of column with name `column` - # which satisfies the regular expression `regex`. 
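#   (Illustrative aside, not part of this diff: the \A...\Z wrapping used by this
#   helper, and by the per-column check that replaces it later in this patch, makes
#   re.match behave as a full-string match.)
#       import re
#       assert re.match(r"\A19[0-9][0-9]\Z", "1987")          # whole value matches
#       assert not re.match(r"\A19[0-9][0-9]\Z", "1987-05")   # trailing text rejected
#       # equivalent to re.fullmatch(r"19[0-9][0-9]", "1987")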
- regex_found_dict = {} - for column, regexes in regex_dict.items(): - regex_found_dict[column] = {} - for regex in regexes: - regex_found_dict[column][regex] = False - - # Outer loop is over `query_result` since this is - # an iterator which can only iterate once - for row in query_result: - for column_name, regexes in regex_dict.items(): - for regex in regexes: - if row[column_name] and re.match(f"\\A{regex}\\Z", row[column_name]): - regex_found_dict[column_name][regex] = True - - # Assert that all entries in regex_found_dict are true - for column_name in regex_found_dict: - for regex, found in regex_found_dict[column_name].items(): - assert found, \ - f"No matches to regular expression \"{regex}\" found in column {column_name}" - - def test_setup(): """Test setup.py by submitting it to a dataproc cluster Check table upload success message as well as data in the table itself""" @@ -191,17 +152,13 @@ def test_setup(): # Get job output out = get_dataproc_job_output(result) - + # Check logs to see if tables were uploaded for table_name in TABLE_NAMES: assert_table_success_message(table_name, out) # Query BigQuery Table client = bigquery.Client() - query = f"SELECT * FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" - query_job = client.query(query) - - result = query_job.result() regex_dict = { "tripduration": ["(\\d+(?:\\.\\d+)?) s", "(\\d+(?:\\.\\d+)?) min", "(\\d+(?:\\.\\d+)?) h"], @@ -213,5 +170,20 @@ def test_setup(): "usertype": ["Subscriber", "subscriber", "SUBSCRIBER", "sub", "Customer", "customer", "CUSTOMER", "cust"], } - assert_regexes_in_table(regex_dict, result) + for column_name, regexes in regex_dict.items(): + query = f"SELECT {column_name} FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" + query_job = client.query(query) + result = query_job.result() + + rows = [] + for row in result: + rows.append(row[column_name]) + + for regex in regexes: + found = False + for row in rows: + if row and re.match(f"\\A{regex}\\Z", row): + found = True + assert found, \ + f"No matches to regular expression \"{regex}\" found in column {column_name}" From e8837654c2e0bb29785c9aa6aaf03c23e8cc0745 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 20 Jul 2020 11:36:25 -0400 Subject: [PATCH 22/59] add break statement --- data-science-onramp/data-ingestion/setup_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 5ee77d5e1a3..978a5376480 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -185,5 +185,6 @@ def test_setup(): for row in rows: if row and re.match(f"\\A{regex}\\Z", row): found = True + break assert found, \ f"No matches to regular expression \"{regex}\" found in column {column_name}" From 2ec8b30d60250b7eab737e0a6087ff734c8a50fd Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 23 Jul 2020 16:48:55 -0400 Subject: [PATCH 23/59] revert breaking table and dataset name change --- data-science-onramp/data-ingestion/setup.py | 2 +- data-science-onramp/data-ingestion/setup.sh | 2 +- data-science-onramp/data-ingestion/setup_test.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index bdad93720d2..a1f13dfa5ef 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -22,7 +22,7 @@ BUCKET_NAME = sys.argv[1] DATASET_NAME = sys.argv[2] TABLE = 
"bigquery-public-data.new_york_citibike.citibike_trips" -CITIBIKE_TABLE_NAME = "new_york_citibike_trips" +CITIBIKE_TABLE_NAME = "RAW_DATA" EXTERNAL_TABLES = { "gas_prices": { "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index a69cda6a134..2c4773f7272 100755 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} data_science_onramp + setup.py -- ${BUCKET_NAME} new_york_citibike_trips diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 978a5376480..b1395af9793 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -21,11 +21,11 @@ CLUSTER_NAME = f'setup-test-{ID}' BUCKET_NAME = f'setup-test-{ID}' DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") -CITIBIKE_TABLE = "new_york_citibike_trips" +CITIBIKE_TABLE = "RAW_DATA" DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ - "new_york_citibike_trips", + CITIBIKE_TABLE, "gas_prices", ] JOB_DETAILS = { # Job configuration From b5ea09e00ca6cd6a9ef5150f4eea36285c8099be Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 4 Aug 2020 19:57:05 -0400 Subject: [PATCH 24/59] fix datetime formatting in setup job --- data-science-onramp/data-ingestion/setup.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index a1f13dfa5ef..8205f551c51 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -15,7 +15,7 @@ import pandas as pd from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import expr, UserDefinedFunction, when +from pyspark.sql.functions import expr, UserDefinedFunction, when, date_format from pyspark.sql.types import FloatType, StringType, StructField, StructType @@ -101,7 +101,7 @@ def create_bigquery_dataset(): dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" - dataset = client.create_dataset(dataset) + #dataset = client.create_dataset(dataset) def write_to_bigquery(df, table_name): @@ -140,7 +140,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - write_to_bigquery(df, table_name) + #write_to_bigquery(df, table_name) # Check if table exists try: @@ -180,6 +180,10 @@ def main(): for name, udf in udf_map.items(): df = df.withColumn(name, UserDefinedFunction(*udf)(name)) + # Format the datetimes correctly + for name in ['starttime', 'stoptime']: + df = df.withColumn(name, date_format(name, "yyyy-MM-dd'T'HH:mm:ss")) + # Randomly set about 5% of the values in some columns to null for name in null_columns: df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) From 213dfcac25a8f4bba6dcee24a00a73b1c9a4f853 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 6 Aug 2020 14:03:13 -0400 Subject: [PATCH 25/59] uncomment commented dataset creation and writing --- data-science-onramp/data-ingestion/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 8205f551c51..352d8b029b4 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -101,7 +101,7 @@ def create_bigquery_dataset(): dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" - #dataset = client.create_dataset(dataset) + dataset = client.create_dataset(dataset) def write_to_bigquery(df, table_name): @@ -140,7 +140,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - #write_to_bigquery(df, table_name) + write_to_bigquery(df, table_name) # Check if table exists try: From 589568a55f1cb8df1b5ad20fb26202c7399ca8c8 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Fri, 5 Jun 2020 15:35:13 -0400 Subject: [PATCH 26/59] add data ingestion code --- .../data-ingestion/requirements-test.txt | 1 + .../data-ingestion/requirements.txt | 6 + .../data-ingestion/setup-test.py | 210 ++++++++++++++++++ data-science-onramp/data-ingestion/setup.py | 149 +++++++++++++ data-science-onramp/data-ingestion/setup.sh | 6 + 5 files changed, 372 insertions(+) create mode 100644 data-science-onramp/data-ingestion/requirements-test.txt create mode 100644 data-science-onramp/data-ingestion/requirements.txt create mode 100644 data-science-onramp/data-ingestion/setup-test.py create mode 100644 data-science-onramp/data-ingestion/setup.py create mode 100644 data-science-onramp/data-ingestion/setup.sh diff --git a/data-science-onramp/data-ingestion/requirements-test.txt b/data-science-onramp/data-ingestion/requirements-test.txt new file mode 100644 index 00000000000..781d4326c94 --- /dev/null +++ b/data-science-onramp/data-ingestion/requirements-test.txt @@ -0,0 +1 @@ +pytest==5.3.2 diff --git a/data-science-onramp/data-ingestion/requirements.txt b/data-science-onramp/data-ingestion/requirements.txt new file mode 100644 index 00000000000..f435423c623 --- /dev/null +++ b/data-science-onramp/data-ingestion/requirements.txt @@ -0,0 +1,6 @@ +grpcio==1.29.0 +google-auth==1.16.0 +google-auth-httplib2==0.0.3 +google-cloud==0.34.0 +google-cloud-storage==1.28.1 +google-cloud-dataproc==0.8.0 \ No newline at end of file diff --git a/data-science-onramp/data-ingestion/setup-test.py b/data-science-onramp/data-ingestion/setup-test.py new file mode 100644 index 00000000000..d827c805818 --- /dev/null +++ b/data-science-onramp/data-ingestion/setup-test.py @@ -0,0 +1,210 @@ +import os +import re + +import uuid + +from google.api_core.exceptions import GoogleAPICallError + +from google.cloud import dataproc_v1 as dataproc +from google.cloud import storage +from google.cloud.exceptions import NotFound + +import pytest + +waiting_cluster_callback = False + +# Set global variables +project = os.environ['GCLOUD_PROJECT'] +region = "us-central1" +zone = "us-central1-a" +cluster_name = 'setup-test-{}'.format(str(uuid.uuid4())) +bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4())) + + +@pytest.fixture(autouse=True) +def teardown(): + yield + + # Delete cluster + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': f'{region}-dataproc.googleapis.com:443' + }) + + try: + operation = cluster_client.delete_cluster(project, region, + cluster_name) + operation.result() + except GoogleAPICallError: + pass + + # Delete GCS bucket + storage_client = storage.Client() + try: + bucket = storage_client.get_bucket(bucket_name) + bucket.delete(force=True) + except 
NotFound: + pass + + +def test_setup(capsys): + '''Tests setup.py by submitting it to a dataproc cluster''' + + # Create GCS Bucket + storage_client = storage.Client() + bucket = storage_client.create_bucket(bucket_name) + + # Upload file + destination_blob_name = "setup.py" + blob = bucket.blob(destination_blob_name) + blob.upload_from_filename("setup.py") + + job_file_name = "gs://" + bucket_name + "/setup.py" + + # Create cluster configuration + zone_uri = \ + 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( + project, zone) + cluster_data = { + 'project_id': project, + 'cluster_name': cluster_name, + 'config': { + 'gce_cluster_config': { + 'zone_uri': zone_uri, + "metadata": { + "PIP_PACKAGES": "google-cloud-storage" + }, + }, + 'master_config': { + 'num_instances': 1, + 'machine_type_uri': 'n1-standard-8' + }, + 'worker_config': { + 'num_instances': 6, + 'machine_type_uri': 'n1-standard-8' + }, + "initialization_actions": [ + { + "executable_file": ("gs://dataproc-initialization-actions/" + "python/pip-install.sh"), + } + ], + "software_config": { + "image_version": "1.5.4-debian10", + "optional_components": [ + "ANACONDA" + ], + } + } + } + + # Create cluster using cluster client + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + }) + + cluster = cluster_client.create_cluster(project, region, cluster_data) + cluster.add_done_callback(callback) + + # Wait for cluster to provision + global waiting_cluster_callback + waiting_cluster_callback = True + + wait_for_cluster_creation() + + # Create job configuration + job_details = { + 'placement': { + 'cluster_name': cluster_name + }, + 'pyspark_job': { + 'main_python_file_uri': job_file_name, + 'args': [ + bucket_name, + "--test", + ], + "jar_file_uris": [ + "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" + ], + }, + } + + # Submit job to dataproc cluster + job_client = dataproc.JobControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + }) + + result = job_client.submit_job(project_id=project, region=region, + job=job_details) + + job_id = result.reference.job_id + print('Submitted job \"{}\".'.format(job_id)) + + # Wait for job to complete + wait_for_job(job_client, job_id) + + # Get job output + cluster_info = cluster_client.get_cluster(project, region, cluster_name) + bucket = storage_client.get_bucket(cluster_info.config.config_bucket) + output_blob = ( + 'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000' + .format(cluster_info.cluster_uuid, job_id)) + out = bucket.blob(output_blob).download_as_string().decode("utf-8") + + # tripDuration + assert re.search("[0-9] s", out) + assert re.search("[0-9] m", out) + assert re.search("[0-9] h", out) + + # station latitude & longitude + assert re.search(u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + + # birth_year + assert re.search("19[0-9][0-9]\\|", out) + assert re.search("20[0-9][0-9]\\|", out) + + # gender + assert "M" in out + assert "male" in out + assert "MALE" in out + assert "F" in out + assert "female" in out + assert "FEMALE" in out + assert "u" in out + assert "unknown" in out + assert "UNKNOWN" in out + + # customer_plan + assert "Subscriber" in out + assert "subscriber" in out + assert "SUBSCRIBER" in out + assert "sub" in out + assert "Customer" in out + assert "customer" in out + assert "CUSTOMER" in out + assert "cust" in out + + # Missing data + assert "null" in out + + +def callback(operation_future): + '''Sets 
a flag to stop waiting''' + global waiting_cluster_callback + waiting_cluster_callback = False + + +def wait_for_cluster_creation(): + '''Waits for cluster to create''' + while True: + if not waiting_cluster_callback: + break + + +def wait_for_job(job_client, job_id): + '''Waits for job to finish''' + while True: + job = job_client.get_job(project, region, job_id) + assert job.status.State.Name(job.status.state) != "ERROR" + + if job.status.State.Name(job.status.state) == "DONE": + return diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py new file mode 100644 index 00000000000..91a740b34d0 --- /dev/null +++ b/data-science-onramp/data-ingestion/setup.py @@ -0,0 +1,149 @@ +from random import choice, choices, randint, seed +import sys + +from time import time_ns + +from google.cloud import bigquery + +from py4j.protocol import Py4JJavaError +from pyspark.sql import SparkSession + +from pyspark.sql.functions import UserDefinedFunction +from pyspark.sql.types import IntegerType, StringType + + +# Create a SparkSession under the name "setup". Viewable via the Spark UI +spark = SparkSession.builder.appName("setup").getOrCreate() + +bucket_name = sys.argv[1] +upload = True # Whether to upload data to BigQuery + +# Check whether or not results should be uploaded +try: + sys.argv[2] + upload = False +except IndexError: + print("Results will be uploaded to BigQuery") + +table = "bigquery-public-data.new_york_citibike.citibike_trips" + +# Check if table exists +try: + df = spark.read.format('bigquery').option('table', table).load() +except Py4JJavaError: + print(f"{table} does not exist. ") + sys.exit(0) + +# START MAKING DATA DIRTY + + +def random_select(items, cum_weights): + '''Picks an item according to the cumulative weights''' + return choices(items, cum_weights=cum_weights, k=1)[0] + + +def tripduration(duration): + '''Converts trip duration to other units''' + seconds = str(duration) + " s" + minutes = str(float(duration) / 60) + " min" + hours = str(float(duration) / 3600) + " h" + return random_select([seconds, minutes, hours, str(randint(-1000, -1))], + [0.3, 0.6, 0.9, 1]) + + +def station_name(name): + '''Replaces '&' with '/' with a 50% chance''' + return choice([name, name.replace("&", "/")]) + + +def usertype(user): + '''Manipulates the user type string''' + return choice([user, user.upper(), user.lower(), + "sub" if user == "Subscriber" else user, + "cust" if user == "Customer" else user]) + + +def gender(s): + '''Manipulates the gender string''' + return choice([s, s.upper(), s.lower(), + s[0] if len(s) > 0 else "", + s[0].lower() if len(s) > 0 else ""]) + + +def convertAngle(angle): + '''Converts long and lat to DMS notation''' + degrees = int(angle) + minutes = int((angle - degrees) * 60) + seconds = int((angle - degrees - minutes/60) * 3600) + new_angle = str(degrees) + u"\u00B0" + \ + str(minutes) + "'" + str(seconds) + '"' + return random_select([str(angle), new_angle], cum_weights=[0.55, 1]) + + +def dirty_data(proc_func, allow_none): + '''Master function returns a user defined function + that transforms the column data''' + def udf(col_value): + seed(hash(col_value) + time_ns()) + if col_value is None: + return col_value + elif allow_none: + return random_select([None, proc_func(col_value)], + cum_weights=[0.05, 1]) + else: + return proc_func(col_value) + return udf + + +def id(x): + return x + + +# Declare data transformations for each column in dataframe +udfs = [ + (dirty_data(tripduration, True), StringType()), # 
tripduration + (dirty_data(id, True), StringType()), # starttime + (dirty_data(id, True), StringType()), # stoptime + (id, IntegerType()), # start_station_id + (dirty_data(station_name, False), StringType()), # start_station_name + (dirty_data(convertAngle, True), StringType()), # start_station_latitude + (dirty_data(convertAngle, True), StringType()), # start_station_longitude + (id, IntegerType()), # end_station_id + (dirty_data(station_name, False), StringType()), # end_station_name + (dirty_data(convertAngle, True), StringType()), # end_station_latitude + (dirty_data(convertAngle, True), StringType()), # end_station_longitude + (id, IntegerType()), # bikeid + (dirty_data(usertype, False), StringType()), # usertype + (id, IntegerType()), # birth_year + (dirty_data(gender, False), StringType()), # gender + (id, StringType()), # customer_plan +] + +# Apply dirty transformations to df +names = df.schema.names +new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) + for udf, column, name in zip(udfs, df.columns, names)]) + +# Duplicate about 0.01% of the rows +dup_df = new_df.sample(False, 0.0001, seed=42) + +# Create final dirty dataframe +df = new_df.union(dup_df) +df.sample(False, 0.0001, seed=50).show(n=200) +print("Dataframe sample printed") + +# Write to BigQuery +if upload: + # Create BigQuery Dataset + client = bigquery.Client() + dataset_id = '{}.new_york_citibike_trips'.format(client.project) + dataset = bigquery.Dataset(dataset_id) + dataset.location = "US" + dataset = client.create_dataset(dataset) + + # Saving the data to BigQuery + spark.conf.set('temporaryGcsBucket', bucket_name) + + df.write.format('bigquery') \ + .option('table', dataset_id + ".RAW_DATA") \ + .save() diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh new file mode 100644 index 00000000000..12730a3a6fe --- /dev/null +++ b/data-science-onramp/data-ingestion/setup.sh @@ -0,0 +1,6 @@ +# Submit a PySpark job via the Cloud Dataproc Jobs API +gcloud dataproc jobs submit pyspark \ + --cluster ${CLUSTER_NAME} \ + --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ + --driver-log-levels root=FATAL \ + setup.py -- ${BUCKET_NAME} From 9148f5b1c1ee987833a425b2dfb683ec9fe8e95f Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 8 Jun 2020 10:24:56 -0400 Subject: [PATCH 27/59] begin addressing comments --- data-science-onramp/data-ingestion/setup.py | 57 +++---- data-science-onramp/data-ingestion/setup.sh | 3 + .../{setup-test.py => setup_test.py} | 145 +++++++----------- 3 files changed, 90 insertions(+), 115 deletions(-) rename data-science-onramp/data-ingestion/{setup-test.py => setup_test.py} (59%) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 91a740b34d0..dc869903c84 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,4 +1,4 @@ -from random import choice, choices, randint, seed +import random import sys from time import time_ns @@ -19,10 +19,10 @@ upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded -try: - sys.argv[2] +if len(sys.arv) > 1: upload = False -except IndexError: + print("Not uploading results to BigQuery") +else: print("Results will be uploaded to BigQuery") table = "bigquery-public-data.new_york_citibike.citibike_trips" @@ -37,59 +37,60 @@ # START MAKING DATA DIRTY -def random_select(items, cum_weights): +def random_select(items, weights): '''Picks an item 
according to the cumulative weights''' - return choices(items, cum_weights=cum_weights, k=1)[0] + return random.choices(items, weights=weights, k=1)[0] -def tripduration(duration): +def trip_duration(duration): '''Converts trip duration to other units''' seconds = str(duration) + " s" minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" - return random_select([seconds, minutes, hours, str(randint(-1000, -1))], - [0.3, 0.6, 0.9, 1]) + return random_select([seconds, minutes, hours, + str(random.randint(-1000, -1))], + [0.3, 0.3, 0.3, 0.1]) def station_name(name): '''Replaces '&' with '/' with a 50% chance''' - return choice([name, name.replace("&", "/")]) + return random.choice([name, name.replace("&", "/")]) -def usertype(user): +def user_type(user): '''Manipulates the user type string''' - return choice([user, user.upper(), user.lower(), - "sub" if user == "Subscriber" else user, - "cust" if user == "Customer" else user]) + return random.choice([user, user.upper(), user.lower(), + "sub" if user == "Subscriber" else user, + "cust" if user == "Customer" else user]) def gender(s): '''Manipulates the gender string''' - return choice([s, s.upper(), s.lower(), - s[0] if len(s) > 0 else "", - s[0].lower() if len(s) > 0 else ""]) + return random.choice([s, s.upper(), s.lower(), + s[0] if len(s) > 0 else "", + s[0].lower() if len(s) > 0 else ""]) -def convertAngle(angle): +def convert_angle(angle): '''Converts long and lat to DMS notation''' degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) new_angle = str(degrees) + u"\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' - return random_select([str(angle), new_angle], cum_weights=[0.55, 1]) + return random_select([str(angle), new_angle], [0.55, 0.45]) def dirty_data(proc_func, allow_none): '''Master function returns a user defined function that transforms the column data''' def udf(col_value): - seed(hash(col_value) + time_ns()) + random.seed(hash(col_value) + time_ns()) if col_value is None: return col_value elif allow_none: return random_select([None, proc_func(col_value)], - cum_weights=[0.05, 1]) + [0.05, 0.95]) else: return proc_func(col_value) return udf @@ -101,19 +102,19 @@ def id(x): # Declare data transformations for each column in dataframe udfs = [ - (dirty_data(tripduration, True), StringType()), # tripduration + (dirty_data(trip_duration, True), StringType()), # tripduration (dirty_data(id, True), StringType()), # starttime (dirty_data(id, True), StringType()), # stoptime (id, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convertAngle, True), StringType()), # start_station_latitude - (dirty_data(convertAngle, True), StringType()), # start_station_longitude + (dirty_data(convert_angle, True), StringType()), # start_station_latitude + (dirty_data(convert_angle, True), StringType()), # start_station_longitude (id, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convertAngle, True), StringType()), # end_station_latitude - (dirty_data(convertAngle, True), StringType()), # end_station_longitude + (dirty_data(convert_angle, True), StringType()), # end_station_latitude + (dirty_data(convert_angle, True), StringType()), # end_station_longitude (id, IntegerType()), # bikeid - (dirty_data(usertype, False), StringType()), # usertype + (dirty_data(user_type, False), StringType()), # usertype (id, IntegerType()), # 
birth_year (dirty_data(gender, False), StringType()), # gender (id, StringType()), # customer_plan @@ -136,7 +137,7 @@ def id(x): if upload: # Create BigQuery Dataset client = bigquery.Client() - dataset_id = '{}.new_york_citibike_trips'.format(client.project) + dataset_id = f'{client.project}.new_york_citibike_trips' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index 12730a3a6fe..f78c8cd120b 100644 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -1,4 +1,7 @@ # Submit a PySpark job via the Cloud Dataproc Jobs API +# Requires having CLUSTER_NAME and BUCKET_NAME set as +# environment variables + gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ diff --git a/data-science-onramp/data-ingestion/setup-test.py b/data-science-onramp/data-ingestion/setup_test.py similarity index 59% rename from data-science-onramp/data-ingestion/setup-test.py rename to data-science-onramp/data-ingestion/setup_test.py index d827c805818..54f3d20e902 100644 --- a/data-science-onramp/data-ingestion/setup-test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -11,62 +11,25 @@ import pytest -waiting_cluster_callback = False # Set global variables -project = os.environ['GCLOUD_PROJECT'] -region = "us-central1" -zone = "us-central1-a" -cluster_name = 'setup-test-{}'.format(str(uuid.uuid4())) -bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4())) +PROJECT = os.environ['GCLOUD_PROJECT'] +REGION = "us-central1" +ZONE = "us-central1-a" +CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' +BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' +BUCKET = None -@pytest.fixture(autouse=True) -def teardown(): - yield - - # Delete cluster - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{region}-dataproc.googleapis.com:443' - }) - - try: - operation = cluster_client.delete_cluster(project, region, - cluster_name) - operation.result() - except GoogleAPICallError: - pass - - # Delete GCS bucket - storage_client = storage.Client() - try: - bucket = storage_client.get_bucket(bucket_name) - bucket.delete(force=True) - except NotFound: - pass - - -def test_setup(capsys): - '''Tests setup.py by submitting it to a dataproc cluster''' - - # Create GCS Bucket - storage_client = storage.Client() - bucket = storage_client.create_bucket(bucket_name) - - # Upload file - destination_blob_name = "setup.py" - blob = bucket.blob(destination_blob_name) - blob.upload_from_filename("setup.py") - - job_file_name = "gs://" + bucket_name + "/setup.py" +@pytest.fixture(autouse=True) +def setup_and_teardown_cluster(): # Create cluster configuration zone_uri = \ - 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( - project, zone) + f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}' cluster_data = { - 'project_id': project, - 'cluster_name': cluster_name, + 'project_id': PROJECT, + 'cluster_name': CLUSTER_NAME, 'config': { 'gce_cluster_config': { 'zone_uri': zone_uri, @@ -99,27 +62,59 @@ def test_setup(capsys): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - cluster = 
cluster_client.create_cluster(project, region, cluster_data) - cluster.add_done_callback(callback) + operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) # Wait for cluster to provision - global waiting_cluster_callback - waiting_cluster_callback = True + operation.result() - wait_for_cluster_creation() + yield + + # Delete cluster + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' + }) + + operation = cluster_client.delete_cluster(PROJECT, REGION, + CLUSTER_NAME) + operation.result() + + +@pytest.fixture(autouse=True) +def setup_and_teardown_bucket(): + global BUCKET + # Create GCS Bucket + storage_client = storage.Client() + BUCKET = storage_client.create_bucket(BUCKET_NAME) + + yield + + # Delete GCS bucket + storage_client = storage.Client() + bucket = storage_client.get_bucket(BUCKET_NAME) + bucket.delete(force=True) + +def test_setup(capsys): + '''Tests setup.py by submitting it to a dataproc cluster''' + + # Upload file + destination_blob_name = "setup.py" + blob = BUCKET.blob(destination_blob_name) + blob.upload_from_filename("setup.py") + + job_file_name = "gs://" + BUCKET_NAME + "/setup.py" # Create job configuration job_details = { 'placement': { - 'cluster_name': cluster_name + 'cluster_name': CLUSTER_NAME }, 'pyspark_job': { 'main_python_file_uri': job_file_name, 'args': [ - bucket_name, + BUCKET_NAME, "--test", ], "jar_file_uris": [ @@ -130,25 +125,21 @@ def test_setup(capsys): # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - result = job_client.submit_job(project_id=project, region=region, + response = job_client.submit_job(project_id=PROJECT, region=REGION, job=job_details) - job_id = result.reference.job_id + job_id = response.reference.job_id print('Submitted job \"{}\".'.format(job_id)) # Wait for job to complete - wait_for_job(job_client, job_id) + result = response.add_done_callback(callback) # Get job output - cluster_info = cluster_client.get_cluster(project, region, cluster_name) - bucket = storage_client.get_bucket(cluster_info.config.config_bucket) - output_blob = ( - 'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000' - .format(cluster_info.cluster_uuid, job_id)) - out = bucket.blob(output_blob).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri() + ".000000000" + output = BUCKET.blob(output_location).download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -186,25 +177,5 @@ def test_setup(capsys): # Missing data assert "null" in out - def callback(operation_future): - '''Sets a flag to stop waiting''' - global waiting_cluster_callback - waiting_cluster_callback = False - - -def wait_for_cluster_creation(): - '''Waits for cluster to create''' - while True: - if not waiting_cluster_callback: - break - - -def wait_for_job(job_client, job_id): - '''Waits for job to finish''' - while True: - job = job_client.get_job(project, region, job_id) - assert job.status.State.Name(job.status.state) != "ERROR" - - if job.status.State.Name(job.status.state) == "DONE": - return + return operation_future.result() From 1abf664ed47ec3656a293c46a9ff7385b748bbab Mon Sep 17 00:00:00 2001 From: vuppalli Date: Mon, 8 Jun 2020 11:26:33 -0400 Subject: [PATCH 28/59] change submit job --- data-science-onramp/data-ingestion/noxfile.py | 
225 ++++++++++++++++++ .../data-ingestion/setup_test.py | 9 +- 2 files changed, 228 insertions(+), 6 deletions(-) create mode 100644 data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/noxfile.py b/data-science-onramp/data-ingestion/noxfile.py new file mode 100644 index 00000000000..b23055f14a6 --- /dev/null +++ b/data-science-onramp/data-ingestion/noxfile.py @@ -0,0 +1,225 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +from pathlib import Path +import sys + +import nox + + +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING +# DO NOT EDIT THIS FILE EVER! +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING + +# Copy `noxfile_config.py` to your directory and modify it instead. + + +# `TEST_CONFIG` dict is a configuration hook that allows users to +# modify the test configurations. The values here should be in sync +# with `noxfile_config.py`. Users will copy `noxfile_config.py` into +# their directory and modify it. + +TEST_CONFIG = { + # You can opt out from the test for specific Python versions. + 'ignored_versions': ["2.7"], + + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + 'gcloud_project_env': 'GCLOUD_PROJECT', + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + 'envs': {}, +} + + +try: + # Ensure we can import noxfile_config in the project's directory. + sys.path.append('.') + from noxfile_config import TEST_CONFIG_OVERRIDE +except ImportError as e: + print("No user noxfile_config found: detail: {}".format(e)) + TEST_CONFIG_OVERRIDE = {} + +# Update the TEST_CONFIG with the user supplied values. +TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) + + +def get_pytest_env_vars(): + """Returns a dict for pytest invocation.""" + ret = {} + + # Override the GCLOUD_PROJECT and the alias. + env_key = TEST_CONFIG['gcloud_project_env'] + # This should error out if not set. + ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] + ret['GCLOUD_PROJECT'] = os.environ[env_key] + + # Apply user supplied envs. + ret.update(TEST_CONFIG['envs']) + return ret + + +# DO NOT EDIT - automatically generated. +# All versions used to tested samples. +ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] + +# Any default versions that should be ignored. 
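# A quick illustration, assuming the default TEST_CONFIG shown above: the two
# assignments below reduce to
#   IGNORED_VERSIONS = ["2.7"]
#   TESTED_VERSIONS = ["3.6", "3.7", "3.8"]
# i.e. every entry of ALL_VERSIONS except the versions listed under
# 'ignored_versions'.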
+IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] + +TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) + +INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) +# +# Style Checks +# + + +def _determine_local_import_names(start_dir): + """Determines all import names that should be considered "local". + + This is used when running the linter to insure that import order is + properly checked. + """ + file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] + return [ + basename + for basename, extension in file_ext_pairs + if extension == ".py" + or os.path.isdir(os.path.join(start_dir, basename)) + and basename not in ("__pycache__") + ] + + +# Linting with flake8. +# +# We ignore the following rules: +# E203: whitespace before ‘:’ +# E266: too many leading ‘#’ for block comment +# E501: line too long +# I202: Additional newline in a section of imports +# +# We also need to specify the rules which are ignored by default: +# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] +FLAKE8_COMMON_ARGS = [ + "--show-source", + "--builtin=gettext", + "--max-complexity=20", + "--import-order-style=google", + "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", + "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", + "--max-line-length=88", +] + + +@nox.session +def lint(session): + session.install("flake8", "flake8-import-order") + + local_names = _determine_local_import_names(".") + args = FLAKE8_COMMON_ARGS + [ + "--application-import-names", + ",".join(local_names), + "." + ] + session.run("flake8", *args) + + +# +# Sample Tests +# + + +PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] + + +def _session_tests(session, post_install=None): + """Runs py.test for a particular project.""" + if os.path.exists("requirements.txt"): + session.install("-r", "requirements.txt") + + if os.path.exists("requirements-test.txt"): + session.install("-r", "requirements-test.txt") + + if INSTALL_LIBRARY_FROM_SOURCE: + session.install("-e", _get_repo_root()) + + if post_install: + post_install(session) + + session.run( + "pytest", + *(PYTEST_COMMON_ARGS + session.posargs), + # Pytest will return 5 when no tests are collected. This can happen + # on travis where slow and flaky tests are excluded. + # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html + success_codes=[0, 5], + env=get_pytest_env_vars() + ) + + +@nox.session(python=ALL_VERSIONS) +def py(session): + """Runs py.test for a sample using the specified version of Python.""" + if session.python in TESTED_VERSIONS: + _session_tests(session) + else: + session.skip("SKIPPED: {} tests are disabled for this sample.".format( + session.python + )) + + +# +# Readmegen +# + + +def _get_repo_root(): + """ Returns the root folder of the project. """ + # Get root of this repository. Assume we don't have directories nested deeper than 10 items. 
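# (The loop below walks up at most 10 parent directories, returning the first
# one that contains a .git folder; if none is found, the function raises the
# "Unable to detect repository root." exception at the bottom.)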
+ p = Path(os.getcwd()) + for i in range(10): + if p is None: + break + if Path(p / ".git").exists(): + return str(p) + p = p.parent + raise Exception("Unable to detect repository root.") + + +GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) + + +@nox.session +@nox.parametrize("path", GENERATED_READMES) +def readmegen(session, path): + """(Re-)generates the readme for a sample.""" + session.install("jinja2", "pyyaml") + dir_ = os.path.dirname(path) + + if os.path.exists(os.path.join(dir_, "requirements.txt")): + session.install("-r", os.path.join(dir_, "requirements.txt")) + + in_file = os.path.join(dir_, "README.rst.in") + session.run( + "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file + ) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 54f3d20e902..919dcc4f35c 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -128,17 +128,14 @@ def test_setup(capsys): 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - response = job_client.submit_job(project_id=PROJECT, region=REGION, + response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, job=job_details) - job_id = response.reference.job_id - print('Submitted job \"{}\".'.format(job_id)) - # Wait for job to complete - result = response.add_done_callback(callback) + result = response.result() # Get job output - output_location = result.driver_output_resource_uri() + ".000000000" + output_location = result.driver_output_resource_uri + ".000000000" output = BUCKET.blob(output_location).download_as_string().decode("utf-8") # tripDuration From c6007249ebff5d5f827fa8482c8c488c9eb46a06 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 8 Jun 2020 14:43:48 -0400 Subject: [PATCH 29/59] address code structure and global variable issues --- data-science-onramp/data-ingestion/noxfile.py | 225 ------------------ data-science-onramp/data-ingestion/setup.py | 125 +++++----- .../data-ingestion/setup_test.py | 17 +- 3 files changed, 78 insertions(+), 289 deletions(-) delete mode 100644 data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/noxfile.py b/data-science-onramp/data-ingestion/noxfile.py deleted file mode 100644 index b23055f14a6..00000000000 --- a/data-science-onramp/data-ingestion/noxfile.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -from pathlib import Path -import sys - -import nox - - -# WARNING - WARNING - WARNING - WARNING - WARNING -# WARNING - WARNING - WARNING - WARNING - WARNING -# DO NOT EDIT THIS FILE EVER! -# WARNING - WARNING - WARNING - WARNING - WARNING -# WARNING - WARNING - WARNING - WARNING - WARNING - -# Copy `noxfile_config.py` to your directory and modify it instead. 
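# A minimal sketch of the noxfile_config.py override mentioned in the comment
# above; the keys mirror the default TEST_CONFIG in this noxfile, and the
# values here are illustrative, not settings taken from this change.
TEST_CONFIG_OVERRIDE = {
    # Opt out of testing on specific Python versions.
    'ignored_versions': ["2.7"],
    # Use a build-specific Cloud project, as the TEST_CONFIG comment suggests.
    'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',
    # Values injected into the test environment; never put secrets here.
    'envs': {},
}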
- - -# `TEST_CONFIG` dict is a configuration hook that allows users to -# modify the test configurations. The values here should be in sync -# with `noxfile_config.py`. Users will copy `noxfile_config.py` into -# their directory and modify it. - -TEST_CONFIG = { - # You can opt out from the test for specific Python versions. - 'ignored_versions': ["2.7"], - - # An envvar key for determining the project id to use. Change it - # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a - # build specific Cloud project. You can also use your own string - # to use your own Cloud project. - 'gcloud_project_env': 'GCLOUD_PROJECT', - # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', - - # A dictionary you want to inject into your test. Don't put any - # secrets here. These values will override predefined values. - 'envs': {}, -} - - -try: - # Ensure we can import noxfile_config in the project's directory. - sys.path.append('.') - from noxfile_config import TEST_CONFIG_OVERRIDE -except ImportError as e: - print("No user noxfile_config found: detail: {}".format(e)) - TEST_CONFIG_OVERRIDE = {} - -# Update the TEST_CONFIG with the user supplied values. -TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) - - -def get_pytest_env_vars(): - """Returns a dict for pytest invocation.""" - ret = {} - - # Override the GCLOUD_PROJECT and the alias. - env_key = TEST_CONFIG['gcloud_project_env'] - # This should error out if not set. - ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] - ret['GCLOUD_PROJECT'] = os.environ[env_key] - - # Apply user supplied envs. - ret.update(TEST_CONFIG['envs']) - return ret - - -# DO NOT EDIT - automatically generated. -# All versions used to tested samples. -ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] - -# Any default versions that should be ignored. -IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] - -TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) - -INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) -# -# Style Checks -# - - -def _determine_local_import_names(start_dir): - """Determines all import names that should be considered "local". - - This is used when running the linter to insure that import order is - properly checked. - """ - file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] - return [ - basename - for basename, extension in file_ext_pairs - if extension == ".py" - or os.path.isdir(os.path.join(start_dir, basename)) - and basename not in ("__pycache__") - ] - - -# Linting with flake8. -# -# We ignore the following rules: -# E203: whitespace before ‘:’ -# E266: too many leading ‘#’ for block comment -# E501: line too long -# I202: Additional newline in a section of imports -# -# We also need to specify the rules which are ignored by default: -# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] -FLAKE8_COMMON_ARGS = [ - "--show-source", - "--builtin=gettext", - "--max-complexity=20", - "--import-order-style=google", - "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", - "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", - "--max-line-length=88", -] - - -@nox.session -def lint(session): - session.install("flake8", "flake8-import-order") - - local_names = _determine_local_import_names(".") - args = FLAKE8_COMMON_ARGS + [ - "--application-import-names", - ",".join(local_names), - "." 
- ] - session.run("flake8", *args) - - -# -# Sample Tests -# - - -PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] - - -def _session_tests(session, post_install=None): - """Runs py.test for a particular project.""" - if os.path.exists("requirements.txt"): - session.install("-r", "requirements.txt") - - if os.path.exists("requirements-test.txt"): - session.install("-r", "requirements-test.txt") - - if INSTALL_LIBRARY_FROM_SOURCE: - session.install("-e", _get_repo_root()) - - if post_install: - post_install(session) - - session.run( - "pytest", - *(PYTEST_COMMON_ARGS + session.posargs), - # Pytest will return 5 when no tests are collected. This can happen - # on travis where slow and flaky tests are excluded. - # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html - success_codes=[0, 5], - env=get_pytest_env_vars() - ) - - -@nox.session(python=ALL_VERSIONS) -def py(session): - """Runs py.test for a sample using the specified version of Python.""" - if session.python in TESTED_VERSIONS: - _session_tests(session) - else: - session.skip("SKIPPED: {} tests are disabled for this sample.".format( - session.python - )) - - -# -# Readmegen -# - - -def _get_repo_root(): - """ Returns the root folder of the project. """ - # Get root of this repository. Assume we don't have directories nested deeper than 10 items. - p = Path(os.getcwd()) - for i in range(10): - if p is None: - break - if Path(p / ".git").exists(): - return str(p) - p = p.parent - raise Exception("Unable to detect repository root.") - - -GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) - - -@nox.session -@nox.parametrize("path", GENERATED_READMES) -def readmegen(session, path): - """(Re-)generates the readme for a sample.""" - session.install("jinja2", "pyyaml") - dir_ = os.path.dirname(path) - - if os.path.exists(os.path.join(dir_, "requirements.txt")): - session.install("-r", os.path.join(dir_, "requirements.txt")) - - in_file = os.path.join(dir_, "README.rst.in") - session.run( - "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file - ) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index dc869903c84..da162e1c91d 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -12,31 +12,11 @@ from pyspark.sql.types import IntegerType, StringType -# Create a SparkSession under the name "setup". Viewable via the Spark UI -spark = SparkSession.builder.appName("setup").getOrCreate() +BUCKET_NAME = sys.argv[1] +TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -bucket_name = sys.argv[1] -upload = True # Whether to upload data to BigQuery - -# Check whether or not results should be uploaded -if len(sys.arv) > 1: - upload = False - print("Not uploading results to BigQuery") -else: - print("Results will be uploaded to BigQuery") - -table = "bigquery-public-data.new_york_citibike.citibike_trips" - -# Check if table exists -try: - df = spark.read.format('bigquery').option('table', table).load() -except Py4JJavaError: - print(f"{table} does not exist. 
") - sys.exit(0) # START MAKING DATA DIRTY - - def random_select(items, weights): '''Picks an item according to the cumulative weights''' return random.choices(items, weights=weights, k=1)[0] @@ -81,6 +61,8 @@ def convert_angle(angle): return random_select([str(angle), new_angle], [0.55, 0.45]) +# This function is nested since a UserDefinedFunction is +# expected to take a single argument def dirty_data(proc_func, allow_none): '''Master function returns a user defined function that transforms the column data''' @@ -99,42 +81,9 @@ def udf(col_value): def id(x): return x +def write_to_bigquery(df): + '''Write a dataframe to BigQuery''' -# Declare data transformations for each column in dataframe -udfs = [ - (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(id, True), StringType()), # starttime - (dirty_data(id, True), StringType()), # stoptime - (id, IntegerType()), # start_station_id - (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convert_angle, True), StringType()), # start_station_latitude - (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (id, IntegerType()), # end_station_id - (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convert_angle, True), StringType()), # end_station_latitude - (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (id, IntegerType()), # bikeid - (dirty_data(user_type, False), StringType()), # usertype - (id, IntegerType()), # birth_year - (dirty_data(gender, False), StringType()), # gender - (id, StringType()), # customer_plan -] - -# Apply dirty transformations to df -names = df.schema.names -new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) - for udf, column, name in zip(udfs, df.columns, names)]) - -# Duplicate about 0.01% of the rows -dup_df = new_df.sample(False, 0.0001, seed=42) - -# Create final dirty dataframe -df = new_df.union(dup_df) -df.sample(False, 0.0001, seed=50).show(n=200) -print("Dataframe sample printed") - -# Write to BigQuery -if upload: # Create BigQuery Dataset client = bigquery.Client() dataset_id = f'{client.project}.new_york_citibike_trips' @@ -143,8 +92,68 @@ def id(x): dataset = client.create_dataset(dataset) # Saving the data to BigQuery - spark.conf.set('temporaryGcsBucket', bucket_name) + spark.conf.set('temporaryGcsBucket', BUCKET_NAME) df.write.format('bigquery') \ .option('table', dataset_id + ".RAW_DATA") \ .save() + +def main(): + # Create a SparkSession under the name "setup". Viewable via the Spark UI + spark = SparkSession.builder.appName("setup").getOrCreate() + + upload = True # Whether to upload data to BigQuery + + # Check whether or not results should be uploaded + if len(sys.argv) > 1: + upload = False + print("Not uploading results to BigQuery") + else: + print("Results will be uploaded to BigQuery") + + # Check if table exists + try: + df = spark.read.format('bigquery').option('table', TABLE).load() + except Py4JJavaError: + print(f"{TABLE} does not exist. 
") + sys.exit(0) + + # Declare data transformations for each column in dataframe + udfs = [ + (dirty_data(trip_duration, True), StringType()), # tripduration + (dirty_data(id, True), StringType()), # starttime + (dirty_data(id, True), StringType()), # stoptime + (id, IntegerType()), # start_station_id + (dirty_data(station_name, False), StringType()), # start_station_name + (dirty_data(convert_angle, True), StringType()), # start_station_latitude + (dirty_data(convert_angle, True), StringType()), # start_station_longitude + (id, IntegerType()), # end_station_id + (dirty_data(station_name, False), StringType()), # end_station_name + (dirty_data(convert_angle, True), StringType()), # end_station_latitude + (dirty_data(convert_angle, True), StringType()), # end_station_longitude + (id, IntegerType()), # bikeid + (dirty_data(user_type, False), StringType()), # usertype + (id, IntegerType()), # birth_year + (dirty_data(gender, False), StringType()), # gender + (id, StringType()), # customer_plan + ] + + # Apply dirty transformations to df + names = df.schema.names + new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) + for udf, column, name in zip(udfs, df.columns, names)]) + + # Duplicate about 0.01% of the rows + dup_df = new_df.sample(False, 0.0001, seed=42) + + # Create final dirty dataframe + df = new_df.union(dup_df) + df.sample(False, 0.0001, seed=50).show(n=200) + print("Dataframe sample printed") + + if upload: + write_to_bigquery(df) + + +if __name__ == '__main__': + main() diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 919dcc4f35c..e62d2cc1355 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -52,7 +52,7 @@ def setup_and_teardown_cluster(): } ], "software_config": { - "image_version": "1.5.4-debian10", + "image_version": "1.4-debian10", "optional_components": [ "ANACONDA" ], @@ -134,9 +134,17 @@ def test_setup(capsys): # Wait for job to complete result = response.result() + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + }) + + cluster_info = cluster_client.get_cluster(PROJECT, REGION, CLUSTER_NAME) + # Get job output - output_location = result.driver_output_resource_uri + ".000000000" - output = BUCKET.blob(output_location).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri + "000000000" # + "driveroutput.000000000" + storage_client = storage.Client() + bucket = storage_client.get_bucket(cluster_info.config.config_bucket) + output = bucket.blob(output_location).download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -173,6 +181,3 @@ def test_setup(capsys): # Missing data assert "null" in out - -def callback(operation_future): - return operation_future.result() From ce04a6f8a578d17d6d7c592ce916bd700684666c Mon Sep 17 00:00:00 2001 From: vuppalli Date: Mon, 8 Jun 2020 19:12:09 -0400 Subject: [PATCH 30/59] get dataproc job output and fix linting --- .gitignore | 1 + data-science-onramp/data-ingestion/setup.py | 14 ++++++---- .../data-ingestion/setup_test.py | 28 +++++++++---------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index c827e035649..369e7983b52 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ credentials.dat .DS_store env/ .idea +data-science-onramp/data-ingestion/noxfile.py diff --git 
a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index da162e1c91d..09046b75879 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -12,7 +12,7 @@ from pyspark.sql.types import IntegerType, StringType -BUCKET_NAME = sys.argv[1] +BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" @@ -81,7 +81,8 @@ def udf(col_value): def id(x): return x -def write_to_bigquery(df): + +def write_to_bigquery(spark, df): '''Write a dataframe to BigQuery''' # Create BigQuery Dataset @@ -98,6 +99,7 @@ def write_to_bigquery(df): .option('table', dataset_id + ".RAW_DATA") \ .save() + def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() @@ -143,16 +145,16 @@ def main(): new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) for udf, column, name in zip(udfs, df.columns, names)]) + new_df.sample(False, 0.0001, seed=50).show(n=100) + # Duplicate about 0.01% of the rows - dup_df = new_df.sample(False, 0.0001, seed=42) + dup_df = new_df.sample(True, 0.0001, seed=42) # Create final dirty dataframe df = new_df.union(dup_df) - df.sample(False, 0.0001, seed=50).show(n=200) - print("Dataframe sample printed") if upload: - write_to_bigquery(df) + write_to_bigquery(spark, df) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index e62d2cc1355..f55a155bc75 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -3,11 +3,8 @@ import uuid -from google.api_core.exceptions import GoogleAPICallError - from google.cloud import dataproc_v1 as dataproc from google.cloud import storage -from google.cloud.exceptions import NotFound import pytest @@ -52,7 +49,7 @@ def setup_and_teardown_cluster(): } ], "software_config": { - "image_version": "1.4-debian10", + "image_version": "1.5.4-debian10", "optional_components": [ "ANACONDA" ], @@ -96,6 +93,7 @@ def setup_and_teardown_bucket(): bucket = storage_client.get_bucket(BUCKET_NAME) bucket.delete(force=True) + def test_setup(capsys): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -129,22 +127,15 @@ def test_setup(capsys): }) response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, - job=job_details) + job=job_details) # Wait for job to complete result = response.result() - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) - }) - - cluster_info = cluster_client.get_cluster(PROJECT, REGION, CLUSTER_NAME) - # Get job output - output_location = result.driver_output_resource_uri + "000000000" # + "driveroutput.000000000" - storage_client = storage.Client() - bucket = storage_client.get_bucket(cluster_info.config.config_bucket) - output = bucket.blob(output_location).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri + ".000000000" + blob = get_blob_from_path(output_location) + out = blob.download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -181,3 +172,10 @@ def test_setup(capsys): # Missing data assert "null" in out + + +def get_blob_from_path(path): + bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] + bucket = storage.Client().get_bucket(bucket_name) + output_location = 
re.search("google-cloud-dataproc.+", path).group(0) + return bucket.blob(output_location) From ef2d2b3514c5c83ccc403e93baad4b4951168ea3 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 9 Jun 2020 15:32:02 -0400 Subject: [PATCH 31/59] fix PR comments --- .gitignore | 30 ----- data-science-onramp/data-ingestion/setup.py | 28 ++-- .../data-ingestion/setup_test.py | 125 ++++++++---------- 3 files changed, 73 insertions(+), 110 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 369e7983b52..00000000000 --- a/.gitignore +++ /dev/null @@ -1,30 +0,0 @@ -.coveralls.yml -*.pyc -.coverage -.tox -.pytest_cache -.ipynb_checkpoints -.executed_notebooks -coverage.xml -python-docs-samples.json -service-account.json -client-secrets.json -__pycache__ -*db\.sqlite3 -managed_vms/django_tutorial/static/* -**/migrations/* -lib -testing/resources/test-env.sh -testing/resources/service-account.json -testing/resources/client-secrets.json -secrets.tar -.cache -junit.xml -credentials.dat -.nox -.vscode/ -*sponge_log.xml -.DS_store -env/ -.idea -data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 09046b75879..cf61f81562a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -46,8 +46,8 @@ def user_type(user): def gender(s): '''Manipulates the gender string''' - return random.choice([s, s.upper(), s.lower(), - s[0] if len(s) > 0 else "", + return random.choice([s.upper(), s.lower(), + s[0].upper() if len(s) > 0 else "", s[0].lower() if len(s) > 0 else ""]) @@ -78,7 +78,9 @@ def udf(col_value): return udf -def id(x): +# This function is required because we need to apply a +# function for every column and some columns do not change +def identity(x): return x @@ -118,26 +120,26 @@ def main(): df = spark.read.format('bigquery').option('table', TABLE).load() except Py4JJavaError: print(f"{TABLE} does not exist. 
") - sys.exit(0) + return # Declare data transformations for each column in dataframe udfs = [ (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(id, True), StringType()), # starttime - (dirty_data(id, True), StringType()), # stoptime - (id, IntegerType()), # start_station_id + (dirty_data(identity, True), StringType()), # starttime + (dirty_data(identity, True), StringType()), # stoptime + (identity, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name (dirty_data(convert_angle, True), StringType()), # start_station_latitude (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (id, IntegerType()), # end_station_id + (identity, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name (dirty_data(convert_angle, True), StringType()), # end_station_latitude (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (id, IntegerType()), # bikeid + (identity, IntegerType()), # bikeid (dirty_data(user_type, False), StringType()), # usertype - (id, IntegerType()), # birth_year + (identity, IntegerType()), # birth_year (dirty_data(gender, False), StringType()), # gender - (id, StringType()), # customer_plan + (identity, StringType()), # customer_plan ] # Apply dirty transformations to df @@ -145,10 +147,10 @@ def main(): new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) for udf, column, name in zip(udfs, df.columns, names)]) - new_df.sample(False, 0.0001, seed=50).show(n=100) + new_df.sample(False, 0.0001).show(n=100) # Duplicate about 0.01% of the rows - dup_df = new_df.sample(True, 0.0001, seed=42) + dup_df = new_df.sample(True, 0.0001) # Create final dirty dataframe df = new_df.union(dup_df) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index f55a155bc75..aab08230028 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -12,24 +12,36 @@ # Set global variables PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" -ZONE = "us-central1-a" CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' - -BUCKET = None +DESTINATION_BLOB_NAME = "setup.py" +JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' +JOB_DETAILS = { # Job configuration + 'placement': { + 'cluster_name': CLUSTER_NAME + }, + 'pyspark_job': { + 'main_python_file_uri': JOB_FILE_NAME, + 'args': [ + BUCKET_NAME, + "--test", + ], + "jar_file_uris": [ + "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" + ], + }, +} @pytest.fixture(autouse=True) def setup_and_teardown_cluster(): # Create cluster configuration - zone_uri = \ - f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}' cluster_data = { 'project_id': PROJECT, 'cluster_name': CLUSTER_NAME, 'config': { 'gce_cluster_config': { - 'zone_uri': zone_uri, + 'zone_uri': '', "metadata": { "PIP_PACKAGES": "google-cloud-storage" }, @@ -59,9 +71,8 @@ def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) # Wait for cluster to provision @@ -70,10 +81,6 @@ def setup_and_teardown_cluster(): yield # Delete cluster - cluster_client = 
dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' - }) - operation = cluster_client.delete_cluster(PROJECT, REGION, CLUSTER_NAME) operation.result() @@ -81,53 +88,41 @@ def setup_and_teardown_cluster(): @pytest.fixture(autouse=True) def setup_and_teardown_bucket(): - global BUCKET # Create GCS Bucket storage_client = storage.Client() - BUCKET = storage_client.create_bucket(BUCKET_NAME) + bucket = storage_client.create_bucket(BUCKET_NAME) + + # Upload file + blob = bucket.blob(DESTINATION_BLOB_NAME) + blob.upload_from_filename("setup.py") yield # Delete GCS bucket - storage_client = storage.Client() bucket = storage_client.get_bucket(BUCKET_NAME) bucket.delete(force=True) -def test_setup(capsys): - '''Tests setup.py by submitting it to a dataproc cluster''' +def get_blob_from_path(path): + bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] + bucket = storage.Client().get_bucket(bucket_name) + output_location = re.search("google-cloud-dataproc.+", path).group(0) + return bucket.blob(output_location) - # Upload file - destination_blob_name = "setup.py" - blob = BUCKET.blob(destination_blob_name) - blob.upload_from_filename("setup.py") - job_file_name = "gs://" + BUCKET_NAME + "/setup.py" - - # Create job configuration - job_details = { - 'placement': { - 'cluster_name': CLUSTER_NAME - }, - 'pyspark_job': { - 'main_python_file_uri': job_file_name, - 'args': [ - BUCKET_NAME, - "--test", - ], - "jar_file_uris": [ - "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" - ], - }, - } +def is_in_table(value, out): + return re.search(f"\| *{value}\|", out) + + +def test_setup(): + '''Tests setup.py by submitting it to a dataproc cluster''' # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, - job=job_details) + job=JOB_DETAILS) # Wait for job to complete result = response.result() @@ -150,32 +145,28 @@ def test_setup(capsys): assert re.search("20[0-9][0-9]\\|", out) # gender - assert "M" in out - assert "male" in out - assert "MALE" in out - assert "F" in out - assert "female" in out - assert "FEMALE" in out - assert "u" in out - assert "unknown" in out - assert "UNKNOWN" in out + assert is_in_table("M", out) + assert is_in_table("m", out) + assert is_in_table("male", out) + assert is_in_table("MALE", out) + assert is_in_table("F", out) + assert is_in_table("f", out) + assert is_in_table("female", out) + assert is_in_table("FEMALE", out) + assert is_in_table("U", out) + assert is_in_table("u", out) + assert is_in_table("unknown", out) + assert is_in_table("UNKNOWN", out) # customer_plan - assert "Subscriber" in out - assert "subscriber" in out - assert "SUBSCRIBER" in out - assert "sub" in out - assert "Customer" in out - assert "customer" in out - assert "CUSTOMER" in out - assert "cust" in out + assert is_in_table("Subscriber", out) + assert is_in_table("subscriber", out) + assert is_in_table("SUBSCRIBER", out) + assert is_in_table("sub", out) + assert is_in_table("Customer", out) + assert is_in_table("customer", out) + assert is_in_table("CUSTOMER", out) + assert is_in_table("cust", out) # Missing data - assert "null" in out - - -def get_blob_from_path(path): - bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] - bucket = storage.Client().get_bucket(bucket_name) - 
output_location = re.search("google-cloud-dataproc.+", path).group(0) - return bucket.blob(output_location) + assert is_in_table("null", out) From 93394a3aa0e8bfa9c0264b094d4e64033decd3a6 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 9 Jun 2020 16:01:52 -0400 Subject: [PATCH 32/59] linting and global vars --- .../data-ingestion/setup_test.py | 68 +++++++++---------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index aab08230028..e9358de912c 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -31,49 +31,47 @@ ], }, } - - -@pytest.fixture(autouse=True) -def setup_and_teardown_cluster(): - # Create cluster configuration - cluster_data = { - 'project_id': PROJECT, - 'cluster_name': CLUSTER_NAME, - 'config': { - 'gce_cluster_config': { - 'zone_uri': '', - "metadata": { - "PIP_PACKAGES": "google-cloud-storage" - }, +CLUSTER_DATA = { # Create cluster configuration + 'project_id': PROJECT, + 'cluster_name': CLUSTER_NAME, + 'config': { + 'gce_cluster_config': { + 'zone_uri': '', + "metadata": { + "PIP_PACKAGES": "google-cloud-storage" }, - 'master_config': { - 'num_instances': 1, - 'machine_type_uri': 'n1-standard-8' - }, - 'worker_config': { - 'num_instances': 6, - 'machine_type_uri': 'n1-standard-8' - }, - "initialization_actions": [ - { - "executable_file": ("gs://dataproc-initialization-actions/" - "python/pip-install.sh"), - } - ], - "software_config": { - "image_version": "1.5.4-debian10", - "optional_components": [ - "ANACONDA" - ], + }, + 'master_config': { + 'num_instances': 1, + 'machine_type_uri': 'n1-standard-8' + }, + 'worker_config': { + 'num_instances': 6, + 'machine_type_uri': 'n1-standard-8' + }, + "initialization_actions": [ + { + "executable_file": ("gs://dataproc-initialization-actions/" + "python/pip-install.sh"), } + ], + "software_config": { + "image_version": "1.5.4-debian10", + "optional_components": [ + "ANACONDA" + ], } } +} + +@pytest.fixture(autouse=True) +def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) + operation = cluster_client.create_cluster(PROJECT, REGION, CLUSTER_DATA) # Wait for cluster to provision operation.result() @@ -111,7 +109,7 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\| *{value}\|", out) + return re.search(f"\\| *{value}\\|", out) def test_setup(): From a6fc6e644316422da6d2c7e602676465920d46b6 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Wed, 10 Jun 2020 11:27:11 -0400 Subject: [PATCH 33/59] address Brad PR comments --- data-science-onramp/data-ingestion/setup.py | 34 +++++++------------ .../data-ingestion/setup_test.py | 3 -- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index cf61f81562a..7f5efa28e0a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,13 +1,10 @@ import random import sys - from time import time_ns from google.cloud import bigquery - from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession - from pyspark.sql.functions import UserDefinedFunction from pyspark.sql.types import IntegerType, StringType @@ 
-56,7 +53,7 @@ def convert_angle(angle): degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) - new_angle = str(degrees) + u"\u00B0" + \ + new_angle = str(degrees) + "\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' return random_select([str(angle), new_angle], [0.55, 0.45]) @@ -78,13 +75,7 @@ def udf(col_value): return udf -# This function is required because we need to apply a -# function for every column and some columns do not change -def identity(x): - return x - - -def write_to_bigquery(spark, df): +def write_to_bigquery(df): '''Write a dataframe to BigQuery''' # Create BigQuery Dataset @@ -95,10 +86,9 @@ def write_to_bigquery(spark, df): dataset = client.create_dataset(dataset) # Saving the data to BigQuery - spark.conf.set('temporaryGcsBucket', BUCKET_NAME) - df.write.format('bigquery') \ .option('table', dataset_id + ".RAW_DATA") \ + .option("temporaryGcsBucket", BUCKET_NAME) \ .save() @@ -109,7 +99,7 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if len(sys.argv) > 1: + if len(sys.argv) > 2: upload = False print("Not uploading results to BigQuery") else: @@ -125,21 +115,21 @@ def main(): # Declare data transformations for each column in dataframe udfs = [ (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(identity, True), StringType()), # starttime - (dirty_data(identity, True), StringType()), # stoptime - (identity, IntegerType()), # start_station_id + (dirty_data(lambda x: x, True), StringType()), # starttime + (dirty_data(lambda x: x, True), StringType()), # stoptime + (lambda x: x, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name (dirty_data(convert_angle, True), StringType()), # start_station_latitude (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (identity, IntegerType()), # end_station_id + (lambda x: x, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name (dirty_data(convert_angle, True), StringType()), # end_station_latitude (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (identity, IntegerType()), # bikeid + (lambda x: x, IntegerType()), # bikeid (dirty_data(user_type, False), StringType()), # usertype - (identity, IntegerType()), # birth_year + (lambda x: x, IntegerType()), # birth_year (dirty_data(gender, False), StringType()), # gender - (identity, StringType()), # customer_plan + (lambda x: x, StringType()), # customer_plan ] # Apply dirty transformations to df @@ -156,7 +146,7 @@ def main(): df = new_df.union(dup_df) if upload: - write_to_bigquery(spark, df) + write_to_bigquery(df) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index e9358de912c..2aa82535d79 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -1,14 +1,11 @@ import os import re - import uuid from google.cloud import dataproc_v1 as dataproc from google.cloud import storage - import pytest - # Set global variables PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" From 1c9f52694d286b7646af538193efdcb51d9b24a2 Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Thu, 11 Jun 2020 11:45:10 -0400 Subject: [PATCH 34/59] broken clean.py --- data-science-onramp/data-processing/clean.py | 44 ++++++++++++++++++++ 1 file changed, 
44 insertions(+) create mode 100644 data-science-onramp/data-processing/clean.py diff --git a/data-science-onramp/data-processing/clean.py b/data-science-onramp/data-processing/clean.py new file mode 100644 index 00000000000..0bca32d3299 --- /dev/null +++ b/data-science-onramp/data-processing/clean.py @@ -0,0 +1,44 @@ +import os +import sys + +from py4j.protocol import Py4JJavaError +from pyspark.sql import SparkSession +from pyspark.sql.functions import UserDefinedFunction, lit +from pyspark.sql.types import IntegerType, StringType + + +PROJECT_ID = sys.argv[1] +BUCKET_NAME = sys.argv[2] +TABLE = f'{PROJECT_ID}.new_york_citibike_trips.RAW_DATA' + +def station_name(name): + if name: + return name.replace('/', '&') + else: + return '' + +def main(): + '''...''' + # Create a SparkSession under the name 'clean'. Viewable via the Spark UI + spark = SparkSession.builder.appName('clean').getOrCreate() + + # Check if table exists + + try: + df = spark.read.format('bigquery').option('table', TABLE).load() + except Py4JJavaError: + print(f"{TABLE} does not exist. ") + return + + udf_map = { + 'start_station_name': (station_name, StringType()) + } + + for name, (func, col_type) in udf_map.items(): + df = df.withColumn(name, UserDefinedFunction(func, col_type)(name).alias(name)) + + df = spark.createDataframe + df.show(n=100) + +if __name__ == '__main__': + main() \ No newline at end of file From 327cf5b678a4bba2358281bcdb8be2b26aaf5ca1 Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Thu, 11 Jun 2020 11:49:49 -0400 Subject: [PATCH 35/59] Revert "broken clean.py" This reverts commit 580c8e1078e9480ef30d3083522fd2c467c4f1b1. --- data-science-onramp/data-processing/clean.py | 44 -------------------- 1 file changed, 44 deletions(-) delete mode 100644 data-science-onramp/data-processing/clean.py diff --git a/data-science-onramp/data-processing/clean.py b/data-science-onramp/data-processing/clean.py deleted file mode 100644 index 0bca32d3299..00000000000 --- a/data-science-onramp/data-processing/clean.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import sys - -from py4j.protocol import Py4JJavaError -from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction, lit -from pyspark.sql.types import IntegerType, StringType - - -PROJECT_ID = sys.argv[1] -BUCKET_NAME = sys.argv[2] -TABLE = f'{PROJECT_ID}.new_york_citibike_trips.RAW_DATA' - -def station_name(name): - if name: - return name.replace('/', '&') - else: - return '' - -def main(): - '''...''' - # Create a SparkSession under the name 'clean'. Viewable via the Spark UI - spark = SparkSession.builder.appName('clean').getOrCreate() - - # Check if table exists - - try: - df = spark.read.format('bigquery').option('table', TABLE).load() - except Py4JJavaError: - print(f"{TABLE} does not exist. 
") - return - - udf_map = { - 'start_station_name': (station_name, StringType()) - } - - for name, (func, col_type) in udf_map.items(): - df = df.withColumn(name, UserDefinedFunction(func, col_type)(name).alias(name)) - - df = spark.createDataframe - df.show(n=100) - -if __name__ == '__main__': - main() \ No newline at end of file From 4bf07ee52c06da34a387f65b8e63d83911364a8d Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 16 Jun 2020 11:29:46 -0400 Subject: [PATCH 36/59] optimize data ingestion --- data-science-onramp/data-ingestion/setup.py | 109 +++++++++--------- .../data-ingestion/setup_test.py | 14 +-- 2 files changed, 61 insertions(+), 62 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 7f5efa28e0a..33c7c728733 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,41 +1,43 @@ import random import sys -from time import time_ns from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction +from pyspark.sql.functions import UserDefinedFunction, when, expr from pyspark.sql.types import IntegerType, StringType BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" +RAW_DATASET_NAME = "new_york_citibike_trips5" +RAW_TABLE_NAME = "RAW_DATA" # START MAKING DATA DIRTY -def random_select(items, weights): - '''Picks an item according to the cumulative weights''' - return random.choices(items, weights=weights, k=1)[0] - - def trip_duration(duration): '''Converts trip duration to other units''' + if duration is None: + return None seconds = str(duration) + " s" minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" - return random_select([seconds, minutes, hours, + return random.choices([seconds, minutes, hours, str(random.randint(-1000, -1))], - [0.3, 0.3, 0.3, 0.1]) + weights=[0.3, 0.3, 0.3, 0.1])[0] def station_name(name): '''Replaces '&' with '/' with a 50% chance''' + if name is None: + return None return random.choice([name, name.replace("&", "/")]) def user_type(user): '''Manipulates the user type string''' + if user is None: + return None return random.choice([user, user.upper(), user.lower(), "sub" if user == "Subscriber" else user, "cust" if user == "Customer" else user]) @@ -43,6 +45,8 @@ def user_type(user): def gender(s): '''Manipulates the gender string''' + if s is None: + return None return random.choice([s.upper(), s.lower(), s[0].upper() if len(s) > 0 else "", s[0].lower() if len(s) > 0 else ""]) @@ -50,29 +54,15 @@ def gender(s): def convert_angle(angle): '''Converts long and lat to DMS notation''' + if angle is None: + return None degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) new_angle = str(degrees) + "\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' - return random_select([str(angle), new_angle], [0.55, 0.45]) - - -# This function is nested since a UserDefinedFunction is -# expected to take a single argument -def dirty_data(proc_func, allow_none): - '''Master function returns a user defined function - that transforms the column data''' - def udf(col_value): - random.seed(hash(col_value) + time_ns()) - if col_value is None: - return col_value - elif allow_none: - return random_select([None, proc_func(col_value)], - [0.05, 0.95]) - else: - return proc_func(col_value) - return udf + return 
random.choices([str(angle), new_angle], + weights=[0.55, 0.45])[0] def write_to_bigquery(df): @@ -80,17 +70,19 @@ def write_to_bigquery(df): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f'{client.project}.new_york_citibike_trips' + dataset_id = f'{client.project}.{RAW_DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', dataset_id + ".RAW_DATA") \ + .option('table', dataset_id + f".{RAW_TABLE_NAME}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() + print("Table successfully written to BigQuery") + def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI @@ -112,42 +104,49 @@ def main(): print(f"{TABLE} does not exist. ") return - # Declare data transformations for each column in dataframe - udfs = [ - (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(lambda x: x, True), StringType()), # starttime - (dirty_data(lambda x: x, True), StringType()), # stoptime - (lambda x: x, IntegerType()), # start_station_id - (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convert_angle, True), StringType()), # start_station_latitude - (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (lambda x: x, IntegerType()), # end_station_id - (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convert_angle, True), StringType()), # end_station_latitude - (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (lambda x: x, IntegerType()), # bikeid - (dirty_data(user_type, False), StringType()), # usertype - (lambda x: x, IntegerType()), # birth_year - (dirty_data(gender, False), StringType()), # gender - (lambda x: x, StringType()), # customer_plan + # Declare dictionary with keys column names and values user defined + # functions and return types + udf_map = { + 'tripduration': (trip_duration, StringType()), + 'start_station_name': (station_name, StringType()), + 'start_station_latitude': (convert_angle, StringType()), + 'start_station_longitude': (convert_angle, StringType()), + 'end_station_name': (station_name, StringType()), + 'end_station_latitude': (convert_angle, StringType()), + 'end_station_longitude': (convert_angle, StringType()), + 'usertype': (user_type, StringType()), + 'gender': (gender, StringType()), + } + + # Declare which columns to set some values to null randomly + null_columns = [ + 'tripduration', + 'starttime', + 'stoptime', + 'start_station_latitude', + 'start_station_longitude', + 'end_station_latitude', + 'end_station_longitude', ] - # Apply dirty transformations to df - names = df.schema.names - new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) - for udf, column, name in zip(udfs, df.columns, names)]) + # Dirty the columns + for name, udf in udf_map.items(): + df = df.withColumn(name, UserDefinedFunction(*udf)(name)) - new_df.sample(False, 0.0001).show(n=100) + # Randomly set about 5% of the values in some columns to null + for name in null_columns: + df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows - dup_df = new_df.sample(True, 0.0001) + dup_df = df.sample(True, 0.0001) # Create final dirty dataframe - df = new_df.union(dup_df) + df = df.union(dup_df) if upload: write_to_bigquery(df) - + else: + df.sample(True, 0.0001).show(n=500, truncate=False) if __name__ == 
'__main__': main() diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 2aa82535d79..8fb1938c843 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -106,7 +106,7 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\\| *{value}\\|", out) + return re.search(f"\\|{value} *\\|", out) def test_setup(): @@ -128,16 +128,16 @@ def test_setup(): out = blob.download_as_string().decode("utf-8") # tripDuration - assert re.search("[0-9] s", out) - assert re.search("[0-9] m", out) - assert re.search("[0-9] h", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) h", out) # station latitude & longitude - assert re.search(u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) # birth_year - assert re.search("19[0-9][0-9]\\|", out) - assert re.search("20[0-9][0-9]\\|", out) + assert is_in_table("19[0-9][0-9]", out) + assert is_in_table("20[0-9][0-9]", out) # gender assert is_in_table("M", out) From 8dbd3bc4a359cf4959c668aa802fa5c2e5dd2195 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 16 Jun 2020 11:54:59 -0400 Subject: [PATCH 37/59] fix linting errors --- data-science-onramp/data-ingestion/setup.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 33c7c728733..7308d13a37e 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -4,8 +4,8 @@ from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction, when, expr -from pyspark.sql.types import IntegerType, StringType +from pyspark.sql.functions import expr, UserDefinedFunction, when +from pyspark.sql.types import StringType BUCKET_NAME = sys.argv[1] @@ -23,8 +23,8 @@ def trip_duration(duration): minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" return random.choices([seconds, minutes, hours, - str(random.randint(-1000, -1))], - weights=[0.3, 0.3, 0.3, 0.1])[0] + str(random.randint(-1000, -1))], + weights=[0.3, 0.3, 0.3, 0.1])[0] def station_name(name): @@ -117,7 +117,7 @@ def main(): 'usertype': (user_type, StringType()), 'gender': (gender, StringType()), } - + # Declare which columns to set some values to null randomly null_columns = [ 'tripduration', @@ -148,5 +148,6 @@ def main(): else: df.sample(True, 0.0001).show(n=500, truncate=False) + if __name__ == '__main__': main() From 4cdd733184fc26fd380d978d21425f046385efc2 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 16 Jun 2020 18:21:20 -0400 Subject: [PATCH 38/59] fix minor style issues --- data-science-onramp/data-ingestion/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 7308d13a37e..b142aa8f37a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -10,7 +10,7 @@ BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -RAW_DATASET_NAME = "new_york_citibike_trips5" +RAW_DATASET_NAME = "new_york_citibike_trips" RAW_TABLE_NAME = "RAW_DATA" @@ -77,7 +77,7 @@ def 
write_to_bigquery(df): # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', dataset_id + f".{RAW_TABLE_NAME}") \ + .option('table', f"{dataset_id}.{RAW_TABLE_NAME}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() From 0769754bfeb70dc114bc4c176bfc8919517d9ed0 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 19 Jun 2020 17:47:23 -0400 Subject: [PATCH 39/59] remove pip from cluster config --- data-science-onramp/data-ingestion/setup_test.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 8fb1938c843..d8def350c8e 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -34,9 +34,6 @@ 'config': { 'gce_cluster_config': { 'zone_uri': '', - "metadata": { - "PIP_PACKAGES": "google-cloud-storage" - }, }, 'master_config': { 'num_instances': 1, @@ -46,12 +43,6 @@ 'num_instances': 6, 'machine_type_uri': 'n1-standard-8' }, - "initialization_actions": [ - { - "executable_file": ("gs://dataproc-initialization-actions/" - "python/pip-install.sh"), - } - ], "software_config": { "image_version": "1.5.4-debian10", "optional_components": [ From 52da79a9d687b4e39a575bb70552c966930e50ea Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 26 Jun 2020 19:24:21 -0400 Subject: [PATCH 40/59] load external datasets from url --- data-science-onramp/data-ingestion/setup.py | 74 ++++++++++++++++--- .../data-ingestion/setup_test.py | 13 +++- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index b142aa8f37a..06b8ce00689 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,17 +1,37 @@ import random import sys +import pandas as pd from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession from pyspark.sql.functions import expr, UserDefinedFunction, when -from pyspark.sql.types import StringType +from pyspark.sql.types import FloatType, StringType, StructField, StructType BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -RAW_DATASET_NAME = "new_york_citibike_trips" -RAW_TABLE_NAME = "RAW_DATA" +DATASET_NAME = "data_science_onramp" +RAW_TABLE_NAME = "new_york_citibike_trips" +EXTERNAL_DATASETS = { + "gas_prices": { + "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", + "schema": StructType([ + StructField("Date", StringType(), True), + StructField("New_York_State_Average_USD_per_Gal", + FloatType(), True), + StructField("Albany_Average_USD_per_Gal", FloatType(), True), + StructField("Blinghamton_Average_USD_per_Gal", FloatType(), True), + StructField("Buffalo_Average_USD_per_Gal", FloatType(), True), + StructField("Nassau_Average_USD_per_Gal", FloatType(), True), + StructField("New_York_City_Average_USD_per_Gal", + FloatType(), True), + StructField("Rochester_Average_USD_per_Gal", FloatType(), True), + StructField("Syracuse_Average_USD_per_Gal", FloatType(), True), + StructField("Utica_Average_USD_per_Gal", FloatType(), True), + ]), + }, +} # START MAKING DATA DIRTY @@ -65,23 +85,39 @@ def convert_angle(angle): weights=[0.55, 0.45])[0] -def write_to_bigquery(df): - '''Write a dataframe to BigQuery''' - +def create_bigquery_dataset(): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f'{client.project}.{RAW_DATASET_NAME}' + 
dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) + +def write_to_bigquery(df, table_name): + '''Write a dataframe to BigQuery''' + client = bigquery.Client() + dataset_id = f'{client.project}.{DATASET_NAME}' + # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', f"{dataset_id}.{RAW_TABLE_NAME}") \ + .option('table', f"{dataset_id}.{table_name}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() - print("Table successfully written to BigQuery") + print(f"Table {table_name} successfully written to BigQuery") + + +def print_df(df, table_name): + '''Print 20 rows from dataframe and a random sample''' + # first 100 rows for smaller tables + df.show() + + # random sample for larger tables + # for small tables this will be empty + df.sample(True, 0.0001).show(n=500, truncate=False) + + print(f"Table {table_name} printed") def main(): @@ -91,12 +127,25 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if len(sys.argv) > 2: + if '--test' in sys.argv: upload = False print("Not uploading results to BigQuery") else: + create_bigquery_dataset() print("Results will be uploaded to BigQuery") + # Ingest External Datasets + + for table_name, data in EXTERNAL_DATASETS.items(): + print(f'Creating dataframe for {table_name}') + df = spark.createDataFrame(pd.read_csv(data["url"]), + schema=data["schema"]) + + if upload: + write_to_bigquery(df, table_name) + else: + print_df(df, table_name) + # Check if table exists try: df = spark.read.format('bigquery').option('table', TABLE).load() @@ -135,6 +184,7 @@ def main(): # Randomly set about 5% of the values in some columns to null for name in null_columns: + df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows @@ -144,9 +194,9 @@ def main(): df = df.union(dup_df) if upload: - write_to_bigquery(df) + write_to_bigquery(df, RAW_TABLE_NAME) else: - df.sample(True, 0.0001).show(n=500, truncate=False) + print_df(df, RAW_TABLE_NAME) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index d8def350c8e..baec10a79a5 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -13,6 +13,10 @@ BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' +TABLE_NAMES = [ + "new_york_citibike_trips", + "gas_prices", +] JOB_DETAILS = { # Job configuration 'placement': { 'cluster_name': CLUSTER_NAME @@ -97,9 +101,12 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\\|{value} *\\|", out) + return re.search(f"\\| *{value} *\\|", out) +def table_printed(table_name, out): + return re.search(f"Table {table_name} printed", out) + def test_setup(): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -118,6 +125,10 @@ def test_setup(): blob = get_blob_from_path(output_location) out = blob.download_as_string().decode("utf-8") + # check that tables were printed + for table_name in TABLE_NAMES: + assert table_printed(table_name, out) + # tripDuration assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) assert is_in_table("(\\d+(?:\\.\\d+)?) 
min", out) From 2ac38ab67d4fecaad7b6a6db05a11c1db8258b21 Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Tue, 7 Jul 2020 12:54:45 -0400 Subject: [PATCH 41/59] added dry-run flag --- data-science-onramp/data-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 06b8ce00689..bfa22087c39 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -127,7 +127,7 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if '--test' in sys.argv: + if '--dry-run' in sys.argv: upload = False print("Not uploading results to BigQuery") else: From 5ead6b239f955516f0b3e80242ef9278d1ff1779 Mon Sep 17 00:00:00 2001 From: Symmetries Date: Wed, 8 Jul 2020 12:29:46 -0400 Subject: [PATCH 42/59] dry-run flag --- data-science-onramp/data-ingestion/setup_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index baec10a79a5..a8fbe0d014d 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -25,7 +25,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, - "--test", + "--dry-run", ], "jar_file_uris": [ "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" From 3bb0f79c3727b2310208bed55ee9e8607f6e508b Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 9 Jul 2020 19:00:28 -0400 Subject: [PATCH 43/59] address some review comments --- data-science-onramp/data-ingestion/setup.py | 57 +++++++++---------- .../data-ingestion/setup_test.py | 17 ++++-- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index bfa22087c39..ecffd628b50 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,3 +1,12 @@ +"""Setup Dataproc job for Data Science Onramp Sample Application +This job ingests an external gas prices in NY dataset as well as +takes a New York Citibike dataset available on BigQuery and +"dirties" the dataset before uploading it back to BigQuery +It needs the following arguments +* the name of the Google Cloud Storage bucket to be used +* an optional --test flag to upload a subset of the dataset for testing +""" + import random import sys import pandas as pd @@ -37,11 +46,11 @@ # START MAKING DATA DIRTY def trip_duration(duration): '''Converts trip duration to other units''' - if duration is None: + if not duration: return None - seconds = str(duration) + " s" - minutes = str(float(duration) / 60) + " min" - hours = str(float(duration) / 3600) + " h" + seconds = f"{str(duration)} s" + minutes = f"{str(float(duration) / 60)} min" + hours = f"{str(float(duration) / 3600)} h" return random.choices([seconds, minutes, hours, str(random.randint(-1000, -1))], weights=[0.3, 0.3, 0.3, 0.1])[0] @@ -49,14 +58,14 @@ def trip_duration(duration): def station_name(name): '''Replaces '&' with '/' with a 50% chance''' - if name is None: + if not name: return None return random.choice([name, name.replace("&", "/")]) def user_type(user): '''Manipulates the user type string''' - if user is None: + if not user: return None return random.choice([user, user.upper(), user.lower(), "sub" if user == "Subscriber" else user, @@ -65,7 +74,7 @@ def 
user_type(user): def gender(s): '''Manipulates the gender string''' - if s is None: + if not s: return None return random.choice([s.upper(), s.lower(), s[0].upper() if len(s) > 0 else "", @@ -108,28 +117,16 @@ def write_to_bigquery(df, table_name): print(f"Table {table_name} successfully written to BigQuery") -def print_df(df, table_name): - '''Print 20 rows from dataframe and a random sample''' - # first 100 rows for smaller tables - df.show() - - # random sample for larger tables - # for small tables this will be empty - df.sample(True, 0.0001).show(n=500, truncate=False) - - print(f"Table {table_name} printed") - - def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() - upload = True # Whether to upload data to BigQuery + test = False # Whether we are running the job as a test - # Check whether or not results should be uploaded - if '--dry-run' in sys.argv: - upload = False - print("Not uploading results to BigQuery") + # Check whether or not the job is running as a test + if '--test' in sys.argv: + test = True + print("Subset of whole dataset will be uploaded to BigQuery") else: create_bigquery_dataset() print("Results will be uploaded to BigQuery") @@ -141,10 +138,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - if upload: - write_to_bigquery(df, table_name) - else: - print_df(df, table_name) + write_to_bigquery(df, table_name) # Check if table exists try: @@ -184,7 +178,6 @@ def main(): # Randomly set about 5% of the values in some columns to null for name in null_columns: - df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows @@ -193,10 +186,12 @@ def main(): # Create final dirty dataframe df = df.union(dup_df) - if upload: + if not test: write_to_bigquery(df, RAW_TABLE_NAME) else: - print_df(df, RAW_TABLE_NAME) + # df.sample(True, 0.0001).show(n=500, truncate=False) + # Upload 0.001% of the table (about 600 rows) + write_to_bigquery(df.sample(False, 0.00001)) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index a8fbe0d014d..a0ae6fb2814 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -1,3 +1,9 @@ +"""Test file for the setup job in the Data Science Onramp sample application +Creates a test Dataproc cluster and runs the job with a --test flag. +The job uploads a subset of the data to BigQuery. +Then, data is pulled from BigQuery and checks are made to see if the data is dirty. 
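A minimal sketch of the kind of dirtiness check this refers to (illustrative only; the project, dataset, table and column names here are assumptions, not necessarily the exact ones used in this test):

    import re
    from google.cloud import bigquery

    client = bigquery.Client()
    # Pull one column back out of BigQuery and assert that at least one row
    # looks "dirtied", e.g. a trip duration rewritten as "<n> min".
    rows = client.query(
        "SELECT tripduration FROM `my-project.my_dataset.RAW_DATA`"  # hypothetical table
    ).result()
    assert any(
        value and re.match(r"\d+(\.\d+)? min", value)
        for value in (row["tripduration"] for row in rows)
    )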
+""" + import os import re import uuid @@ -25,7 +31,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, - "--dry-run", + "--test", ], "jar_file_uris": [ "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" @@ -104,8 +110,9 @@ def is_in_table(value, out): return re.search(f"\\| *{value} *\\|", out) -def table_printed(table_name, out): - return re.search(f"Table {table_name} printed", out) +def table_uploaded(table_name, out): + return re.search(f"Table {table_name} successfully written to BigQuery", out) + def test_setup(): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -125,9 +132,9 @@ def test_setup(): blob = get_blob_from_path(output_location) out = blob.download_as_string().decode("utf-8") - # check that tables were printed + # Check if table upload success message was printed for table_name in TABLE_NAMES: - assert table_printed(table_name, out) + assert table_uploaded(table_name, out) # tripDuration assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) From c753ed723a5b7a7a53b12d81498493373db16d3e Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 14 Jul 2020 17:22:26 -0400 Subject: [PATCH 44/59] optimize setup test --- .../data-ingestion/requirements.txt | 3 +- data-science-onramp/data-ingestion/setup.py | 34 ++--- data-science-onramp/data-ingestion/setup.sh | 2 +- .../data-ingestion/setup_test.py | 116 ++++++++++-------- 4 files changed, 86 insertions(+), 69 deletions(-) mode change 100644 => 100755 data-science-onramp/data-ingestion/setup.sh diff --git a/data-science-onramp/data-ingestion/requirements.txt b/data-science-onramp/data-ingestion/requirements.txt index f435423c623..e0328e4aec9 100644 --- a/data-science-onramp/data-ingestion/requirements.txt +++ b/data-science-onramp/data-ingestion/requirements.txt @@ -3,4 +3,5 @@ google-auth==1.16.0 google-auth-httplib2==0.0.3 google-cloud==0.34.0 google-cloud-storage==1.28.1 -google-cloud-dataproc==0.8.0 \ No newline at end of file +google-cloud-dataproc==0.8.0 +google-cloud-bigquery==1.25.0 diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index ecffd628b50..bdad93720d2 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -4,14 +4,15 @@ "dirties" the dataset before uploading it back to BigQuery It needs the following arguments * the name of the Google Cloud Storage bucket to be used +* the name of the BigQuery dataset to be created * an optional --test flag to upload a subset of the dataset for testing """ import random import sys -import pandas as pd from google.cloud import bigquery +import pandas as pd from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession from pyspark.sql.functions import expr, UserDefinedFunction, when @@ -19,10 +20,10 @@ BUCKET_NAME = sys.argv[1] +DATASET_NAME = sys.argv[2] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -DATASET_NAME = "data_science_onramp" -RAW_TABLE_NAME = "new_york_citibike_trips" -EXTERNAL_DATASETS = { +CITIBIKE_TABLE_NAME = "new_york_citibike_trips" +EXTERNAL_TABLES = { "gas_prices": { "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", "schema": StructType([ @@ -111,7 +112,6 @@ def write_to_bigquery(df, table_name): # Saving the data to BigQuery df.write.format('bigquery') \ .option('table', f"{dataset_id}.{table_name}") \ - .option("temporaryGcsBucket", BUCKET_NAME) \ .save() print(f"Table {table_name} successfully written to BigQuery") @@ -121,20 +121,22 @@ def main(): # Create a SparkSession under the 
name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() - test = False # Whether we are running the job as a test + spark.conf.set('temporaryGcsBucket', BUCKET_NAME) + + create_bigquery_dataset() + + # Whether we are running the job as a test + test = False # Check whether or not the job is running as a test if '--test' in sys.argv: test = True - print("Subset of whole dataset will be uploaded to BigQuery") + print("A subset of the whole dataset will be uploaded to BigQuery") else: - create_bigquery_dataset() print("Results will be uploaded to BigQuery") # Ingest External Datasets - - for table_name, data in EXTERNAL_DATASETS.items(): - print(f'Creating dataframe for {table_name}') + for table_name, data in EXTERNAL_TABLES.items(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) @@ -143,6 +145,8 @@ def main(): # Check if table exists try: df = spark.read.format('bigquery').option('table', TABLE).load() + if test: + df = df.sample(False, 0.00001) except Py4JJavaError: print(f"{TABLE} does not exist. ") return @@ -186,12 +190,8 @@ def main(): # Create final dirty dataframe df = df.union(dup_df) - if not test: - write_to_bigquery(df, RAW_TABLE_NAME) - else: - # df.sample(True, 0.0001).show(n=500, truncate=False) - # Upload 0.001% of the table (about 600 rows) - write_to_bigquery(df.sample(False, 0.00001)) + print('Uploading citibike dataset...') + write_to_bigquery(df, CITIBIKE_TABLE_NAME) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh old mode 100644 new mode 100755 index f78c8cd120b..336f3da729d --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} + setup.py -- ${BUCKET_NAME} data_science_onramp_test_six --test diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index a0ae6fb2814..7b0f0bc6be5 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -10,13 +10,17 @@ from google.cloud import dataproc_v1 as dataproc from google.cloud import storage +from google.cloud import bigquery import pytest # Set global variables +ID = uuid.uuid4() + PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" -CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' -BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' +CLUSTER_NAME = f'setup-test-{ID}' +BUCKET_NAME = f'setup-test-{ID}' +DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ @@ -31,6 +35,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, + DATASET_NAME, "--test", ], "jar_file_uris": [ @@ -99,6 +104,17 @@ def setup_and_teardown_bucket(): bucket.delete(force=True) +@pytest.fixture(autouse=True) +def setup_and_teardown_bq_dataset(): + # Dataset is created by the client + bq_client = bigquery.Client(project=PROJECT) + + yield + + # Delete Dataset + bq_client.delete_dataset(DATASET_NAME, delete_contents=True) + + def get_blob_from_path(path): bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] bucket = storage.Client().get_bucket(bucket_name) @@ -106,8 +122,14 @@ def get_blob_from_path(path): return 
bucket.blob(output_location) -def is_in_table(value, out): - return re.search(f"\\| *{value} *\\|", out) +def get_dataproc_job_output(result): + output_location = result.driver_output_resource_uri + ".000000000" + blob = get_blob_from_path(output_location) + return blob.download_as_string().decode("utf-8") + + +# def is_in_table(value, out): +# return re.search(f"\\| *{value} *\\|", out) def table_uploaded(table_name, out): @@ -128,49 +150,43 @@ def test_setup(): result = response.result() # Get job output - output_location = result.driver_output_resource_uri + ".000000000" - blob = get_blob_from_path(output_location) - out = blob.download_as_string().decode("utf-8") - - # Check if table upload success message was printed - for table_name in TABLE_NAMES: - assert table_uploaded(table_name, out) - - # tripDuration - assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) - assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) - assert is_in_table("(\\d+(?:\\.\\d+)?) h", out) - - # station latitude & longitude - assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) - - # birth_year - assert is_in_table("19[0-9][0-9]", out) - assert is_in_table("20[0-9][0-9]", out) - - # gender - assert is_in_table("M", out) - assert is_in_table("m", out) - assert is_in_table("male", out) - assert is_in_table("MALE", out) - assert is_in_table("F", out) - assert is_in_table("f", out) - assert is_in_table("female", out) - assert is_in_table("FEMALE", out) - assert is_in_table("U", out) - assert is_in_table("u", out) - assert is_in_table("unknown", out) - assert is_in_table("UNKNOWN", out) - - # customer_plan - assert is_in_table("Subscriber", out) - assert is_in_table("subscriber", out) - assert is_in_table("SUBSCRIBER", out) - assert is_in_table("sub", out) - assert is_in_table("Customer", out) - assert is_in_table("customer", out) - assert is_in_table("CUSTOMER", out) - assert is_in_table("cust", out) - - # Missing data - assert is_in_table("null", out) + out = get_dataproc_job_output(result) + + # # tripDuration + # assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) + # assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) + # assert is_in_table("(\\d+(?:\\.\\d+)?) 
h", out) + + # # station latitude & longitude + # assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + + # # birth_year + # assert is_in_table("19[0-9][0-9]", out) + # assert is_in_table("20[0-9][0-9]", out) + + # # gender + # assert is_in_table("M", out) + # assert is_in_table("m", out) + # assert is_in_table("male", out) + # assert is_in_table("MALE", out) + # assert is_in_table("F", out) + # assert is_in_table("f", out) + # assert is_in_table("female", out) + # assert is_in_table("FEMALE", out) + # assert is_in_table("U", out) + # assert is_in_table("u", out) + # assert is_in_table("unknown", out) + # assert is_in_table("UNKNOWN", out) + + # # customer_plan + # assert is_in_table("Subscriber", out) + # assert is_in_table("subscriber", out) + # assert is_in_table("SUBSCRIBER", out) + # assert is_in_table("sub", out) + # assert is_in_table("Customer", out) + # assert is_in_table("customer", out) + # assert is_in_table("CUSTOMER", out) + # assert is_in_table("cust", out) + + # # Missing data + # assert is_in_table("null", out) From e0ffb41cc7494c22465b371b23085aa54b133147 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 15 Jul 2020 18:40:18 -0400 Subject: [PATCH 45/59] query data in test --- data-science-onramp/data-ingestion/setup.sh | 2 +- .../data-ingestion/setup_test.py | 107 +++++++++++------- 2 files changed, 67 insertions(+), 42 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index 336f3da729d..a69cda6a134 100755 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} data_science_onramp_test_six --test + setup.py -- ${BUCKET_NAME} data_science_onramp diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 7b0f0bc6be5..ad9e756f8d1 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -21,6 +21,7 @@ CLUSTER_NAME = f'setup-test-{ID}' BUCKET_NAME = f'setup-test-{ID}' DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") +CITIBIKE_TABLE = "new_york_citibike_trips" DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ @@ -123,6 +124,7 @@ def get_blob_from_path(path): def get_dataproc_job_output(result): + """Get the dataproc job logs in plain text""" output_location = result.driver_output_resource_uri + ".000000000" blob = get_blob_from_path(output_location) return blob.download_as_string().decode("utf-8") @@ -132,12 +134,50 @@ def get_dataproc_job_output(result): # return re.search(f"\\| *{value} *\\|", out) -def table_uploaded(table_name, out): - return re.search(f"Table {table_name} successfully written to BigQuery", out) +def assert_table_success_message(table_name, out): + """Check table upload success message was printed in job logs.""" + assert re.search(f"Table {table_name} successfully written to BigQuery", out), \ + f"Table {table_name} sucess message not printed in job logs" + + + +def assert_regexes_in_table(regex_dict, query_result): + """Assert that at least one row satisfies each regex. + The arguments are + - regex_dict: a dictionary where the keys are column + names and values are lists of regexes; + - query_result: the bigquery query result of the whole table. 
+ """ + + # Create dictionary with keys column names and values dictionaries + # The dictionaries stored have keys regexes and values booleans + # `regex_found_dict[column][regex]` hold the truth value of + # whether the there is at least one row of column with name `column` + # which satisfies the regular expression `regex`. + regex_found_dict = {} + for column, regexes in regex_dict.items(): + regex_found_dict[column] = {} + for regex in regexes: + regex_found_dict[column][regex] = False + + # Outer loop is over `query_result` since this is + # an iterator which can only iterate once + for row in query_result: + for column_name, regexes in regex_dict.items(): + for regex in regexes: + if row[column_name] and re.match(f"\\A{regex}\\Z", row[column_name]): + regex_found_dict[column_name][regex] = True + + # Assert that all entries in regex_found_dict are true + for column_name in regex_found_dict: + for regex, found in regex_found_dict[column_name].items(): + assert found, \ + f"No matches to regular expression \"{regex}\" found in column {column_name}" def test_setup(): - '''Tests setup.py by submitting it to a dataproc cluster''' + """Test setup.py by submitting it to a dataproc cluster + Check table upload success message as well as data in the table itself""" # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ @@ -151,42 +191,27 @@ def test_setup(): # Get job output out = get_dataproc_job_output(result) + + # Check logs to see if tables were uploaded + for table_name in TABLE_NAMES: + assert_table_success_message(table_name, out) + + # Query BigQuery Table + client = bigquery.Client() + query = f"SELECT * FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" + query_job = client.query(query) + + result = query_job.result() + + regex_dict = { + "tripduration": ["(\\d+(?:\\.\\d+)?) s", "(\\d+(?:\\.\\d+)?) min", "(\\d+(?:\\.\\d+)?) h"], + "gender": ['f', 'F', 'm', 'M', 'u', 'U', 'male', 'MALE', 'female', 'FEMALE', 'unknown', 'UNKNOWN'], + "start_station_latitude": ["[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\""], + "start_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "end_station_latitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "end_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "usertype": ["Subscriber", "subscriber", "SUBSCRIBER", "sub", "Customer", "customer", "CUSTOMER", "cust"], + } + + assert_regexes_in_table(regex_dict, result) - # # tripDuration - # assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) - # assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) - # assert is_in_table("(\\d+(?:\\.\\d+)?) 
h", out) - - # # station latitude & longitude - # assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) - - # # birth_year - # assert is_in_table("19[0-9][0-9]", out) - # assert is_in_table("20[0-9][0-9]", out) - - # # gender - # assert is_in_table("M", out) - # assert is_in_table("m", out) - # assert is_in_table("male", out) - # assert is_in_table("MALE", out) - # assert is_in_table("F", out) - # assert is_in_table("f", out) - # assert is_in_table("female", out) - # assert is_in_table("FEMALE", out) - # assert is_in_table("U", out) - # assert is_in_table("u", out) - # assert is_in_table("unknown", out) - # assert is_in_table("UNKNOWN", out) - - # # customer_plan - # assert is_in_table("Subscriber", out) - # assert is_in_table("subscriber", out) - # assert is_in_table("SUBSCRIBER", out) - # assert is_in_table("sub", out) - # assert is_in_table("Customer", out) - # assert is_in_table("customer", out) - # assert is_in_table("CUSTOMER", out) - # assert is_in_table("cust", out) - - # # Missing data - # assert is_in_table("null", out) From b0d334be5aabd792bc8f904fff05d6ef5b1ecfe8 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 17 Jul 2020 14:03:26 -0400 Subject: [PATCH 46/59] address live session comments --- .../data-ingestion/setup_test.py | 64 ++++++------------- 1 file changed, 18 insertions(+), 46 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index ad9e756f8d1..5ee77d5e1a3 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -8,9 +8,9 @@ import re import uuid +from google.cloud import bigquery from google.cloud import dataproc_v1 as dataproc from google.cloud import storage -from google.cloud import bigquery import pytest # Set global variables @@ -130,51 +130,12 @@ def get_dataproc_job_output(result): return blob.download_as_string().decode("utf-8") -# def is_in_table(value, out): -# return re.search(f"\\| *{value} *\\|", out) - - def assert_table_success_message(table_name, out): """Check table upload success message was printed in job logs.""" assert re.search(f"Table {table_name} successfully written to BigQuery", out), \ f"Table {table_name} sucess message not printed in job logs" - -def assert_regexes_in_table(regex_dict, query_result): - """Assert that at least one row satisfies each regex. - The arguments are - - regex_dict: a dictionary where the keys are column - names and values are lists of regexes; - - query_result: the bigquery query result of the whole table. - """ - - # Create dictionary with keys column names and values dictionaries - # The dictionaries stored have keys regexes and values booleans - # `regex_found_dict[column][regex]` hold the truth value of - # whether the there is at least one row of column with name `column` - # which satisfies the regular expression `regex`. 
- regex_found_dict = {} - for column, regexes in regex_dict.items(): - regex_found_dict[column] = {} - for regex in regexes: - regex_found_dict[column][regex] = False - - # Outer loop is over `query_result` since this is - # an iterator which can only iterate once - for row in query_result: - for column_name, regexes in regex_dict.items(): - for regex in regexes: - if row[column_name] and re.match(f"\\A{regex}\\Z", row[column_name]): - regex_found_dict[column_name][regex] = True - - # Assert that all entries in regex_found_dict are true - for column_name in regex_found_dict: - for regex, found in regex_found_dict[column_name].items(): - assert found, \ - f"No matches to regular expression \"{regex}\" found in column {column_name}" - - def test_setup(): """Test setup.py by submitting it to a dataproc cluster Check table upload success message as well as data in the table itself""" @@ -191,17 +152,13 @@ def test_setup(): # Get job output out = get_dataproc_job_output(result) - + # Check logs to see if tables were uploaded for table_name in TABLE_NAMES: assert_table_success_message(table_name, out) # Query BigQuery Table client = bigquery.Client() - query = f"SELECT * FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" - query_job = client.query(query) - - result = query_job.result() regex_dict = { "tripduration": ["(\\d+(?:\\.\\d+)?) s", "(\\d+(?:\\.\\d+)?) min", "(\\d+(?:\\.\\d+)?) h"], @@ -213,5 +170,20 @@ def test_setup(): "usertype": ["Subscriber", "subscriber", "SUBSCRIBER", "sub", "Customer", "customer", "CUSTOMER", "cust"], } - assert_regexes_in_table(regex_dict, result) + for column_name, regexes in regex_dict.items(): + query = f"SELECT {column_name} FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" + query_job = client.query(query) + result = query_job.result() + + rows = [] + for row in result: + rows.append(row[column_name]) + + for regex in regexes: + found = False + for row in rows: + if row and re.match(f"\\A{regex}\\Z", row): + found = True + assert found, \ + f"No matches to regular expression \"{regex}\" found in column {column_name}" From 33afd6c9677f2865e222b72e0d9ab14408b72b27 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 20 Jul 2020 11:36:25 -0400 Subject: [PATCH 47/59] add break statement --- data-science-onramp/data-ingestion/setup_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 5ee77d5e1a3..978a5376480 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -185,5 +185,6 @@ def test_setup(): for row in rows: if row and re.match(f"\\A{regex}\\Z", row): found = True + break assert found, \ f"No matches to regular expression \"{regex}\" found in column {column_name}" From 9acb94ef8c00a5a9fd7a61fb702422197761329c Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 23 Jul 2020 16:48:55 -0400 Subject: [PATCH 48/59] revert breaking table and dataset name change --- data-science-onramp/data-ingestion/setup.py | 2 +- data-science-onramp/data-ingestion/setup.sh | 2 +- data-science-onramp/data-ingestion/setup_test.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index bdad93720d2..a1f13dfa5ef 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -22,7 +22,7 @@ BUCKET_NAME = sys.argv[1] DATASET_NAME = sys.argv[2] TABLE = 
"bigquery-public-data.new_york_citibike.citibike_trips" -CITIBIKE_TABLE_NAME = "new_york_citibike_trips" +CITIBIKE_TABLE_NAME = "RAW_DATA" EXTERNAL_TABLES = { "gas_prices": { "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index a69cda6a134..2c4773f7272 100755 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} data_science_onramp + setup.py -- ${BUCKET_NAME} new_york_citibike_trips diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 978a5376480..b1395af9793 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -21,11 +21,11 @@ CLUSTER_NAME = f'setup-test-{ID}' BUCKET_NAME = f'setup-test-{ID}' DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") -CITIBIKE_TABLE = "new_york_citibike_trips" +CITIBIKE_TABLE = "RAW_DATA" DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ - "new_york_citibike_trips", + CITIBIKE_TABLE, "gas_prices", ] JOB_DETAILS = { # Job configuration From c97d45497dcd8ee32c96167c3ba0a6e401c9f841 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 4 Aug 2020 19:57:05 -0400 Subject: [PATCH 49/59] fix datetime formatting in setup job --- data-science-onramp/data-ingestion/setup.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index a1f13dfa5ef..8205f551c51 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -15,7 +15,7 @@ import pandas as pd from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import expr, UserDefinedFunction, when +from pyspark.sql.functions import expr, UserDefinedFunction, when, date_format from pyspark.sql.types import FloatType, StringType, StructField, StructType @@ -101,7 +101,7 @@ def create_bigquery_dataset(): dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" - dataset = client.create_dataset(dataset) + #dataset = client.create_dataset(dataset) def write_to_bigquery(df, table_name): @@ -140,7 +140,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - write_to_bigquery(df, table_name) + #write_to_bigquery(df, table_name) # Check if table exists try: @@ -180,6 +180,10 @@ def main(): for name, udf in udf_map.items(): df = df.withColumn(name, UserDefinedFunction(*udf)(name)) + # Format the datetimes correctly + for name in ['starttime', 'stoptime']: + df = df.withColumn(name, date_format(name, "yyyy-MM-dd'T'HH:mm:ss")) + # Randomly set about 5% of the values in some columns to null for name in null_columns: df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) From 41406f9b44ad3e9d292d6f540ec754794d94c147 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 6 Aug 2020 14:03:13 -0400 Subject: [PATCH 50/59] uncomment commented dataset creation and writing --- data-science-onramp/data-ingestion/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 8205f551c51..352d8b029b4 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -101,7 +101,7 @@ def create_bigquery_dataset(): dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" - #dataset = client.create_dataset(dataset) + dataset = client.create_dataset(dataset) def write_to_bigquery(df, table_name): @@ -140,7 +140,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - #write_to_bigquery(df, table_name) + write_to_bigquery(df, table_name) # Check if table exists try: From ca3c592b36192527757618d374f5037660d99a22 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 6 Aug 2020 23:34:51 -0400 Subject: [PATCH 51/59] fix import order --- data-science-onramp/data-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 352d8b029b4..135026e6e3b 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -15,7 +15,7 @@ import pandas as pd from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import expr, UserDefinedFunction, when, date_format +from pyspark.sql.functions import date_format, expr, UserDefinedFunction, when from pyspark.sql.types import FloatType, StringType, StructField, StructType From cf3aae393fbebb27153de556c0e3509dcc8f010a Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 6 Aug 2020 23:36:17 -0400 Subject: [PATCH 52/59] use GOOGLE_CLOUD_PROJECT environment variable --- data-science-onramp/data-ingestion/setup_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index b1395af9793..298576f6039 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -16,7 +16,7 @@ # Set global variables ID = uuid.uuid4() -PROJECT = os.environ['GCLOUD_PROJECT'] +PROJECT = os.environ['GOOGLE_CLOUD_PROJECT'] REGION = "us-central1" CLUSTER_NAME = f'setup-test-{ID}' BUCKET_NAME = f'setup-test-{ID}' From dc11440a5f4efcf51831f66e3f41a6f1d84052be Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 12 Aug 2020 13:13:52 -0400 Subject: [PATCH 53/59] blacken and add f-strings to dms notation --- data-science-onramp/data-ingestion/setup.py | 136 +++++++++--------- .../data-ingestion/setup_test.py | 128 +++++++++-------- 2 files changed, 142 insertions(+), 122 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 135026e6e3b..cc5245dfdcf 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -26,93 +26,101 @@ EXTERNAL_TABLES = { "gas_prices": { "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", - "schema": StructType([ - StructField("Date", StringType(), True), - StructField("New_York_State_Average_USD_per_Gal", - FloatType(), True), - StructField("Albany_Average_USD_per_Gal", FloatType(), True), - StructField("Blinghamton_Average_USD_per_Gal", FloatType(), True), - StructField("Buffalo_Average_USD_per_Gal", FloatType(), True), - StructField("Nassau_Average_USD_per_Gal", FloatType(), True), - 
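# Illustrative sketch (not part of the original patches): how an EXTERNAL_TABLES
# entry like the one above is consumed — pandas fetches the CSV over HTTP and
# Spark applies the explicit StructType so column names and types do not depend
# on inference. The URL and the two columns here are placeholders.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, StringType, StructField, StructType

spark = SparkSession.builder.appName("external-table-demo").getOrCreate()

example_schema = StructType(
    [
        StructField("Date", StringType(), True),
        StructField("Average_USD_per_Gal", FloatType(), True),
    ]
)
pdf = pd.read_csv("https://example.com/prices.csv")  # placeholder URL
df = spark.createDataFrame(pdf, schema=example_schema)
df.printSchema()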
StructField("New_York_City_Average_USD_per_Gal", - FloatType(), True), - StructField("Rochester_Average_USD_per_Gal", FloatType(), True), - StructField("Syracuse_Average_USD_per_Gal", FloatType(), True), - StructField("Utica_Average_USD_per_Gal", FloatType(), True), - ]), + "schema": StructType( + [ + StructField("Date", StringType(), True), + StructField("New_York_State_Average_USD_per_Gal", FloatType(), True), + StructField("Albany_Average_USD_per_Gal", FloatType(), True), + StructField("Blinghamton_Average_USD_per_Gal", FloatType(), True), + StructField("Buffalo_Average_USD_per_Gal", FloatType(), True), + StructField("Nassau_Average_USD_per_Gal", FloatType(), True), + StructField("New_York_City_Average_USD_per_Gal", FloatType(), True), + StructField("Rochester_Average_USD_per_Gal", FloatType(), True), + StructField("Syracuse_Average_USD_per_Gal", FloatType(), True), + StructField("Utica_Average_USD_per_Gal", FloatType(), True), + ] + ), }, } # START MAKING DATA DIRTY def trip_duration(duration): - '''Converts trip duration to other units''' + """Converts trip duration to other units""" if not duration: return None seconds = f"{str(duration)} s" minutes = f"{str(float(duration) / 60)} min" hours = f"{str(float(duration) / 3600)} h" - return random.choices([seconds, minutes, hours, - str(random.randint(-1000, -1))], - weights=[0.3, 0.3, 0.3, 0.1])[0] + return random.choices( + [seconds, minutes, hours, str(random.randint(-1000, -1))], + weights=[0.3, 0.3, 0.3, 0.1], + )[0] def station_name(name): - '''Replaces '&' with '/' with a 50% chance''' + """Replaces '&' with '/' with a 50% chance""" if not name: return None return random.choice([name, name.replace("&", "/")]) def user_type(user): - '''Manipulates the user type string''' + """Manipulates the user type string""" if not user: return None - return random.choice([user, user.upper(), user.lower(), - "sub" if user == "Subscriber" else user, - "cust" if user == "Customer" else user]) + return random.choice( + [ + user, + user.upper(), + user.lower(), + "sub" if user == "Subscriber" else user, + "cust" if user == "Customer" else user, + ] + ) def gender(s): - '''Manipulates the gender string''' + """Manipulates the gender string""" if not s: return None - return random.choice([s.upper(), s.lower(), - s[0].upper() if len(s) > 0 else "", - s[0].lower() if len(s) > 0 else ""]) + return random.choice( + [ + s.upper(), + s.lower(), + s[0].upper() if len(s) > 0 else "", + s[0].lower() if len(s) > 0 else "", + ] + ) def convert_angle(angle): - '''Converts long and lat to DMS notation''' + """Converts long and lat to DMS notation""" if angle is None: return None degrees = int(angle) minutes = int((angle - degrees) * 60) - seconds = int((angle - degrees - minutes/60) * 3600) - new_angle = str(degrees) + "\u00B0" + \ - str(minutes) + "'" + str(seconds) + '"' - return random.choices([str(angle), new_angle], - weights=[0.55, 0.45])[0] + seconds = int((angle - degrees - minutes / 60) * 3600) + new_angle = f"{degrees}\u00B0{minutes}'{seconds}\"" + return random.choices([str(angle), new_angle], weights=[0.55, 0.45])[0] def create_bigquery_dataset(): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f'{client.project}.{DATASET_NAME}' + dataset_id = f"{client.project}.{DATASET_NAME}" dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) def write_to_bigquery(df, table_name): - '''Write a dataframe to BigQuery''' + """Write a dataframe to BigQuery""" client = bigquery.Client() - dataset_id 
= f'{client.project}.{DATASET_NAME}' + dataset_id = f"{client.project}.{DATASET_NAME}" # Saving the data to BigQuery - df.write.format('bigquery') \ - .option('table', f"{dataset_id}.{table_name}") \ - .save() + df.write.format("bigquery").option("table", f"{dataset_id}.{table_name}").save() print(f"Table {table_name} successfully written to BigQuery") @@ -121,7 +129,7 @@ def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() - spark.conf.set('temporaryGcsBucket', BUCKET_NAME) + spark.conf.set("temporaryGcsBucket", BUCKET_NAME) create_bigquery_dataset() @@ -129,7 +137,7 @@ def main(): test = False # Check whether or not the job is running as a test - if '--test' in sys.argv: + if "--test" in sys.argv: test = True print("A subset of the whole dataset will be uploaded to BigQuery") else: @@ -137,14 +145,14 @@ def main(): # Ingest External Datasets for table_name, data in EXTERNAL_TABLES.items(): - df = spark.createDataFrame(pd.read_csv(data["url"]), - schema=data["schema"]) + df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) write_to_bigquery(df, table_name) # Check if table exists try: - df = spark.read.format('bigquery').option('table', TABLE).load() + df = spark.read.format("bigquery").option("table", TABLE).load() + # if we are running a test, perform computations on a subset of the data if test: df = df.sample(False, 0.00001) except Py4JJavaError: @@ -152,28 +160,28 @@ def main(): return # Declare dictionary with keys column names and values user defined - # functions and return types + # functions and return types udf_map = { - 'tripduration': (trip_duration, StringType()), - 'start_station_name': (station_name, StringType()), - 'start_station_latitude': (convert_angle, StringType()), - 'start_station_longitude': (convert_angle, StringType()), - 'end_station_name': (station_name, StringType()), - 'end_station_latitude': (convert_angle, StringType()), - 'end_station_longitude': (convert_angle, StringType()), - 'usertype': (user_type, StringType()), - 'gender': (gender, StringType()), + "tripduration": (trip_duration, StringType()), + "start_station_name": (station_name, StringType()), + "start_station_latitude": (convert_angle, StringType()), + "start_station_longitude": (convert_angle, StringType()), + "end_station_name": (station_name, StringType()), + "end_station_latitude": (convert_angle, StringType()), + "end_station_longitude": (convert_angle, StringType()), + "usertype": (user_type, StringType()), + "gender": (gender, StringType()), } # Declare which columns to set some values to null randomly null_columns = [ - 'tripduration', - 'starttime', - 'stoptime', - 'start_station_latitude', - 'start_station_longitude', - 'end_station_latitude', - 'end_station_longitude', + "tripduration", + "starttime", + "stoptime", + "start_station_latitude", + "start_station_longitude", + "end_station_latitude", + "end_station_longitude", ] # Dirty the columns @@ -181,7 +189,7 @@ def main(): df = df.withColumn(name, UserDefinedFunction(*udf)(name)) # Format the datetimes correctly - for name in ['starttime', 'stoptime']: + for name in ["starttime", "stoptime"]: df = df.withColumn(name, date_format(name, "yyyy-MM-dd'T'HH:mm:ss")) # Randomly set about 5% of the values in some columns to null @@ -194,9 +202,9 @@ def main(): # Create final dirty dataframe df = df.union(dup_df) - print('Uploading citibike dataset...') + print("Uploading citibike dataset...") write_to_bigquery(df, 
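# Illustrative sketch (not part of the original patches): the two pieces the
# BigQuery write above relies on — a temporary GCS staging bucket for the
# spark-bigquery connector and a fully qualified "project.dataset.table" target.
# Bucket, project and table names are placeholders; the connector jar must be
# supplied at submit time, as setup.sh does.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("bq-write-demo").getOrCreate()
spark.conf.set("temporaryGcsBucket", "my-staging-bucket")  # placeholder bucket

df = spark.createDataFrame([(1, "a")], ["id", "value"])
(
    df.write.format("bigquery")
    .option("table", "my-project.my_dataset.my_table")  # placeholder target
    .save()
)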
CITIBIKE_TABLE_NAME) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 298576f6039..20dd0d9ad85 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -16,65 +16,47 @@ # Set global variables ID = uuid.uuid4() -PROJECT = os.environ['GOOGLE_CLOUD_PROJECT'] +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] REGION = "us-central1" -CLUSTER_NAME = f'setup-test-{ID}' -BUCKET_NAME = f'setup-test-{ID}' -DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") +CLUSTER_NAME = f"setup-test-{ID}" +BUCKET_NAME = f"setup-test-{ID}" +DATASET_NAME = f"setup-test-{ID}".replace("-", "_") CITIBIKE_TABLE = "RAW_DATA" DESTINATION_BLOB_NAME = "setup.py" -JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' +JOB_FILE_NAME = f"gs://{BUCKET_NAME}/setup.py" TABLE_NAMES = [ CITIBIKE_TABLE, "gas_prices", ] JOB_DETAILS = { # Job configuration - 'placement': { - 'cluster_name': CLUSTER_NAME - }, - 'pyspark_job': { - 'main_python_file_uri': JOB_FILE_NAME, - 'args': [ - BUCKET_NAME, - DATASET_NAME, - "--test", - ], - "jar_file_uris": [ - "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" - ], + "placement": {"cluster_name": CLUSTER_NAME}, + "pyspark_job": { + "main_python_file_uri": JOB_FILE_NAME, + "args": [BUCKET_NAME, DATASET_NAME, "--test",], + "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], }, } CLUSTER_DATA = { # Create cluster configuration - 'project_id': PROJECT, - 'cluster_name': CLUSTER_NAME, - 'config': { - 'gce_cluster_config': { - 'zone_uri': '', - }, - 'master_config': { - 'num_instances': 1, - 'machine_type_uri': 'n1-standard-8' - }, - 'worker_config': { - 'num_instances': 6, - 'machine_type_uri': 'n1-standard-8' - }, + "project_id": PROJECT, + "cluster_name": CLUSTER_NAME, + "config": { + "gce_cluster_config": {"zone_uri": "",}, + "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-8"}, + "worker_config": {"num_instances": 6, "machine_type_uri": "n1-standard-8"}, "software_config": { "image_version": "1.5.4-debian10", - "optional_components": [ - "ANACONDA" - ], - } - } + "optional_components": ["ANACONDA"], + }, + }, } @pytest.fixture(autouse=True) def setup_and_teardown_cluster(): # Create cluster using cluster client - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' - }) + cluster_client = dataproc.ClusterControllerClient( + client_options={"api_endpoint": f"{REGION}-dataproc.googleapis.com:443"} + ) operation = cluster_client.create_cluster(PROJECT, REGION, CLUSTER_DATA) # Wait for cluster to provision @@ -83,8 +65,7 @@ def setup_and_teardown_cluster(): yield # Delete cluster - operation = cluster_client.delete_cluster(PROJECT, REGION, - CLUSTER_NAME) + operation = cluster_client.delete_cluster(PROJECT, REGION, CLUSTER_NAME) operation.result() @@ -132,8 +113,9 @@ def get_dataproc_job_output(result): def assert_table_success_message(table_name, out): """Check table upload success message was printed in job logs.""" - assert re.search(f"Table {table_name} successfully written to BigQuery", out), \ - f"Table {table_name} sucess message not printed in job logs" + assert re.search( + f"Table {table_name} successfully written to BigQuery", out + ), f"Table {table_name} sucess message not printed in job logs" def test_setup(): @@ -141,11 +123,12 @@ def test_setup(): Check table upload success message 
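# Illustrative sketch (not part of the original patches): the shape of the
# autouse fixtures above — each one provisions a resource, yields to the test,
# then tears the resource down so a failing test still cleans up. Shown here for
# the GCS bucket only; the file name is a placeholder.
import uuid

from google.cloud import storage
import pytest

BUCKET_NAME = f"setup-test-{uuid.uuid4()}"


@pytest.fixture(autouse=True)
def setup_and_teardown_bucket():
    storage_client = storage.Client()
    bucket = storage_client.create_bucket(BUCKET_NAME)
    bucket.blob("setup.py").upload_from_filename("setup.py")

    yield  # the test body runs here

    # force=True also deletes any objects the job left behind
    storage_client.get_bucket(BUCKET_NAME).delete(force=True)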
as well as data in the table itself""" # Submit job to dataproc cluster - job_client = dataproc.JobControllerClient(client_options={ - 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' - }) - response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, - job=JOB_DETAILS) + job_client = dataproc.JobControllerClient( + client_options={"api_endpoint": f"{REGION}-dataproc.googleapis.com:443"} + ) + response = job_client.submit_job_as_operation( + project_id=PROJECT, region=REGION, job=JOB_DETAILS + ) # Wait for job to complete result = response.result() @@ -160,14 +143,42 @@ def test_setup(): # Query BigQuery Table client = bigquery.Client() + dms_regex = "-?[0-9]+\u00B0-?[0-9]+'-?[0-9]+\"" + regex_dict = { - "tripduration": ["(\\d+(?:\\.\\d+)?) s", "(\\d+(?:\\.\\d+)?) min", "(\\d+(?:\\.\\d+)?) h"], - "gender": ['f', 'F', 'm', 'M', 'u', 'U', 'male', 'MALE', 'female', 'FEMALE', 'unknown', 'UNKNOWN'], - "start_station_latitude": ["[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\""], - "start_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], - "end_station_latitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], - "end_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], - "usertype": ["Subscriber", "subscriber", "SUBSCRIBER", "sub", "Customer", "customer", "CUSTOMER", "cust"], + "tripduration": [ + "(\\d+(?:\\.\\d+)?) s", + "(\\d+(?:\\.\\d+)?) min", + "(\\d+(?:\\.\\d+)?) h", + ], + "gender": [ + "f", + "F", + "m", + "M", + "u", + "U", + "male", + "MALE", + "female", + "FEMALE", + "unknown", + "UNKNOWN", + ], + "start_station_latitude": [dms_regex], + "start_station_longitude": [dms_regex], + "end_station_latitude": [dms_regex], + "end_station_longitude": [dms_regex], + "usertype": [ + "Subscriber", + "subscriber", + "SUBSCRIBER", + "sub", + "Customer", + "customer", + "CUSTOMER", + "cust", + ], } for column_name, regexes in regex_dict.items(): @@ -186,5 +197,6 @@ def test_setup(): if row and re.match(f"\\A{regex}\\Z", row): found = True break - assert found, \ - f"No matches to regular expression \"{regex}\" found in column {column_name}" + assert ( + found + ), f'No matches to regular expression "{regex}" found in column {column_name}' From d35b85506f0754ac3dcbf6b686f44a75cb28bb22 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 12 Aug 2020 17:28:21 -0400 Subject: [PATCH 54/59] change test variables names to match data cleaning --- .../data-ingestion/setup_test.py | 77 ++++++++++--------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 20dd0d9ad85..039b22a88d2 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -1,5 +1,4 @@ -"""Test file for the setup job in the Data Science Onramp sample application -Creates a test Dataproc cluster and runs the job with a --test flag. +"""Test file for the setup job in the Data Science Onramp sample application Creates a test Dataproc cluster and runs the job with a --test flag. The job uploads a subset of the data to BigQuery. Then, data is pulled from BigQuery and checks are made to see if the data is dirty. 
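# Illustrative sketch (not part of the original patches): the submit-and-read-
# driver-output flow used by test_setup(), condensed into one place. Project,
# region, cluster and file names are placeholders.
from google.cloud import dataproc_v1 as dataproc
from google.cloud import storage

region = "us-central1"
job_client = dataproc.JobControllerClient(
    client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)
job_config = {
    "placement": {"cluster_name": "my-cluster"},
    "pyspark_job": {"main_python_file_uri": "gs://my-bucket/setup.py"},
}
operation = job_client.submit_job_as_operation(
    project_id="my-project", region=region, job=job_config
)
job = operation.result()  # blocks until the Dataproc job finishes

# The driver log lives in GCS; the first chunk carries the ".000000000" suffix
bucket_name, blob_path = job.driver_output_resource_uri[len("gs://"):].split("/", 1)
out = storage.Client().bucket(bucket_name).blob(blob_path + ".000000000")
print(out.download_as_string().decode("utf-8"))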
""" @@ -13,51 +12,55 @@ from google.cloud import storage import pytest -# Set global variables -ID = uuid.uuid4() - -PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] -REGION = "us-central1" -CLUSTER_NAME = f"setup-test-{ID}" -BUCKET_NAME = f"setup-test-{ID}" -DATASET_NAME = f"setup-test-{ID}".replace("-", "_") -CITIBIKE_TABLE = "RAW_DATA" -DESTINATION_BLOB_NAME = "setup.py" -JOB_FILE_NAME = f"gs://{BUCKET_NAME}/setup.py" -TABLE_NAMES = [ - CITIBIKE_TABLE, +# GCP Project +PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"] +TEST_ID = uuid.uuid4() + +# Google Cloud Storage constants +BUCKET_NAME = f"setup-test-{TEST_ID}" +BUCKET_BLOB = "setup.py" + +BQ_DATASET = f"setup-test-{TEST_ID}".replace("-", "_") +BQ_CITIBIKE_TABLE = "RAW_DATA" +BQ_TABLES = [ + BQ_CITIBIKE_TABLE, "gas_prices", ] -JOB_DETAILS = { # Job configuration - "placement": {"cluster_name": CLUSTER_NAME}, - "pyspark_job": { - "main_python_file_uri": JOB_FILE_NAME, - "args": [BUCKET_NAME, DATASET_NAME, "--test",], - "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], - }, -} -CLUSTER_DATA = { # Create cluster configuration - "project_id": PROJECT, - "cluster_name": CLUSTER_NAME, + +# Dataproc constants +DATAPROC_CLUSTER = f"setup-test-{TEST_ID}" +CLUSTER_REGION = "us-central1" +CLUSTER_IMAGE = "1.5.4-debian10" +CLUSTER_CONFIG = { # Dataproc cluster configuration + "project_id": PROJECT_ID, + "cluster_name": DATAPROC_CLUSTER, "config": { "gce_cluster_config": {"zone_uri": "",}, "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-8"}, "worker_config": {"num_instances": 6, "machine_type_uri": "n1-standard-8"}, "software_config": { - "image_version": "1.5.4-debian10", + "image_version": CLUSTER_IMAGE, "optional_components": ["ANACONDA"], }, }, } +DATAPROC_JOB = { # Dataproc job configuration + "placement": {"cluster_name": DATAPROC_CLUSTER}, + "pyspark_job": { + "main_python_file_uri": f"gs://{BUCKET_NAME}/{BUCKET_BLOB}", + "args": [BUCKET_NAME, BQ_DATASET, "--test",], + "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], + }, +} @pytest.fixture(autouse=True) def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient( - client_options={"api_endpoint": f"{REGION}-dataproc.googleapis.com:443"} + client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) - operation = cluster_client.create_cluster(PROJECT, REGION, CLUSTER_DATA) + operation = cluster_client.create_cluster(PROJECT_ID, CLUSTER_REGION, CLUSTER_CONFIG) # Wait for cluster to provision operation.result() @@ -65,7 +68,7 @@ def setup_and_teardown_cluster(): yield # Delete cluster - operation = cluster_client.delete_cluster(PROJECT, REGION, CLUSTER_NAME) + operation = cluster_client.delete_cluster(PROJECT_ID, CLUSTER_REGION, DATAPROC_CLUSTER) operation.result() @@ -76,7 +79,7 @@ def setup_and_teardown_bucket(): bucket = storage_client.create_bucket(BUCKET_NAME) # Upload file - blob = bucket.blob(DESTINATION_BLOB_NAME) + blob = bucket.blob(BUCKET_BLOB) blob.upload_from_filename("setup.py") yield @@ -89,12 +92,12 @@ def setup_and_teardown_bucket(): @pytest.fixture(autouse=True) def setup_and_teardown_bq_dataset(): # Dataset is created by the client - bq_client = bigquery.Client(project=PROJECT) + bq_client = bigquery.Client(project=PROJECT_ID) yield # Delete Dataset - bq_client.delete_dataset(DATASET_NAME, delete_contents=True) + bq_client.delete_dataset(BQ_DATASET, delete_contents=True) def get_blob_from_path(path): @@ -124,10 +127,10 
@@ def test_setup(): # Submit job to dataproc cluster job_client = dataproc.JobControllerClient( - client_options={"api_endpoint": f"{REGION}-dataproc.googleapis.com:443"} + client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) response = job_client.submit_job_as_operation( - project_id=PROJECT, region=REGION, job=JOB_DETAILS + project_id=PROJECT_ID, region=CLUSTER_REGION, job=JOB_DETAILS ) # Wait for job to complete @@ -137,7 +140,7 @@ def test_setup(): out = get_dataproc_job_output(result) # Check logs to see if tables were uploaded - for table_name in TABLE_NAMES: + for table_name in BQ_TABLES: assert_table_success_message(table_name, out) # Query BigQuery Table @@ -182,7 +185,7 @@ def test_setup(): } for column_name, regexes in regex_dict.items(): - query = f"SELECT {column_name} FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" + query = f"SELECT {column_name} FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_CITIBIKE_TABLE}`" query_job = client.query(query) result = query_job.result() From 6105f79f052f6738d2416eec981f18c933191db0 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 12 Aug 2020 17:30:16 -0400 Subject: [PATCH 55/59] blacken setup_test file --- data-science-onramp/data-ingestion/setup_test.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 039b22a88d2..fd5f3ce75fc 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -44,7 +44,7 @@ }, }, } -DATAPROC_JOB = { # Dataproc job configuration +DATAPROC_JOB = { # Dataproc job configuration "placement": {"cluster_name": DATAPROC_CLUSTER}, "pyspark_job": { "main_python_file_uri": f"gs://{BUCKET_NAME}/{BUCKET_BLOB}", @@ -60,7 +60,9 @@ def setup_and_teardown_cluster(): cluster_client = dataproc.ClusterControllerClient( client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) - operation = cluster_client.create_cluster(PROJECT_ID, CLUSTER_REGION, CLUSTER_CONFIG) + operation = cluster_client.create_cluster( + PROJECT_ID, CLUSTER_REGION, CLUSTER_CONFIG + ) # Wait for cluster to provision operation.result() @@ -68,7 +70,9 @@ def setup_and_teardown_cluster(): yield # Delete cluster - operation = cluster_client.delete_cluster(PROJECT_ID, CLUSTER_REGION, DATAPROC_CLUSTER) + operation = cluster_client.delete_cluster( + PROJECT_ID, CLUSTER_REGION, DATAPROC_CLUSTER + ) operation.result() @@ -185,7 +189,9 @@ def test_setup(): } for column_name, regexes in regex_dict.items(): - query = f"SELECT {column_name} FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_CITIBIKE_TABLE}`" + query = ( + f"SELECT {column_name} FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_CITIBIKE_TABLE}`" + ) query_job = client.query(query) result = query_job.result() From 35ec8cb10582077fca453e03a8e194f1a4e53655 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 12 Aug 2020 17:59:11 -0400 Subject: [PATCH 56/59] fix unchanged variable name --- data-science-onramp/data-ingestion/setup_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index fd5f3ce75fc..8e7ab8a40d7 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -134,7 +134,7 @@ def test_setup(): client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) response = 
job_client.submit_job_as_operation( - project_id=PROJECT_ID, region=CLUSTER_REGION, job=JOB_DETAILS + project_id=PROJECT_ID, region=CLUSTER_REGION, job=DATAPROC_JOB ) # Wait for job to complete From 9561f35ee29130cd9df9c0438e3e0657a7b23c5a Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 13 Aug 2020 11:31:10 -0400 Subject: [PATCH 57/59] WIP: address PR comments --- .../data-ingestion/requirements-test.txt | 2 +- .../data-ingestion/requirements.txt | 9 +++---- data-science-onramp/data-ingestion/setup.py | 25 ++++++++++--------- .../data-ingestion/setup_test.py | 11 ++++---- 4 files changed, 24 insertions(+), 23 deletions(-) diff --git a/data-science-onramp/data-ingestion/requirements-test.txt b/data-science-onramp/data-ingestion/requirements-test.txt index 781d4326c94..2018c08113a 100644 --- a/data-science-onramp/data-ingestion/requirements-test.txt +++ b/data-science-onramp/data-ingestion/requirements-test.txt @@ -1 +1 @@ -pytest==5.3.2 +pytest==6.0.0 diff --git a/data-science-onramp/data-ingestion/requirements.txt b/data-science-onramp/data-ingestion/requirements.txt index e0328e4aec9..b5edbdf1ad7 100644 --- a/data-science-onramp/data-ingestion/requirements.txt +++ b/data-science-onramp/data-ingestion/requirements.txt @@ -1,7 +1,6 @@ -grpcio==1.29.0 -google-auth==1.16.0 -google-auth-httplib2==0.0.3 -google-cloud==0.34.0 +#grpcio==1.29.0 +#google-auth==1.16.0 +#google-auth-httplib2==0.0.3 google-cloud-storage==1.28.1 -google-cloud-dataproc==0.8.0 +google-cloud-dataproc==2.0.0 google-cloud-bigquery==1.25.0 diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index cc5245dfdcf..6921947ddca 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -18,9 +18,6 @@ from pyspark.sql.functions import date_format, expr, UserDefinedFunction, when from pyspark.sql.types import FloatType, StringType, StructField, StructType - -BUCKET_NAME = sys.argv[1] -DATASET_NAME = sys.argv[2] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" CITIBIKE_TABLE_NAME = "RAW_DATA" EXTERNAL_TABLES = { @@ -96,7 +93,7 @@ def gender(s): def convert_angle(angle): """Converts long and lat to DMS notation""" - if angle is None: + if not angle: return None degrees = int(angle) minutes = int((angle - degrees) * 60) @@ -105,19 +102,19 @@ def convert_angle(angle): return random.choices([str(angle), new_angle], weights=[0.55, 0.45])[0] -def create_bigquery_dataset(): +def create_bigquery_dataset(dataset_name): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f"{client.project}.{DATASET_NAME}" + dataset_id = f"{client.project}.{dataset_name}" dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) -def write_to_bigquery(df, table_name): +def write_to_bigquery(df, table_name, dataset_name): """Write a dataframe to BigQuery""" client = bigquery.Client() - dataset_id = f"{client.project}.{DATASET_NAME}" + dataset_id = f"{client.project}.{dataset_name}" # Saving the data to BigQuery df.write.format("bigquery").option("table", f"{dataset_id}.{table_name}").save() @@ -126,12 +123,16 @@ def write_to_bigquery(df, table_name): def main(): - # Create a SparkSession under the name "setup". 
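# Illustrative sketch (not part of the original patches): the refactor above
# moves BUCKET_NAME and DATASET_NAME out of module scope and into main(), still
# read from raw sys.argv. An argparse variant — an alternative, not what the
# sample uses — would make the positional arguments and the --test flag explicit.
import argparse


def parse_args(argv=None):
    parser = argparse.ArgumentParser(description="Data Science Onramp setup job")
    parser.add_argument("bucket_name", help="GCS bucket used as the connector's temporary staging bucket")
    parser.add_argument("dataset_name", help="BigQuery dataset to create and write to")
    parser.add_argument("--test", action="store_true", help="upload only a small sample of the data")
    return parser.parse_args(argv)


args = parse_args(["my-bucket", "my_dataset", "--test"])  # placeholder values
print(args.bucket_name, args.dataset_name, args.test)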
Viewable via the Spark UI + # Get command line arguments + BUCKET_NAME = sys.argv[1] + DATASET_NAME = sys.argv[2] + + # Create a SparkSession under the name "setup" spark = SparkSession.builder.appName("setup").getOrCreate() spark.conf.set("temporaryGcsBucket", BUCKET_NAME) - create_bigquery_dataset() + create_bigquery_dataset(DATASET_NAME) # Whether we are running the job as a test test = False @@ -147,7 +148,7 @@ def main(): for table_name, data in EXTERNAL_TABLES.items(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - write_to_bigquery(df, table_name) + write_to_bigquery(df, table_name, DATASET_NAME) # Check if table exists try: @@ -203,7 +204,7 @@ def main(): df = df.union(dup_df) print("Uploading citibike dataset...") - write_to_bigquery(df, CITIBIKE_TABLE_NAME) + write_to_bigquery(df, CITIBIKE_TABLE_NAME, DATASET_NAME) if __name__ == "__main__": diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 8e7ab8a40d7..d72fdb92ec1 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -35,7 +35,7 @@ "project_id": PROJECT_ID, "cluster_name": DATAPROC_CLUSTER, "config": { - "gce_cluster_config": {"zone_uri": "",}, + "gce_cluster_config": {"zone_uri": ""}, "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-8"}, "worker_config": {"num_instances": 6, "machine_type_uri": "n1-standard-8"}, "software_config": { @@ -48,7 +48,7 @@ "placement": {"cluster_name": DATAPROC_CLUSTER}, "pyspark_job": { "main_python_file_uri": f"gs://{BUCKET_NAME}/{BUCKET_BLOB}", - "args": [BUCKET_NAME, BQ_DATASET, "--test",], + "args": [BUCKET_NAME, BQ_DATASET, "--test"], "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], }, } @@ -58,10 +58,11 @@ def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient( - client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} + #client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) + operation = cluster_client.create_cluster( - PROJECT_ID, CLUSTER_REGION, CLUSTER_CONFIG + project_id=PROJECT_ID, region=CLUSTER_REGION, cluster=CLUSTER_CONFIG ) # Wait for cluster to provision @@ -71,7 +72,7 @@ def setup_and_teardown_cluster(): # Delete cluster operation = cluster_client.delete_cluster( - PROJECT_ID, CLUSTER_REGION, DATAPROC_CLUSTER + project_id=PROJECT_ID, region=CLUSTER_REGION, name=DATAPROC_CLUSTER ) operation.result() From 3242654158e0c01a24f2f004933345d2ce043ab6 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 13 Aug 2020 17:40:10 -0400 Subject: [PATCH 58/59] apply temporary fix for ANACONDA optional component --- data-science-onramp/data-ingestion/setup_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index d72fdb92ec1..c325b5a0e98 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -40,7 +40,7 @@ "worker_config": {"num_instances": 6, "machine_type_uri": "n1-standard-8"}, "software_config": { "image_version": CLUSTER_IMAGE, - "optional_components": ["ANACONDA"], + "optional_components": [5], }, }, } @@ -58,7 +58,7 @@ def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient( - 
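# Illustrative sketch (not part of the original patches): the google-cloud-dataproc
# 2.x calling convention the fixture above is migrating to — keyword arguments
# instead of the 1.x positional (project, region, cluster) form. Project, region
# and cluster values are placeholders.
from google.cloud import dataproc_v1 as dataproc

region = "us-central1"
cluster_client = dataproc.ClusterControllerClient(
    client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)

cluster_config = {
    "project_id": "my-project",
    "cluster_name": "my-cluster",
    "config": {"master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-4"}},
}

# Both calls return long-running operations; .result() blocks until they finish
cluster_client.create_cluster(
    project_id="my-project", region=region, cluster=cluster_config
).result()
cluster_client.delete_cluster(
    project_id="my-project", region=region, cluster_name="my-cluster"
).result()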
#client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} + client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) operation = cluster_client.create_cluster( @@ -72,7 +72,7 @@ def setup_and_teardown_cluster(): # Delete cluster operation = cluster_client.delete_cluster( - project_id=PROJECT_ID, region=CLUSTER_REGION, name=DATAPROC_CLUSTER + project_id=PROJECT_ID, region=CLUSTER_REGION, cluster_name=DATAPROC_CLUSTER ) operation.result() From b82059b1de61c3b1f3eae112629098ce378fa0e3 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 13 Aug 2020 18:24:09 -0400 Subject: [PATCH 59/59] remove data cleaning files --- data-science-onramp/data-cleaning/clean.py | 226 ------------------ data-science-onramp/data-cleaning/clean.sh | 12 - .../data-cleaning/clean_test.py | 144 ----------- .../data-cleaning/requirements-test.txt | 1 - .../data-cleaning/requirements.txt | 6 - 5 files changed, 389 deletions(-) delete mode 100644 data-science-onramp/data-cleaning/clean.py delete mode 100755 data-science-onramp/data-cleaning/clean.sh delete mode 100644 data-science-onramp/data-cleaning/clean_test.py delete mode 100644 data-science-onramp/data-cleaning/requirements-test.txt delete mode 100644 data-science-onramp/data-cleaning/requirements.txt diff --git a/data-science-onramp/data-cleaning/clean.py b/data-science-onramp/data-cleaning/clean.py deleted file mode 100644 index 3c928dd8e9d..00000000000 --- a/data-science-onramp/data-cleaning/clean.py +++ /dev/null @@ -1,226 +0,0 @@ -import datetime -import re -import sys -import time - -from google.cloud import storage -from py4j.protocol import Py4JJavaError -from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction -from pyspark.sql.types import FloatType, IntegerType, StringType - - -PROJECT_ID = sys.argv[1] -BUCKET_NAME = sys.argv[2] -TABLE = f"{PROJECT_ID}.new_york_citibike_trips.RAW_DATA" - - -def trip_duration_udf(duration): - """Convert trip duration to seconds. Return None if negative.""" - if not duration: - return None - - time = re.match(r"\d*.\d*", duration) - - if not time: - return None - - time = float(time[0]) - - if time < 0: - return None - - if "m" in duration: - time *= 60 - elif "h" in duration: - time *= 60 * 60 - - return int(time) - - -def station_name_udf(name): - """Replaces '/' with '&'.""" - return name.replace("/", "&") if name else None - - -def user_type_udf(user): - """Converts user type to 'Subscriber' or 'Customer'.""" - if not user: - return None - - if user.lower().startswith("sub"): - return "Subscriber" - elif user.lower().startswith("cust"): - return "Customer" - - -def gender_udf(gender): - """Converts gender to 'Male' or 'Female'.""" - if not gender: - return None - - if gender.lower().startswith("m"): - return "Male" - elif gender.lower().startswith("f"): - return "Female" - - -def angle_udf(angle): - """Converts DMS notation to degrees. Return None if not in DMS or degrees notation.""" - if not angle: - return None - - dms = re.match(r'(-?\d*).(-?\d*)\'(-?\d*)"', angle) - if dms: - return int(dms[1]) + int(dms[2]) / 60 + int(dms[3]) / (60 * 60) - - degrees = re.match(r"\d*.\d*", angle) - if degrees: - return float(degrees[0]) - - -def compute_time(duration, start, end): - """Calculates duration, start time, and end time from each other if one value is null.""" - time_format = "%Y-%m-%dT%H:%M:%S" - - # Transform to datetime objects - if start: - # Round to nearest second - if "." 
in start: - start = start[: start.index(".")] - # Convert to datetime - start = datetime.datetime.strptime(start, time_format) - if end: - # Round to nearest second - if "." in end: - end = end[: end.index(".")] - # Convert to datetime - end = datetime.datetime.strptime(end, time_format) - if duration: - # Convert to timedelta - duration = datetime.timedelta(seconds=duration) - - # Calculate missing value - if start and end and not duration: - duration = end - start - elif duration and end and not start: - start = end - duration - elif duration and start and not end: - end = start + duration - - # Transform to primitive types - if duration: - duration = int(duration.total_seconds()) - if start: - start = start.strftime(time_format) - if end: - end = end.strftime(time_format) - - return (duration, start, end) - - -def compute_duration_udf(duration, start, end): - """Calculates duration from start and end time if null.""" - return compute_time(duration, start, end)[0] - - -def compute_start_udf(duration, start, end): - """Calculates start time from duration and end time if null.""" - return compute_time(duration, start, end)[1] - - -def compute_end_udf(duration, start, end): - """Calculates end time from duration and start time if null.""" - return compute_time(duration, start, end)[2] - - -if __name__ == "__main__": - # Create a SparkSession, viewable via the Spark UI - spark = SparkSession.builder.appName("data_cleaning").getOrCreate() - - # Load data into dataframe if table exists - try: - df = spark.read.format("bigquery").option("table", TABLE).load() - except Py4JJavaError as e: - raise Exception(f"Error reading {TABLE}") from e - - # Single-parameter udfs - udfs = { - "start_station_name": UserDefinedFunction(station_name_udf, StringType()), - "end_station_name": UserDefinedFunction(station_name_udf, StringType()), - "tripduration": UserDefinedFunction(trip_duration_udf, IntegerType()), - "usertype": UserDefinedFunction(user_type_udf, StringType()), - "gender": UserDefinedFunction(gender_udf, StringType()), - "start_station_latitude": UserDefinedFunction(angle_udf, FloatType()), - "start_station_longitude": UserDefinedFunction(angle_udf, FloatType()), - "end_station_latitude": UserDefinedFunction(angle_udf, FloatType()), - "end_station_longitude": UserDefinedFunction(angle_udf, FloatType()), - } - - for name, udf in udfs.items(): - df = df.withColumn(name, udf(name)) - - # Multi-parameter udfs - multi_udfs = { - "tripduration": { - "udf": UserDefinedFunction(compute_duration_udf, IntegerType()), - "params": ("tripduration", "starttime", "stoptime"), - }, - "starttime": { - "udf": UserDefinedFunction(compute_start_udf, StringType()), - "params": ("tripduration", "starttime", "stoptime"), - }, - "stoptime": { - "udf": UserDefinedFunction(compute_end_udf, StringType()), - "params": ("tripduration", "starttime", "stoptime"), - }, - } - - for name, obj in multi_udfs.items(): - df = df.withColumn(name, obj["udf"](*obj["params"])) - - # Display sample of rows - df.sample(False, 0.001).show(n=100) - - # Write results to GCS - if "--dry-run" in sys.argv: - print("Data will not be uploaded to GCS") - else: - # Set GCS temp location - path = str(time.time()) - temp_path = "gs://" + BUCKET_NAME + "/" + path - - # Write dataframe to temp location to preserve the data in final location - # This takes time, so final location should not be overwritten with partial data - print("Uploading data to GCS...") - ( - df.write - # gzip the output file - 
.options(codec="org.apache.hadoop.io.compress.GzipCodec") - # write as csv - .csv(temp_path) - ) - - # Get GCS bucket - storage_client = storage.Client() - source_bucket = storage_client.get_bucket(BUCKET_NAME) - - # Get all files in temp location - blobs = list(source_bucket.list_blobs(prefix=path)) - - # Copy files from temp location to the final location - # This is much quicker than the original write to the temp location - final_path = "clean_data/" - for blob in blobs: - file_match = re.match(path + r"/(part-\d*)[0-9a-zA-Z\-]*.csv.gz", blob.name) - if file_match: - new_blob = final_path + file_match[1] + ".csv.gz" - source_bucket.copy_blob(blob, source_bucket, new_blob) - - # Delete the temp location - for blob in blobs: - blob.delete() - - print( - "Data successfully uploaded to " + "gs://" + BUCKET_NAME + "/" + final_path - ) diff --git a/data-science-onramp/data-cleaning/clean.sh b/data-science-onramp/data-cleaning/clean.sh deleted file mode 100755 index cff45237ce6..00000000000 --- a/data-science-onramp/data-cleaning/clean.sh +++ /dev/null @@ -1,12 +0,0 @@ -# Submit a PySpark job via the Cloud Dataproc Jobs API -# Requires having PROJECT_ID, CLUSTER_NAME and BUCKET_NAME set as -# environment variables - -export CLUSTER_NAME=data-cleaning -export PROJECT_ID=data-science-onramp -export BUCKET_NAME=citibikevd - -gcloud dataproc jobs submit pyspark --cluster ${CLUSTER_NAME} \ - --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ - --driver-log-levels root=FATAL \ - clean.py -- ${PROJECT_ID} ${BUCKET_NAME} --dry-run \ No newline at end of file diff --git a/data-science-onramp/data-cleaning/clean_test.py b/data-science-onramp/data-cleaning/clean_test.py deleted file mode 100644 index ee28a65da01..00000000000 --- a/data-science-onramp/data-cleaning/clean_test.py +++ /dev/null @@ -1,144 +0,0 @@ -import os -import re -import uuid - -from google.cloud import dataproc_v1 as dataproc -from google.cloud import storage -import pytest - -# Set global variables -PROJECT = os.environ["GCLOUD_PROJECT"] -DATAPROC_CLUSTER = f"clean-test-{uuid.uuid4()}" -BUCKET_NAME = f"clean-test-code-{uuid.uuid4()}" -CLUSTER_REGION = "us-east4" -DESTINATION_BLOB_NAME = "clean.py" -JOB_FILE_NAME = f"gs://{BUCKET_NAME}/clean.py" -JOB_DETAILS = { # Job configuration - "placement": {"cluster_name": DATAPROC_CLUSTER}, - "pyspark_job": { - "main_python_file_uri": JOB_FILE_NAME, - "args": [PROJECT, BUCKET_NAME, "--dry-run",], - "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], - }, -} -CLUSTER_IMAGE = "1.5.4-debian10" -CLUSTER_DATA = { # Create cluster configuration - "project_id": PROJECT, - "cluster_name": DATAPROC_CLUSTER, - "config": { - "gce_cluster_config": { - "zone_uri": "", - "metadata": {"PIP_PACKAGES": "google-cloud-storage"}, - }, - "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-8"}, - "worker_config": {"num_instances": 6, "machine_type_uri": "n1-standard-8"}, - "software_config": { - "image_version": CLUSTER_IMAGE, - "optional_components": ["ANACONDA"], - }, - }, -} - - -@pytest.fixture(autouse=True) -def setup_and_teardown_cluster(): - # Create cluster using cluster client - cluster_client = dataproc.ClusterControllerClient( - client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} - ) - operation = cluster_client.create_cluster(PROJECT, CLUSTER_REGION, CLUSTER_DATA) - - # Wait for cluster to provision - operation.result() - - yield - - # Delete cluster - operation = cluster_client.delete_cluster(PROJECT, CLUSTER_REGION, 
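# Illustrative sketch (not part of the original patches): the GCS half of the
# pattern used by the removed clean.py — Spark writes gzipped part files to a
# throwaway prefix, then the storage client copies them to a stable prefix and
# deletes the originals. Bucket name and prefixes are placeholders.
import re

from google.cloud import storage

bucket = storage.Client().get_bucket("my-bucket")
temp_prefix = "1597351234.56"   # e.g. str(time.time()) used as the temp path
final_prefix = "clean_data/"

blobs = list(bucket.list_blobs(prefix=temp_prefix))
for blob in blobs:
    match = re.match(temp_prefix + r"/(part-\d+)[\w\-]*\.csv\.gz", blob.name)
    if match:
        bucket.copy_blob(blob, bucket, final_prefix + match[1] + ".csv.gz")
for blob in blobs:
    blob.delete()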
DATAPROC_CLUSTER) - operation.result() - - -@pytest.fixture(autouse=True) -def setup_and_teardown_bucket(): - # Create GCS Bucket - storage_client = storage.Client() - bucket = storage_client.create_bucket(BUCKET_NAME) - - # Upload file - blob = bucket.blob(DESTINATION_BLOB_NAME) - blob.upload_from_filename("clean.py") - - yield - - # Delete GCS bucket - bucket = storage_client.get_bucket(BUCKET_NAME) - bucket.delete(force=True) - - -def is_in_table(value, out): - return re.search(f"\\| *{value} *\\|", out) - - -def get_blob_from_path(path): - bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] - bucket = storage.Client().get_bucket(bucket_name) - output_location = re.search("google-cloud-dataproc.+", path).group(0) - return bucket.blob(output_location) - - -def test_clean(): - """Tests clean.py by submitting it to a Dataproc cluster""" - - # Submit job to Dataproc cluster - job_client = dataproc.JobControllerClient( - client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} - ) - response = job_client.submit_job_as_operation( - project_id=PROJECT, region=CLUSTER_REGION, job=JOB_DETAILS - ) - - # Wait for job to complete - result = response.result() - - # Get job output - output_location = result.driver_output_resource_uri + ".000000000" - blob = get_blob_from_path(output_location) - out = blob.download_as_string().decode("utf-8") - - # trip duration - assert not is_in_table(r"\d*.\d* s", out) - assert not is_in_table(r"\d*.\d* min", out) - assert not is_in_table(r"\d*.\d* h", out) - - # station latitude & longitude - assert not is_in_table(r"\d+" + "\u00B0" + r"\d+\'\d+\"", out) - - assert is_in_table(r"\d*.\d*", out) - - # gender - assert not is_in_table("M", out) - assert not is_in_table("m", out) - assert not is_in_table("male", out) - assert not is_in_table("MALE", out) - assert not is_in_table("F", out) - assert not is_in_table("f", out) - assert not is_in_table("female", out) - assert not is_in_table("FEMALE", out) - assert not is_in_table("U", out) - assert not is_in_table("u", out) - assert not is_in_table("unknown", out) - assert not is_in_table("UNKNOWN", out) - - assert is_in_table("Male", out) - assert is_in_table("Female", out) - - # customer_plan - assert not is_in_table("subscriber", out) - assert not is_in_table("SUBSCRIBER", out) - assert not is_in_table("sub", out) - assert not is_in_table("customer", out) - assert not is_in_table("CUSTOMER", out) - assert not is_in_table("cust", out) - - assert is_in_table("Subscriber", out) - assert is_in_table("Customer", out) diff --git a/data-science-onramp/data-cleaning/requirements-test.txt b/data-science-onramp/data-cleaning/requirements-test.txt deleted file mode 100644 index 781d4326c94..00000000000 --- a/data-science-onramp/data-cleaning/requirements-test.txt +++ /dev/null @@ -1 +0,0 @@ -pytest==5.3.2 diff --git a/data-science-onramp/data-cleaning/requirements.txt b/data-science-onramp/data-cleaning/requirements.txt deleted file mode 100644 index f435423c623..00000000000 --- a/data-science-onramp/data-cleaning/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -grpcio==1.29.0 -google-auth==1.16.0 -google-auth-httplib2==0.0.3 -google-cloud==0.34.0 -google-cloud-storage==1.28.1 -google-cloud-dataproc==0.8.0 \ No newline at end of file