From 92cf76335d74e89324d0a7b3a3f979db5f50c50e Mon Sep 17 00:00:00 2001 From: vuppalli Date: Fri, 5 Jun 2020 15:35:13 -0400 Subject: [PATCH 01/59] add data ingestion code --- .../data-ingestion/requirements-test.txt | 1 + .../data-ingestion/requirements.txt | 6 + .../data-ingestion/setup-test.py | 210 ++++++++++++++++++ data-science-onramp/data-ingestion/setup.py | 149 +++++++++++++ data-science-onramp/data-ingestion/setup.sh | 6 + 5 files changed, 372 insertions(+) create mode 100644 data-science-onramp/data-ingestion/requirements-test.txt create mode 100644 data-science-onramp/data-ingestion/requirements.txt create mode 100644 data-science-onramp/data-ingestion/setup-test.py create mode 100644 data-science-onramp/data-ingestion/setup.py create mode 100644 data-science-onramp/data-ingestion/setup.sh diff --git a/data-science-onramp/data-ingestion/requirements-test.txt b/data-science-onramp/data-ingestion/requirements-test.txt new file mode 100644 index 00000000000..781d4326c94 --- /dev/null +++ b/data-science-onramp/data-ingestion/requirements-test.txt @@ -0,0 +1 @@ +pytest==5.3.2 diff --git a/data-science-onramp/data-ingestion/requirements.txt b/data-science-onramp/data-ingestion/requirements.txt new file mode 100644 index 00000000000..f435423c623 --- /dev/null +++ b/data-science-onramp/data-ingestion/requirements.txt @@ -0,0 +1,6 @@ +grpcio==1.29.0 +google-auth==1.16.0 +google-auth-httplib2==0.0.3 +google-cloud==0.34.0 +google-cloud-storage==1.28.1 +google-cloud-dataproc==0.8.0 \ No newline at end of file diff --git a/data-science-onramp/data-ingestion/setup-test.py b/data-science-onramp/data-ingestion/setup-test.py new file mode 100644 index 00000000000..d827c805818 --- /dev/null +++ b/data-science-onramp/data-ingestion/setup-test.py @@ -0,0 +1,210 @@ +import os +import re + +import uuid + +from google.api_core.exceptions import GoogleAPICallError + +from google.cloud import dataproc_v1 as dataproc +from google.cloud import storage +from google.cloud.exceptions import NotFound + +import pytest + +waiting_cluster_callback = False + +# Set global variables +project = os.environ['GCLOUD_PROJECT'] +region = "us-central1" +zone = "us-central1-a" +cluster_name = 'setup-test-{}'.format(str(uuid.uuid4())) +bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4())) + + +@pytest.fixture(autouse=True) +def teardown(): + yield + + # Delete cluster + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': f'{region}-dataproc.googleapis.com:443' + }) + + try: + operation = cluster_client.delete_cluster(project, region, + cluster_name) + operation.result() + except GoogleAPICallError: + pass + + # Delete GCS bucket + storage_client = storage.Client() + try: + bucket = storage_client.get_bucket(bucket_name) + bucket.delete(force=True) + except NotFound: + pass + + +def test_setup(capsys): + '''Tests setup.py by submitting it to a dataproc cluster''' + + # Create GCS Bucket + storage_client = storage.Client() + bucket = storage_client.create_bucket(bucket_name) + + # Upload file + destination_blob_name = "setup.py" + blob = bucket.blob(destination_blob_name) + blob.upload_from_filename("setup.py") + + job_file_name = "gs://" + bucket_name + "/setup.py" + + # Create cluster configuration + zone_uri = \ + 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( + project, zone) + cluster_data = { + 'project_id': project, + 'cluster_name': cluster_name, + 'config': { + 'gce_cluster_config': { + 'zone_uri': zone_uri, + "metadata": { + "PIP_PACKAGES": 
"google-cloud-storage" + }, + }, + 'master_config': { + 'num_instances': 1, + 'machine_type_uri': 'n1-standard-8' + }, + 'worker_config': { + 'num_instances': 6, + 'machine_type_uri': 'n1-standard-8' + }, + "initialization_actions": [ + { + "executable_file": ("gs://dataproc-initialization-actions/" + "python/pip-install.sh"), + } + ], + "software_config": { + "image_version": "1.5.4-debian10", + "optional_components": [ + "ANACONDA" + ], + } + } + } + + # Create cluster using cluster client + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + }) + + cluster = cluster_client.create_cluster(project, region, cluster_data) + cluster.add_done_callback(callback) + + # Wait for cluster to provision + global waiting_cluster_callback + waiting_cluster_callback = True + + wait_for_cluster_creation() + + # Create job configuration + job_details = { + 'placement': { + 'cluster_name': cluster_name + }, + 'pyspark_job': { + 'main_python_file_uri': job_file_name, + 'args': [ + bucket_name, + "--test", + ], + "jar_file_uris": [ + "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" + ], + }, + } + + # Submit job to dataproc cluster + job_client = dataproc.JobControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + }) + + result = job_client.submit_job(project_id=project, region=region, + job=job_details) + + job_id = result.reference.job_id + print('Submitted job \"{}\".'.format(job_id)) + + # Wait for job to complete + wait_for_job(job_client, job_id) + + # Get job output + cluster_info = cluster_client.get_cluster(project, region, cluster_name) + bucket = storage_client.get_bucket(cluster_info.config.config_bucket) + output_blob = ( + 'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000' + .format(cluster_info.cluster_uuid, job_id)) + out = bucket.blob(output_blob).download_as_string().decode("utf-8") + + # tripDuration + assert re.search("[0-9] s", out) + assert re.search("[0-9] m", out) + assert re.search("[0-9] h", out) + + # station latitude & longitude + assert re.search(u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + + # birth_year + assert re.search("19[0-9][0-9]\\|", out) + assert re.search("20[0-9][0-9]\\|", out) + + # gender + assert "M" in out + assert "male" in out + assert "MALE" in out + assert "F" in out + assert "female" in out + assert "FEMALE" in out + assert "u" in out + assert "unknown" in out + assert "UNKNOWN" in out + + # customer_plan + assert "Subscriber" in out + assert "subscriber" in out + assert "SUBSCRIBER" in out + assert "sub" in out + assert "Customer" in out + assert "customer" in out + assert "CUSTOMER" in out + assert "cust" in out + + # Missing data + assert "null" in out + + +def callback(operation_future): + '''Sets a flag to stop waiting''' + global waiting_cluster_callback + waiting_cluster_callback = False + + +def wait_for_cluster_creation(): + '''Waits for cluster to create''' + while True: + if not waiting_cluster_callback: + break + + +def wait_for_job(job_client, job_id): + '''Waits for job to finish''' + while True: + job = job_client.get_job(project, region, job_id) + assert job.status.State.Name(job.status.state) != "ERROR" + + if job.status.State.Name(job.status.state) == "DONE": + return diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py new file mode 100644 index 00000000000..91a740b34d0 --- /dev/null +++ b/data-science-onramp/data-ingestion/setup.py @@ -0,0 +1,149 
@@ +from random import choice, choices, randint, seed +import sys + +from time import time_ns + +from google.cloud import bigquery + +from py4j.protocol import Py4JJavaError +from pyspark.sql import SparkSession + +from pyspark.sql.functions import UserDefinedFunction +from pyspark.sql.types import IntegerType, StringType + + +# Create a SparkSession under the name "setup". Viewable via the Spark UI +spark = SparkSession.builder.appName("setup").getOrCreate() + +bucket_name = sys.argv[1] +upload = True # Whether to upload data to BigQuery + +# Check whether or not results should be uploaded +try: + sys.argv[2] + upload = False +except IndexError: + print("Results will be uploaded to BigQuery") + +table = "bigquery-public-data.new_york_citibike.citibike_trips" + +# Check if table exists +try: + df = spark.read.format('bigquery').option('table', table).load() +except Py4JJavaError: + print(f"{table} does not exist. ") + sys.exit(0) + +# START MAKING DATA DIRTY + + +def random_select(items, cum_weights): + '''Picks an item according to the cumulative weights''' + return choices(items, cum_weights=cum_weights, k=1)[0] + + +def tripduration(duration): + '''Converts trip duration to other units''' + seconds = str(duration) + " s" + minutes = str(float(duration) / 60) + " min" + hours = str(float(duration) / 3600) + " h" + return random_select([seconds, minutes, hours, str(randint(-1000, -1))], + [0.3, 0.6, 0.9, 1]) + + +def station_name(name): + '''Replaces '&' with '/' with a 50% chance''' + return choice([name, name.replace("&", "/")]) + + +def usertype(user): + '''Manipulates the user type string''' + return choice([user, user.upper(), user.lower(), + "sub" if user == "Subscriber" else user, + "cust" if user == "Customer" else user]) + + +def gender(s): + '''Manipulates the gender string''' + return choice([s, s.upper(), s.lower(), + s[0] if len(s) > 0 else "", + s[0].lower() if len(s) > 0 else ""]) + + +def convertAngle(angle): + '''Converts long and lat to DMS notation''' + degrees = int(angle) + minutes = int((angle - degrees) * 60) + seconds = int((angle - degrees - minutes/60) * 3600) + new_angle = str(degrees) + u"\u00B0" + \ + str(minutes) + "'" + str(seconds) + '"' + return random_select([str(angle), new_angle], cum_weights=[0.55, 1]) + + +def dirty_data(proc_func, allow_none): + '''Master function returns a user defined function + that transforms the column data''' + def udf(col_value): + seed(hash(col_value) + time_ns()) + if col_value is None: + return col_value + elif allow_none: + return random_select([None, proc_func(col_value)], + cum_weights=[0.05, 1]) + else: + return proc_func(col_value) + return udf + + +def id(x): + return x + + +# Declare data transformations for each column in dataframe +udfs = [ + (dirty_data(tripduration, True), StringType()), # tripduration + (dirty_data(id, True), StringType()), # starttime + (dirty_data(id, True), StringType()), # stoptime + (id, IntegerType()), # start_station_id + (dirty_data(station_name, False), StringType()), # start_station_name + (dirty_data(convertAngle, True), StringType()), # start_station_latitude + (dirty_data(convertAngle, True), StringType()), # start_station_longitude + (id, IntegerType()), # end_station_id + (dirty_data(station_name, False), StringType()), # end_station_name + (dirty_data(convertAngle, True), StringType()), # end_station_latitude + (dirty_data(convertAngle, True), StringType()), # end_station_longitude + (id, IntegerType()), # bikeid + (dirty_data(usertype, False), StringType()), # usertype + (id, 
IntegerType()), # birth_year + (dirty_data(gender, False), StringType()), # gender + (id, StringType()), # customer_plan +] + +# Apply dirty transformations to df +names = df.schema.names +new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) + for udf, column, name in zip(udfs, df.columns, names)]) + +# Duplicate about 0.01% of the rows +dup_df = new_df.sample(False, 0.0001, seed=42) + +# Create final dirty dataframe +df = new_df.union(dup_df) +df.sample(False, 0.0001, seed=50).show(n=200) +print("Dataframe sample printed") + +# Write to BigQuery +if upload: + # Create BigQuery Dataset + client = bigquery.Client() + dataset_id = '{}.new_york_citibike_trips'.format(client.project) + dataset = bigquery.Dataset(dataset_id) + dataset.location = "US" + dataset = client.create_dataset(dataset) + + # Saving the data to BigQuery + spark.conf.set('temporaryGcsBucket', bucket_name) + + df.write.format('bigquery') \ + .option('table', dataset_id + ".RAW_DATA") \ + .save() diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh new file mode 100644 index 00000000000..12730a3a6fe --- /dev/null +++ b/data-science-onramp/data-ingestion/setup.sh @@ -0,0 +1,6 @@ +# Submit a PySpark job via the Cloud Dataproc Jobs API +gcloud dataproc jobs submit pyspark \ + --cluster ${CLUSTER_NAME} \ + --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ + --driver-log-levels root=FATAL \ + setup.py -- ${BUCKET_NAME} From 739114a595c9698737bc853a06f52dae7ca63291 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 8 Jun 2020 10:24:56 -0400 Subject: [PATCH 02/59] begin addressing comments --- data-science-onramp/data-ingestion/setup.py | 57 +++---- data-science-onramp/data-ingestion/setup.sh | 3 + .../{setup-test.py => setup_test.py} | 145 +++++++----------- 3 files changed, 90 insertions(+), 115 deletions(-) rename data-science-onramp/data-ingestion/{setup-test.py => setup_test.py} (59%) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 91a740b34d0..dc869903c84 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,4 +1,4 @@ -from random import choice, choices, randint, seed +import random import sys from time import time_ns @@ -19,10 +19,10 @@ upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded -try: - sys.argv[2] +if len(sys.arv) > 1: upload = False -except IndexError: + print("Not uploading results to BigQuery") +else: print("Results will be uploaded to BigQuery") table = "bigquery-public-data.new_york_citibike.citibike_trips" @@ -37,59 +37,60 @@ # START MAKING DATA DIRTY -def random_select(items, cum_weights): +def random_select(items, weights): '''Picks an item according to the cumulative weights''' - return choices(items, cum_weights=cum_weights, k=1)[0] + return random.choices(items, weights=weights, k=1)[0] -def tripduration(duration): +def trip_duration(duration): '''Converts trip duration to other units''' seconds = str(duration) + " s" minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" - return random_select([seconds, minutes, hours, str(randint(-1000, -1))], - [0.3, 0.6, 0.9, 1]) + return random_select([seconds, minutes, hours, + str(random.randint(-1000, -1))], + [0.3, 0.3, 0.3, 0.1]) def station_name(name): '''Replaces '&' with '/' with a 50% chance''' - return choice([name, name.replace("&", "/")]) + return random.choice([name, 
name.replace("&", "/")]) -def usertype(user): +def user_type(user): '''Manipulates the user type string''' - return choice([user, user.upper(), user.lower(), - "sub" if user == "Subscriber" else user, - "cust" if user == "Customer" else user]) + return random.choice([user, user.upper(), user.lower(), + "sub" if user == "Subscriber" else user, + "cust" if user == "Customer" else user]) def gender(s): '''Manipulates the gender string''' - return choice([s, s.upper(), s.lower(), - s[0] if len(s) > 0 else "", - s[0].lower() if len(s) > 0 else ""]) + return random.choice([s, s.upper(), s.lower(), + s[0] if len(s) > 0 else "", + s[0].lower() if len(s) > 0 else ""]) -def convertAngle(angle): +def convert_angle(angle): '''Converts long and lat to DMS notation''' degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) new_angle = str(degrees) + u"\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' - return random_select([str(angle), new_angle], cum_weights=[0.55, 1]) + return random_select([str(angle), new_angle], [0.55, 0.45]) def dirty_data(proc_func, allow_none): '''Master function returns a user defined function that transforms the column data''' def udf(col_value): - seed(hash(col_value) + time_ns()) + random.seed(hash(col_value) + time_ns()) if col_value is None: return col_value elif allow_none: return random_select([None, proc_func(col_value)], - cum_weights=[0.05, 1]) + [0.05, 0.95]) else: return proc_func(col_value) return udf @@ -101,19 +102,19 @@ def id(x): # Declare data transformations for each column in dataframe udfs = [ - (dirty_data(tripduration, True), StringType()), # tripduration + (dirty_data(trip_duration, True), StringType()), # tripduration (dirty_data(id, True), StringType()), # starttime (dirty_data(id, True), StringType()), # stoptime (id, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convertAngle, True), StringType()), # start_station_latitude - (dirty_data(convertAngle, True), StringType()), # start_station_longitude + (dirty_data(convert_angle, True), StringType()), # start_station_latitude + (dirty_data(convert_angle, True), StringType()), # start_station_longitude (id, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convertAngle, True), StringType()), # end_station_latitude - (dirty_data(convertAngle, True), StringType()), # end_station_longitude + (dirty_data(convert_angle, True), StringType()), # end_station_latitude + (dirty_data(convert_angle, True), StringType()), # end_station_longitude (id, IntegerType()), # bikeid - (dirty_data(usertype, False), StringType()), # usertype + (dirty_data(user_type, False), StringType()), # usertype (id, IntegerType()), # birth_year (dirty_data(gender, False), StringType()), # gender (id, StringType()), # customer_plan @@ -136,7 +137,7 @@ def id(x): if upload: # Create BigQuery Dataset client = bigquery.Client() - dataset_id = '{}.new_york_citibike_trips'.format(client.project) + dataset_id = f'{client.project}.new_york_citibike_trips' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index 12730a3a6fe..f78c8cd120b 100644 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -1,4 +1,7 @@ # Submit a PySpark job via the Cloud Dataproc 
Jobs API +# Requires having CLUSTER_NAME and BUCKET_NAME set as +# environment variables + gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ diff --git a/data-science-onramp/data-ingestion/setup-test.py b/data-science-onramp/data-ingestion/setup_test.py similarity index 59% rename from data-science-onramp/data-ingestion/setup-test.py rename to data-science-onramp/data-ingestion/setup_test.py index d827c805818..54f3d20e902 100644 --- a/data-science-onramp/data-ingestion/setup-test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -11,62 +11,25 @@ import pytest -waiting_cluster_callback = False # Set global variables -project = os.environ['GCLOUD_PROJECT'] -region = "us-central1" -zone = "us-central1-a" -cluster_name = 'setup-test-{}'.format(str(uuid.uuid4())) -bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4())) +PROJECT = os.environ['GCLOUD_PROJECT'] +REGION = "us-central1" +ZONE = "us-central1-a" +CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' +BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' +BUCKET = None -@pytest.fixture(autouse=True) -def teardown(): - yield - - # Delete cluster - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{region}-dataproc.googleapis.com:443' - }) - - try: - operation = cluster_client.delete_cluster(project, region, - cluster_name) - operation.result() - except GoogleAPICallError: - pass - - # Delete GCS bucket - storage_client = storage.Client() - try: - bucket = storage_client.get_bucket(bucket_name) - bucket.delete(force=True) - except NotFound: - pass - - -def test_setup(capsys): - '''Tests setup.py by submitting it to a dataproc cluster''' - - # Create GCS Bucket - storage_client = storage.Client() - bucket = storage_client.create_bucket(bucket_name) - - # Upload file - destination_blob_name = "setup.py" - blob = bucket.blob(destination_blob_name) - blob.upload_from_filename("setup.py") - - job_file_name = "gs://" + bucket_name + "/setup.py" +@pytest.fixture(autouse=True) +def setup_and_teardown_cluster(): # Create cluster configuration zone_uri = \ - 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( - project, zone) + f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}' cluster_data = { - 'project_id': project, - 'cluster_name': cluster_name, + 'project_id': PROJECT, + 'cluster_name': CLUSTER_NAME, 'config': { 'gce_cluster_config': { 'zone_uri': zone_uri, @@ -99,27 +62,59 @@ def test_setup(capsys): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - cluster = cluster_client.create_cluster(project, region, cluster_data) - cluster.add_done_callback(callback) + operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) # Wait for cluster to provision - global waiting_cluster_callback - waiting_cluster_callback = True + operation.result() - wait_for_cluster_creation() + yield + + # Delete cluster + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' + }) + + operation = cluster_client.delete_cluster(PROJECT, REGION, + CLUSTER_NAME) + operation.result() + + +@pytest.fixture(autouse=True) +def setup_and_teardown_bucket(): + global BUCKET + # Create GCS Bucket + storage_client = storage.Client() + BUCKET = 
storage_client.create_bucket(BUCKET_NAME) + + yield + + # Delete GCS bucket + storage_client = storage.Client() + bucket = storage_client.get_bucket(BUCKET_NAME) + bucket.delete(force=True) + +def test_setup(capsys): + '''Tests setup.py by submitting it to a dataproc cluster''' + + # Upload file + destination_blob_name = "setup.py" + blob = BUCKET.blob(destination_blob_name) + blob.upload_from_filename("setup.py") + + job_file_name = "gs://" + BUCKET_NAME + "/setup.py" # Create job configuration job_details = { 'placement': { - 'cluster_name': cluster_name + 'cluster_name': CLUSTER_NAME }, 'pyspark_job': { 'main_python_file_uri': job_file_name, 'args': [ - bucket_name, + BUCKET_NAME, "--test", ], "jar_file_uris": [ @@ -130,25 +125,21 @@ def test_setup(capsys): # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - result = job_client.submit_job(project_id=project, region=region, + response = job_client.submit_job(project_id=PROJECT, region=REGION, job=job_details) - job_id = result.reference.job_id + job_id = response.reference.job_id print('Submitted job \"{}\".'.format(job_id)) # Wait for job to complete - wait_for_job(job_client, job_id) + result = response.add_done_callback(callback) # Get job output - cluster_info = cluster_client.get_cluster(project, region, cluster_name) - bucket = storage_client.get_bucket(cluster_info.config.config_bucket) - output_blob = ( - 'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000' - .format(cluster_info.cluster_uuid, job_id)) - out = bucket.blob(output_blob).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri() + ".000000000" + output = BUCKET.blob(output_location).download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -186,25 +177,5 @@ def test_setup(capsys): # Missing data assert "null" in out - def callback(operation_future): - '''Sets a flag to stop waiting''' - global waiting_cluster_callback - waiting_cluster_callback = False - - -def wait_for_cluster_creation(): - '''Waits for cluster to create''' - while True: - if not waiting_cluster_callback: - break - - -def wait_for_job(job_client, job_id): - '''Waits for job to finish''' - while True: - job = job_client.get_job(project, region, job_id) - assert job.status.State.Name(job.status.state) != "ERROR" - - if job.status.State.Name(job.status.state) == "DONE": - return + return operation_future.result() From 681eaf3c85c3e159796d12cbcdf6c68c03a906f0 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Mon, 8 Jun 2020 11:26:33 -0400 Subject: [PATCH 03/59] change submit job --- data-science-onramp/data-ingestion/noxfile.py | 225 ++++++++++++++++++ .../data-ingestion/setup_test.py | 9 +- 2 files changed, 228 insertions(+), 6 deletions(-) create mode 100644 data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/noxfile.py b/data-science-onramp/data-ingestion/noxfile.py new file mode 100644 index 00000000000..b23055f14a6 --- /dev/null +++ b/data-science-onramp/data-ingestion/noxfile.py @@ -0,0 +1,225 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +from pathlib import Path +import sys + +import nox + + +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING +# DO NOT EDIT THIS FILE EVER! +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING + +# Copy `noxfile_config.py` to your directory and modify it instead. + + +# `TEST_CONFIG` dict is a configuration hook that allows users to +# modify the test configurations. The values here should be in sync +# with `noxfile_config.py`. Users will copy `noxfile_config.py` into +# their directory and modify it. + +TEST_CONFIG = { + # You can opt out from the test for specific Python versions. + 'ignored_versions': ["2.7"], + + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + 'gcloud_project_env': 'GCLOUD_PROJECT', + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + 'envs': {}, +} + + +try: + # Ensure we can import noxfile_config in the project's directory. + sys.path.append('.') + from noxfile_config import TEST_CONFIG_OVERRIDE +except ImportError as e: + print("No user noxfile_config found: detail: {}".format(e)) + TEST_CONFIG_OVERRIDE = {} + +# Update the TEST_CONFIG with the user supplied values. +TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) + + +def get_pytest_env_vars(): + """Returns a dict for pytest invocation.""" + ret = {} + + # Override the GCLOUD_PROJECT and the alias. + env_key = TEST_CONFIG['gcloud_project_env'] + # This should error out if not set. + ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] + ret['GCLOUD_PROJECT'] = os.environ[env_key] + + # Apply user supplied envs. + ret.update(TEST_CONFIG['envs']) + return ret + + +# DO NOT EDIT - automatically generated. +# All versions used to tested samples. +ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] + +# Any default versions that should be ignored. +IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] + +TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) + +INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) +# +# Style Checks +# + + +def _determine_local_import_names(start_dir): + """Determines all import names that should be considered "local". + + This is used when running the linter to insure that import order is + properly checked. + """ + file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] + return [ + basename + for basename, extension in file_ext_pairs + if extension == ".py" + or os.path.isdir(os.path.join(start_dir, basename)) + and basename not in ("__pycache__") + ] + + +# Linting with flake8. 
+# +# We ignore the following rules: +# E203: whitespace before ‘:’ +# E266: too many leading ‘#’ for block comment +# E501: line too long +# I202: Additional newline in a section of imports +# +# We also need to specify the rules which are ignored by default: +# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] +FLAKE8_COMMON_ARGS = [ + "--show-source", + "--builtin=gettext", + "--max-complexity=20", + "--import-order-style=google", + "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", + "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", + "--max-line-length=88", +] + + +@nox.session +def lint(session): + session.install("flake8", "flake8-import-order") + + local_names = _determine_local_import_names(".") + args = FLAKE8_COMMON_ARGS + [ + "--application-import-names", + ",".join(local_names), + "." + ] + session.run("flake8", *args) + + +# +# Sample Tests +# + + +PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] + + +def _session_tests(session, post_install=None): + """Runs py.test for a particular project.""" + if os.path.exists("requirements.txt"): + session.install("-r", "requirements.txt") + + if os.path.exists("requirements-test.txt"): + session.install("-r", "requirements-test.txt") + + if INSTALL_LIBRARY_FROM_SOURCE: + session.install("-e", _get_repo_root()) + + if post_install: + post_install(session) + + session.run( + "pytest", + *(PYTEST_COMMON_ARGS + session.posargs), + # Pytest will return 5 when no tests are collected. This can happen + # on travis where slow and flaky tests are excluded. + # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html + success_codes=[0, 5], + env=get_pytest_env_vars() + ) + + +@nox.session(python=ALL_VERSIONS) +def py(session): + """Runs py.test for a sample using the specified version of Python.""" + if session.python in TESTED_VERSIONS: + _session_tests(session) + else: + session.skip("SKIPPED: {} tests are disabled for this sample.".format( + session.python + )) + + +# +# Readmegen +# + + +def _get_repo_root(): + """ Returns the root folder of the project. """ + # Get root of this repository. Assume we don't have directories nested deeper than 10 items. 
+ p = Path(os.getcwd()) + for i in range(10): + if p is None: + break + if Path(p / ".git").exists(): + return str(p) + p = p.parent + raise Exception("Unable to detect repository root.") + + +GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) + + +@nox.session +@nox.parametrize("path", GENERATED_READMES) +def readmegen(session, path): + """(Re-)generates the readme for a sample.""" + session.install("jinja2", "pyyaml") + dir_ = os.path.dirname(path) + + if os.path.exists(os.path.join(dir_, "requirements.txt")): + session.install("-r", os.path.join(dir_, "requirements.txt")) + + in_file = os.path.join(dir_, "README.rst.in") + session.run( + "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file + ) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 54f3d20e902..919dcc4f35c 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -128,17 +128,14 @@ def test_setup(capsys): 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - response = job_client.submit_job(project_id=PROJECT, region=REGION, + response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, job=job_details) - job_id = response.reference.job_id - print('Submitted job \"{}\".'.format(job_id)) - # Wait for job to complete - result = response.add_done_callback(callback) + result = response.result() # Get job output - output_location = result.driver_output_resource_uri() + ".000000000" + output_location = result.driver_output_resource_uri + ".000000000" output = BUCKET.blob(output_location).download_as_string().decode("utf-8") # tripDuration From 4afbf1c5935e71882524a29511266b8dc2bf2a3a Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 8 Jun 2020 14:43:48 -0400 Subject: [PATCH 04/59] address code structure and global variable issues --- data-science-onramp/data-ingestion/noxfile.py | 225 ------------------ data-science-onramp/data-ingestion/setup.py | 125 +++++----- .../data-ingestion/setup_test.py | 17 +- 3 files changed, 78 insertions(+), 289 deletions(-) delete mode 100644 data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/noxfile.py b/data-science-onramp/data-ingestion/noxfile.py deleted file mode 100644 index b23055f14a6..00000000000 --- a/data-science-onramp/data-ingestion/noxfile.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -from pathlib import Path -import sys - -import nox - - -# WARNING - WARNING - WARNING - WARNING - WARNING -# WARNING - WARNING - WARNING - WARNING - WARNING -# DO NOT EDIT THIS FILE EVER! -# WARNING - WARNING - WARNING - WARNING - WARNING -# WARNING - WARNING - WARNING - WARNING - WARNING - -# Copy `noxfile_config.py` to your directory and modify it instead. 
- - -# `TEST_CONFIG` dict is a configuration hook that allows users to -# modify the test configurations. The values here should be in sync -# with `noxfile_config.py`. Users will copy `noxfile_config.py` into -# their directory and modify it. - -TEST_CONFIG = { - # You can opt out from the test for specific Python versions. - 'ignored_versions': ["2.7"], - - # An envvar key for determining the project id to use. Change it - # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a - # build specific Cloud project. You can also use your own string - # to use your own Cloud project. - 'gcloud_project_env': 'GCLOUD_PROJECT', - # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', - - # A dictionary you want to inject into your test. Don't put any - # secrets here. These values will override predefined values. - 'envs': {}, -} - - -try: - # Ensure we can import noxfile_config in the project's directory. - sys.path.append('.') - from noxfile_config import TEST_CONFIG_OVERRIDE -except ImportError as e: - print("No user noxfile_config found: detail: {}".format(e)) - TEST_CONFIG_OVERRIDE = {} - -# Update the TEST_CONFIG with the user supplied values. -TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) - - -def get_pytest_env_vars(): - """Returns a dict for pytest invocation.""" - ret = {} - - # Override the GCLOUD_PROJECT and the alias. - env_key = TEST_CONFIG['gcloud_project_env'] - # This should error out if not set. - ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] - ret['GCLOUD_PROJECT'] = os.environ[env_key] - - # Apply user supplied envs. - ret.update(TEST_CONFIG['envs']) - return ret - - -# DO NOT EDIT - automatically generated. -# All versions used to tested samples. -ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] - -# Any default versions that should be ignored. -IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] - -TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) - -INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) -# -# Style Checks -# - - -def _determine_local_import_names(start_dir): - """Determines all import names that should be considered "local". - - This is used when running the linter to insure that import order is - properly checked. - """ - file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] - return [ - basename - for basename, extension in file_ext_pairs - if extension == ".py" - or os.path.isdir(os.path.join(start_dir, basename)) - and basename not in ("__pycache__") - ] - - -# Linting with flake8. -# -# We ignore the following rules: -# E203: whitespace before ‘:’ -# E266: too many leading ‘#’ for block comment -# E501: line too long -# I202: Additional newline in a section of imports -# -# We also need to specify the rules which are ignored by default: -# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] -FLAKE8_COMMON_ARGS = [ - "--show-source", - "--builtin=gettext", - "--max-complexity=20", - "--import-order-style=google", - "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", - "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", - "--max-line-length=88", -] - - -@nox.session -def lint(session): - session.install("flake8", "flake8-import-order") - - local_names = _determine_local_import_names(".") - args = FLAKE8_COMMON_ARGS + [ - "--application-import-names", - ",".join(local_names), - "." 
- ] - session.run("flake8", *args) - - -# -# Sample Tests -# - - -PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] - - -def _session_tests(session, post_install=None): - """Runs py.test for a particular project.""" - if os.path.exists("requirements.txt"): - session.install("-r", "requirements.txt") - - if os.path.exists("requirements-test.txt"): - session.install("-r", "requirements-test.txt") - - if INSTALL_LIBRARY_FROM_SOURCE: - session.install("-e", _get_repo_root()) - - if post_install: - post_install(session) - - session.run( - "pytest", - *(PYTEST_COMMON_ARGS + session.posargs), - # Pytest will return 5 when no tests are collected. This can happen - # on travis where slow and flaky tests are excluded. - # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html - success_codes=[0, 5], - env=get_pytest_env_vars() - ) - - -@nox.session(python=ALL_VERSIONS) -def py(session): - """Runs py.test for a sample using the specified version of Python.""" - if session.python in TESTED_VERSIONS: - _session_tests(session) - else: - session.skip("SKIPPED: {} tests are disabled for this sample.".format( - session.python - )) - - -# -# Readmegen -# - - -def _get_repo_root(): - """ Returns the root folder of the project. """ - # Get root of this repository. Assume we don't have directories nested deeper than 10 items. - p = Path(os.getcwd()) - for i in range(10): - if p is None: - break - if Path(p / ".git").exists(): - return str(p) - p = p.parent - raise Exception("Unable to detect repository root.") - - -GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) - - -@nox.session -@nox.parametrize("path", GENERATED_READMES) -def readmegen(session, path): - """(Re-)generates the readme for a sample.""" - session.install("jinja2", "pyyaml") - dir_ = os.path.dirname(path) - - if os.path.exists(os.path.join(dir_, "requirements.txt")): - session.install("-r", os.path.join(dir_, "requirements.txt")) - - in_file = os.path.join(dir_, "README.rst.in") - session.run( - "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file - ) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index dc869903c84..da162e1c91d 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -12,31 +12,11 @@ from pyspark.sql.types import IntegerType, StringType -# Create a SparkSession under the name "setup". Viewable via the Spark UI -spark = SparkSession.builder.appName("setup").getOrCreate() +BUCKET_NAME = sys.argv[1] +TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -bucket_name = sys.argv[1] -upload = True # Whether to upload data to BigQuery - -# Check whether or not results should be uploaded -if len(sys.arv) > 1: - upload = False - print("Not uploading results to BigQuery") -else: - print("Results will be uploaded to BigQuery") - -table = "bigquery-public-data.new_york_citibike.citibike_trips" - -# Check if table exists -try: - df = spark.read.format('bigquery').option('table', table).load() -except Py4JJavaError: - print(f"{table} does not exist. 
") - sys.exit(0) # START MAKING DATA DIRTY - - def random_select(items, weights): '''Picks an item according to the cumulative weights''' return random.choices(items, weights=weights, k=1)[0] @@ -81,6 +61,8 @@ def convert_angle(angle): return random_select([str(angle), new_angle], [0.55, 0.45]) +# This function is nested since a UserDefinedFunction is +# expected to take a single argument def dirty_data(proc_func, allow_none): '''Master function returns a user defined function that transforms the column data''' @@ -99,42 +81,9 @@ def udf(col_value): def id(x): return x +def write_to_bigquery(df): + '''Write a dataframe to BigQuery''' -# Declare data transformations for each column in dataframe -udfs = [ - (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(id, True), StringType()), # starttime - (dirty_data(id, True), StringType()), # stoptime - (id, IntegerType()), # start_station_id - (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convert_angle, True), StringType()), # start_station_latitude - (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (id, IntegerType()), # end_station_id - (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convert_angle, True), StringType()), # end_station_latitude - (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (id, IntegerType()), # bikeid - (dirty_data(user_type, False), StringType()), # usertype - (id, IntegerType()), # birth_year - (dirty_data(gender, False), StringType()), # gender - (id, StringType()), # customer_plan -] - -# Apply dirty transformations to df -names = df.schema.names -new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) - for udf, column, name in zip(udfs, df.columns, names)]) - -# Duplicate about 0.01% of the rows -dup_df = new_df.sample(False, 0.0001, seed=42) - -# Create final dirty dataframe -df = new_df.union(dup_df) -df.sample(False, 0.0001, seed=50).show(n=200) -print("Dataframe sample printed") - -# Write to BigQuery -if upload: # Create BigQuery Dataset client = bigquery.Client() dataset_id = f'{client.project}.new_york_citibike_trips' @@ -143,8 +92,68 @@ def id(x): dataset = client.create_dataset(dataset) # Saving the data to BigQuery - spark.conf.set('temporaryGcsBucket', bucket_name) + spark.conf.set('temporaryGcsBucket', BUCKET_NAME) df.write.format('bigquery') \ .option('table', dataset_id + ".RAW_DATA") \ .save() + +def main(): + # Create a SparkSession under the name "setup". Viewable via the Spark UI + spark = SparkSession.builder.appName("setup").getOrCreate() + + upload = True # Whether to upload data to BigQuery + + # Check whether or not results should be uploaded + if len(sys.argv) > 1: + upload = False + print("Not uploading results to BigQuery") + else: + print("Results will be uploaded to BigQuery") + + # Check if table exists + try: + df = spark.read.format('bigquery').option('table', TABLE).load() + except Py4JJavaError: + print(f"{TABLE} does not exist. 
") + sys.exit(0) + + # Declare data transformations for each column in dataframe + udfs = [ + (dirty_data(trip_duration, True), StringType()), # tripduration + (dirty_data(id, True), StringType()), # starttime + (dirty_data(id, True), StringType()), # stoptime + (id, IntegerType()), # start_station_id + (dirty_data(station_name, False), StringType()), # start_station_name + (dirty_data(convert_angle, True), StringType()), # start_station_latitude + (dirty_data(convert_angle, True), StringType()), # start_station_longitude + (id, IntegerType()), # end_station_id + (dirty_data(station_name, False), StringType()), # end_station_name + (dirty_data(convert_angle, True), StringType()), # end_station_latitude + (dirty_data(convert_angle, True), StringType()), # end_station_longitude + (id, IntegerType()), # bikeid + (dirty_data(user_type, False), StringType()), # usertype + (id, IntegerType()), # birth_year + (dirty_data(gender, False), StringType()), # gender + (id, StringType()), # customer_plan + ] + + # Apply dirty transformations to df + names = df.schema.names + new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) + for udf, column, name in zip(udfs, df.columns, names)]) + + # Duplicate about 0.01% of the rows + dup_df = new_df.sample(False, 0.0001, seed=42) + + # Create final dirty dataframe + df = new_df.union(dup_df) + df.sample(False, 0.0001, seed=50).show(n=200) + print("Dataframe sample printed") + + if upload: + write_to_bigquery(df) + + +if __name__ == '__main__': + main() diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 919dcc4f35c..e62d2cc1355 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -52,7 +52,7 @@ def setup_and_teardown_cluster(): } ], "software_config": { - "image_version": "1.5.4-debian10", + "image_version": "1.4-debian10", "optional_components": [ "ANACONDA" ], @@ -134,9 +134,17 @@ def test_setup(capsys): # Wait for job to complete result = response.result() + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + }) + + cluster_info = cluster_client.get_cluster(PROJECT, REGION, CLUSTER_NAME) + # Get job output - output_location = result.driver_output_resource_uri + ".000000000" - output = BUCKET.blob(output_location).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri + "000000000" # + "driveroutput.000000000" + storage_client = storage.Client() + bucket = storage_client.get_bucket(cluster_info.config.config_bucket) + output = bucket.blob(output_location).download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -173,6 +181,3 @@ def test_setup(capsys): # Missing data assert "null" in out - -def callback(operation_future): - return operation_future.result() From 744f80c805160fc6b72b6c26673f17202168b92a Mon Sep 17 00:00:00 2001 From: vuppalli Date: Mon, 8 Jun 2020 19:12:09 -0400 Subject: [PATCH 05/59] get dataproc job output and fix linting --- .gitignore | 1 + data-science-onramp/data-ingestion/setup.py | 14 ++++++---- .../data-ingestion/setup_test.py | 28 +++++++++---------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index c827e035649..369e7983b52 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ credentials.dat .DS_store env/ .idea +data-science-onramp/data-ingestion/noxfile.py diff --git 
a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index da162e1c91d..09046b75879 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -12,7 +12,7 @@ from pyspark.sql.types import IntegerType, StringType -BUCKET_NAME = sys.argv[1] +BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" @@ -81,7 +81,8 @@ def udf(col_value): def id(x): return x -def write_to_bigquery(df): + +def write_to_bigquery(spark, df): '''Write a dataframe to BigQuery''' # Create BigQuery Dataset @@ -98,6 +99,7 @@ def write_to_bigquery(df): .option('table', dataset_id + ".RAW_DATA") \ .save() + def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() @@ -143,16 +145,16 @@ def main(): new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) for udf, column, name in zip(udfs, df.columns, names)]) + new_df.sample(False, 0.0001, seed=50).show(n=100) + # Duplicate about 0.01% of the rows - dup_df = new_df.sample(False, 0.0001, seed=42) + dup_df = new_df.sample(True, 0.0001, seed=42) # Create final dirty dataframe df = new_df.union(dup_df) - df.sample(False, 0.0001, seed=50).show(n=200) - print("Dataframe sample printed") if upload: - write_to_bigquery(df) + write_to_bigquery(spark, df) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index e62d2cc1355..f55a155bc75 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -3,11 +3,8 @@ import uuid -from google.api_core.exceptions import GoogleAPICallError - from google.cloud import dataproc_v1 as dataproc from google.cloud import storage -from google.cloud.exceptions import NotFound import pytest @@ -52,7 +49,7 @@ def setup_and_teardown_cluster(): } ], "software_config": { - "image_version": "1.4-debian10", + "image_version": "1.5.4-debian10", "optional_components": [ "ANACONDA" ], @@ -96,6 +93,7 @@ def setup_and_teardown_bucket(): bucket = storage_client.get_bucket(BUCKET_NAME) bucket.delete(force=True) + def test_setup(capsys): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -129,22 +127,15 @@ def test_setup(capsys): }) response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, - job=job_details) + job=job_details) # Wait for job to complete result = response.result() - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) - }) - - cluster_info = cluster_client.get_cluster(PROJECT, REGION, CLUSTER_NAME) - # Get job output - output_location = result.driver_output_resource_uri + "000000000" # + "driveroutput.000000000" - storage_client = storage.Client() - bucket = storage_client.get_bucket(cluster_info.config.config_bucket) - output = bucket.blob(output_location).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri + ".000000000" + blob = get_blob_from_path(output_location) + out = blob.download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -181,3 +172,10 @@ def test_setup(capsys): # Missing data assert "null" in out + + +def get_blob_from_path(path): + bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] + bucket = storage.Client().get_bucket(bucket_name) + output_location = 
re.search("google-cloud-dataproc.+", path).group(0) + return bucket.blob(output_location) From 8cd7dc611e877b71b778ba91878e91b439f6c334 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 9 Jun 2020 15:32:02 -0400 Subject: [PATCH 06/59] fix PR comments --- .gitignore | 30 ----- data-science-onramp/data-ingestion/setup.py | 28 ++-- .../data-ingestion/setup_test.py | 125 ++++++++---------- 3 files changed, 73 insertions(+), 110 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 369e7983b52..00000000000 --- a/.gitignore +++ /dev/null @@ -1,30 +0,0 @@ -.coveralls.yml -*.pyc -.coverage -.tox -.pytest_cache -.ipynb_checkpoints -.executed_notebooks -coverage.xml -python-docs-samples.json -service-account.json -client-secrets.json -__pycache__ -*db\.sqlite3 -managed_vms/django_tutorial/static/* -**/migrations/* -lib -testing/resources/test-env.sh -testing/resources/service-account.json -testing/resources/client-secrets.json -secrets.tar -.cache -junit.xml -credentials.dat -.nox -.vscode/ -*sponge_log.xml -.DS_store -env/ -.idea -data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 09046b75879..cf61f81562a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -46,8 +46,8 @@ def user_type(user): def gender(s): '''Manipulates the gender string''' - return random.choice([s, s.upper(), s.lower(), - s[0] if len(s) > 0 else "", + return random.choice([s.upper(), s.lower(), + s[0].upper() if len(s) > 0 else "", s[0].lower() if len(s) > 0 else ""]) @@ -78,7 +78,9 @@ def udf(col_value): return udf -def id(x): +# This function is required because we need to apply a +# function for every column and some columns do not change +def identity(x): return x @@ -118,26 +120,26 @@ def main(): df = spark.read.format('bigquery').option('table', TABLE).load() except Py4JJavaError: print(f"{TABLE} does not exist. 
") - sys.exit(0) + return # Declare data transformations for each column in dataframe udfs = [ (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(id, True), StringType()), # starttime - (dirty_data(id, True), StringType()), # stoptime - (id, IntegerType()), # start_station_id + (dirty_data(identity, True), StringType()), # starttime + (dirty_data(identity, True), StringType()), # stoptime + (identity, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name (dirty_data(convert_angle, True), StringType()), # start_station_latitude (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (id, IntegerType()), # end_station_id + (identity, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name (dirty_data(convert_angle, True), StringType()), # end_station_latitude (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (id, IntegerType()), # bikeid + (identity, IntegerType()), # bikeid (dirty_data(user_type, False), StringType()), # usertype - (id, IntegerType()), # birth_year + (identity, IntegerType()), # birth_year (dirty_data(gender, False), StringType()), # gender - (id, StringType()), # customer_plan + (identity, StringType()), # customer_plan ] # Apply dirty transformations to df @@ -145,10 +147,10 @@ def main(): new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) for udf, column, name in zip(udfs, df.columns, names)]) - new_df.sample(False, 0.0001, seed=50).show(n=100) + new_df.sample(False, 0.0001).show(n=100) # Duplicate about 0.01% of the rows - dup_df = new_df.sample(True, 0.0001, seed=42) + dup_df = new_df.sample(True, 0.0001) # Create final dirty dataframe df = new_df.union(dup_df) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index f55a155bc75..aab08230028 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -12,24 +12,36 @@ # Set global variables PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" -ZONE = "us-central1-a" CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' - -BUCKET = None +DESTINATION_BLOB_NAME = "setup.py" +JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' +JOB_DETAILS = { # Job configuration + 'placement': { + 'cluster_name': CLUSTER_NAME + }, + 'pyspark_job': { + 'main_python_file_uri': JOB_FILE_NAME, + 'args': [ + BUCKET_NAME, + "--test", + ], + "jar_file_uris": [ + "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" + ], + }, +} @pytest.fixture(autouse=True) def setup_and_teardown_cluster(): # Create cluster configuration - zone_uri = \ - f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}' cluster_data = { 'project_id': PROJECT, 'cluster_name': CLUSTER_NAME, 'config': { 'gce_cluster_config': { - 'zone_uri': zone_uri, + 'zone_uri': '', "metadata": { "PIP_PACKAGES": "google-cloud-storage" }, @@ -59,9 +71,8 @@ def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) # Wait for cluster to provision @@ -70,10 +81,6 @@ def setup_and_teardown_cluster(): yield # Delete cluster - cluster_client = 
dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' - }) - operation = cluster_client.delete_cluster(PROJECT, REGION, CLUSTER_NAME) operation.result() @@ -81,53 +88,41 @@ def setup_and_teardown_cluster(): @pytest.fixture(autouse=True) def setup_and_teardown_bucket(): - global BUCKET # Create GCS Bucket storage_client = storage.Client() - BUCKET = storage_client.create_bucket(BUCKET_NAME) + bucket = storage_client.create_bucket(BUCKET_NAME) + + # Upload file + blob = bucket.blob(DESTINATION_BLOB_NAME) + blob.upload_from_filename("setup.py") yield # Delete GCS bucket - storage_client = storage.Client() bucket = storage_client.get_bucket(BUCKET_NAME) bucket.delete(force=True) -def test_setup(capsys): - '''Tests setup.py by submitting it to a dataproc cluster''' +def get_blob_from_path(path): + bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] + bucket = storage.Client().get_bucket(bucket_name) + output_location = re.search("google-cloud-dataproc.+", path).group(0) + return bucket.blob(output_location) - # Upload file - destination_blob_name = "setup.py" - blob = BUCKET.blob(destination_blob_name) - blob.upload_from_filename("setup.py") - job_file_name = "gs://" + BUCKET_NAME + "/setup.py" - - # Create job configuration - job_details = { - 'placement': { - 'cluster_name': CLUSTER_NAME - }, - 'pyspark_job': { - 'main_python_file_uri': job_file_name, - 'args': [ - BUCKET_NAME, - "--test", - ], - "jar_file_uris": [ - "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" - ], - }, - } +def is_in_table(value, out): + return re.search(f"\| *{value}\|", out) + + +def test_setup(): + '''Tests setup.py by submitting it to a dataproc cluster''' # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, - job=job_details) + job=JOB_DETAILS) # Wait for job to complete result = response.result() @@ -150,32 +145,28 @@ def test_setup(capsys): assert re.search("20[0-9][0-9]\\|", out) # gender - assert "M" in out - assert "male" in out - assert "MALE" in out - assert "F" in out - assert "female" in out - assert "FEMALE" in out - assert "u" in out - assert "unknown" in out - assert "UNKNOWN" in out + assert is_in_table("M", out) + assert is_in_table("m", out) + assert is_in_table("male", out) + assert is_in_table("MALE", out) + assert is_in_table("F", out) + assert is_in_table("f", out) + assert is_in_table("female", out) + assert is_in_table("FEMALE", out) + assert is_in_table("U", out) + assert is_in_table("u", out) + assert is_in_table("unknown", out) + assert is_in_table("UNKNOWN", out) # customer_plan - assert "Subscriber" in out - assert "subscriber" in out - assert "SUBSCRIBER" in out - assert "sub" in out - assert "Customer" in out - assert "customer" in out - assert "CUSTOMER" in out - assert "cust" in out + assert is_in_table("Subscriber", out) + assert is_in_table("subscriber", out) + assert is_in_table("SUBSCRIBER", out) + assert is_in_table("sub", out) + assert is_in_table("Customer", out) + assert is_in_table("customer", out) + assert is_in_table("CUSTOMER", out) + assert is_in_table("cust", out) # Missing data - assert "null" in out - - -def get_blob_from_path(path): - bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] - bucket = storage.Client().get_bucket(bucket_name) - 
output_location = re.search("google-cloud-dataproc.+", path).group(0) - return bucket.blob(output_location) + assert is_in_table("null", out) From 81265d29c7b7e613cdb247871c89899833d89694 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 9 Jun 2020 16:01:52 -0400 Subject: [PATCH 07/59] linting and global vars --- .../data-ingestion/setup_test.py | 68 +++++++++---------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index aab08230028..e9358de912c 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -31,49 +31,47 @@ ], }, } - - -@pytest.fixture(autouse=True) -def setup_and_teardown_cluster(): - # Create cluster configuration - cluster_data = { - 'project_id': PROJECT, - 'cluster_name': CLUSTER_NAME, - 'config': { - 'gce_cluster_config': { - 'zone_uri': '', - "metadata": { - "PIP_PACKAGES": "google-cloud-storage" - }, +CLUSTER_DATA = { # Create cluster configuration + 'project_id': PROJECT, + 'cluster_name': CLUSTER_NAME, + 'config': { + 'gce_cluster_config': { + 'zone_uri': '', + "metadata": { + "PIP_PACKAGES": "google-cloud-storage" }, - 'master_config': { - 'num_instances': 1, - 'machine_type_uri': 'n1-standard-8' - }, - 'worker_config': { - 'num_instances': 6, - 'machine_type_uri': 'n1-standard-8' - }, - "initialization_actions": [ - { - "executable_file": ("gs://dataproc-initialization-actions/" - "python/pip-install.sh"), - } - ], - "software_config": { - "image_version": "1.5.4-debian10", - "optional_components": [ - "ANACONDA" - ], + }, + 'master_config': { + 'num_instances': 1, + 'machine_type_uri': 'n1-standard-8' + }, + 'worker_config': { + 'num_instances': 6, + 'machine_type_uri': 'n1-standard-8' + }, + "initialization_actions": [ + { + "executable_file": ("gs://dataproc-initialization-actions/" + "python/pip-install.sh"), } + ], + "software_config": { + "image_version": "1.5.4-debian10", + "optional_components": [ + "ANACONDA" + ], } } +} + +@pytest.fixture(autouse=True) +def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) + operation = cluster_client.create_cluster(PROJECT, REGION, CLUSTER_DATA) # Wait for cluster to provision operation.result() @@ -111,7 +109,7 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\| *{value}\|", out) + return re.search(f"\\| *{value}\\|", out) def test_setup(): From 3e86bda7c1da37c4201792b792003baa2147559c Mon Sep 17 00:00:00 2001 From: vuppalli Date: Wed, 10 Jun 2020 11:27:11 -0400 Subject: [PATCH 08/59] address Brad PR comments --- data-science-onramp/data-ingestion/setup.py | 34 +++++++------------ .../data-ingestion/setup_test.py | 3 -- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index cf61f81562a..7f5efa28e0a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,13 +1,10 @@ import random import sys - from time import time_ns from google.cloud import bigquery - from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession - from pyspark.sql.functions import UserDefinedFunction from pyspark.sql.types import IntegerType, StringType @@ 
-56,7 +53,7 @@ def convert_angle(angle): degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) - new_angle = str(degrees) + u"\u00B0" + \ + new_angle = str(degrees) + "\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' return random_select([str(angle), new_angle], [0.55, 0.45]) @@ -78,13 +75,7 @@ def udf(col_value): return udf -# This function is required because we need to apply a -# function for every column and some columns do not change -def identity(x): - return x - - -def write_to_bigquery(spark, df): +def write_to_bigquery(df): '''Write a dataframe to BigQuery''' # Create BigQuery Dataset @@ -95,10 +86,9 @@ def write_to_bigquery(spark, df): dataset = client.create_dataset(dataset) # Saving the data to BigQuery - spark.conf.set('temporaryGcsBucket', BUCKET_NAME) - df.write.format('bigquery') \ .option('table', dataset_id + ".RAW_DATA") \ + .option("temporaryGcsBucket", BUCKET_NAME) \ .save() @@ -109,7 +99,7 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if len(sys.argv) > 1: + if len(sys.argv) > 2: upload = False print("Not uploading results to BigQuery") else: @@ -125,21 +115,21 @@ def main(): # Declare data transformations for each column in dataframe udfs = [ (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(identity, True), StringType()), # starttime - (dirty_data(identity, True), StringType()), # stoptime - (identity, IntegerType()), # start_station_id + (dirty_data(lambda x: x, True), StringType()), # starttime + (dirty_data(lambda x: x, True), StringType()), # stoptime + (lambda x: x, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name (dirty_data(convert_angle, True), StringType()), # start_station_latitude (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (identity, IntegerType()), # end_station_id + (lambda x: x, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name (dirty_data(convert_angle, True), StringType()), # end_station_latitude (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (identity, IntegerType()), # bikeid + (lambda x: x, IntegerType()), # bikeid (dirty_data(user_type, False), StringType()), # usertype - (identity, IntegerType()), # birth_year + (lambda x: x, IntegerType()), # birth_year (dirty_data(gender, False), StringType()), # gender - (identity, StringType()), # customer_plan + (lambda x: x, StringType()), # customer_plan ] # Apply dirty transformations to df @@ -156,7 +146,7 @@ def main(): df = new_df.union(dup_df) if upload: - write_to_bigquery(spark, df) + write_to_bigquery(df) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index e9358de912c..2aa82535d79 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -1,14 +1,11 @@ import os import re - import uuid from google.cloud import dataproc_v1 as dataproc from google.cloud import storage - import pytest - # Set global variables PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" From 580c8e1078e9480ef30d3083522fd2c467c4f1b1 Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Thu, 11 Jun 2020 11:45:10 -0400 Subject: [PATCH 09/59] broken clean.py --- data-science-onramp/data-processing/clean.py | 44 ++++++++++++++++++++ 1 file changed, 
44 insertions(+) create mode 100644 data-science-onramp/data-processing/clean.py diff --git a/data-science-onramp/data-processing/clean.py b/data-science-onramp/data-processing/clean.py new file mode 100644 index 00000000000..0bca32d3299 --- /dev/null +++ b/data-science-onramp/data-processing/clean.py @@ -0,0 +1,44 @@ +import os +import sys + +from py4j.protocol import Py4JJavaError +from pyspark.sql import SparkSession +from pyspark.sql.functions import UserDefinedFunction, lit +from pyspark.sql.types import IntegerType, StringType + + +PROJECT_ID = sys.argv[1] +BUCKET_NAME = sys.argv[2] +TABLE = f'{PROJECT_ID}.new_york_citibike_trips.RAW_DATA' + +def station_name(name): + if name: + return name.replace('/', '&') + else: + return '' + +def main(): + '''...''' + # Create a SparkSession under the name 'clean'. Viewable via the Spark UI + spark = SparkSession.builder.appName('clean').getOrCreate() + + # Check if table exists + + try: + df = spark.read.format('bigquery').option('table', TABLE).load() + except Py4JJavaError: + print(f"{TABLE} does not exist. ") + return + + udf_map = { + 'start_station_name': (station_name, StringType()) + } + + for name, (func, col_type) in udf_map.items(): + df = df.withColumn(name, UserDefinedFunction(func, col_type)(name).alias(name)) + + df = spark.createDataframe + df.show(n=100) + +if __name__ == '__main__': + main() \ No newline at end of file From 4ed5a157a9b2492602c5e2925a8b3d30376215d8 Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Thu, 11 Jun 2020 11:49:49 -0400 Subject: [PATCH 10/59] Revert "broken clean.py" This reverts commit 580c8e1078e9480ef30d3083522fd2c467c4f1b1. --- data-science-onramp/data-processing/clean.py | 44 -------------------- 1 file changed, 44 deletions(-) delete mode 100644 data-science-onramp/data-processing/clean.py diff --git a/data-science-onramp/data-processing/clean.py b/data-science-onramp/data-processing/clean.py deleted file mode 100644 index 0bca32d3299..00000000000 --- a/data-science-onramp/data-processing/clean.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import sys - -from py4j.protocol import Py4JJavaError -from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction, lit -from pyspark.sql.types import IntegerType, StringType - - -PROJECT_ID = sys.argv[1] -BUCKET_NAME = sys.argv[2] -TABLE = f'{PROJECT_ID}.new_york_citibike_trips.RAW_DATA' - -def station_name(name): - if name: - return name.replace('/', '&') - else: - return '' - -def main(): - '''...''' - # Create a SparkSession under the name 'clean'. Viewable via the Spark UI - spark = SparkSession.builder.appName('clean').getOrCreate() - - # Check if table exists - - try: - df = spark.read.format('bigquery').option('table', TABLE).load() - except Py4JJavaError: - print(f"{TABLE} does not exist. 
") - return - - udf_map = { - 'start_station_name': (station_name, StringType()) - } - - for name, (func, col_type) in udf_map.items(): - df = df.withColumn(name, UserDefinedFunction(func, col_type)(name).alias(name)) - - df = spark.createDataframe - df.show(n=100) - -if __name__ == '__main__': - main() \ No newline at end of file From e6fe99dfbef9c2377335b52674ed41d41178696e Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 16 Jun 2020 11:29:46 -0400 Subject: [PATCH 11/59] optimize data ingestion --- data-science-onramp/data-ingestion/setup.py | 109 +++++++++--------- .../data-ingestion/setup_test.py | 14 +-- 2 files changed, 61 insertions(+), 62 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 7f5efa28e0a..33c7c728733 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,41 +1,43 @@ import random import sys -from time import time_ns from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction +from pyspark.sql.functions import UserDefinedFunction, when, expr from pyspark.sql.types import IntegerType, StringType BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" +RAW_DATASET_NAME = "new_york_citibike_trips5" +RAW_TABLE_NAME = "RAW_DATA" # START MAKING DATA DIRTY -def random_select(items, weights): - '''Picks an item according to the cumulative weights''' - return random.choices(items, weights=weights, k=1)[0] - - def trip_duration(duration): '''Converts trip duration to other units''' + if duration is None: + return None seconds = str(duration) + " s" minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" - return random_select([seconds, minutes, hours, + return random.choices([seconds, minutes, hours, str(random.randint(-1000, -1))], - [0.3, 0.3, 0.3, 0.1]) + weights=[0.3, 0.3, 0.3, 0.1])[0] def station_name(name): '''Replaces '&' with '/' with a 50% chance''' + if name is None: + return None return random.choice([name, name.replace("&", "/")]) def user_type(user): '''Manipulates the user type string''' + if user is None: + return None return random.choice([user, user.upper(), user.lower(), "sub" if user == "Subscriber" else user, "cust" if user == "Customer" else user]) @@ -43,6 +45,8 @@ def user_type(user): def gender(s): '''Manipulates the gender string''' + if s is None: + return None return random.choice([s.upper(), s.lower(), s[0].upper() if len(s) > 0 else "", s[0].lower() if len(s) > 0 else ""]) @@ -50,29 +54,15 @@ def gender(s): def convert_angle(angle): '''Converts long and lat to DMS notation''' + if angle is None: + return None degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) new_angle = str(degrees) + "\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' - return random_select([str(angle), new_angle], [0.55, 0.45]) - - -# This function is nested since a UserDefinedFunction is -# expected to take a single argument -def dirty_data(proc_func, allow_none): - '''Master function returns a user defined function - that transforms the column data''' - def udf(col_value): - random.seed(hash(col_value) + time_ns()) - if col_value is None: - return col_value - elif allow_none: - return random_select([None, proc_func(col_value)], - [0.05, 0.95]) - else: - return proc_func(col_value) - return udf + return 
random.choices([str(angle), new_angle], + weights=[0.55, 0.45])[0] def write_to_bigquery(df): @@ -80,17 +70,19 @@ def write_to_bigquery(df): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f'{client.project}.new_york_citibike_trips' + dataset_id = f'{client.project}.{RAW_DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', dataset_id + ".RAW_DATA") \ + .option('table', dataset_id + f".{RAW_TABLE_NAME}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() + print("Table successfully written to BigQuery") + def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI @@ -112,42 +104,49 @@ def main(): print(f"{TABLE} does not exist. ") return - # Declare data transformations for each column in dataframe - udfs = [ - (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(lambda x: x, True), StringType()), # starttime - (dirty_data(lambda x: x, True), StringType()), # stoptime - (lambda x: x, IntegerType()), # start_station_id - (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convert_angle, True), StringType()), # start_station_latitude - (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (lambda x: x, IntegerType()), # end_station_id - (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convert_angle, True), StringType()), # end_station_latitude - (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (lambda x: x, IntegerType()), # bikeid - (dirty_data(user_type, False), StringType()), # usertype - (lambda x: x, IntegerType()), # birth_year - (dirty_data(gender, False), StringType()), # gender - (lambda x: x, StringType()), # customer_plan + # Declare dictionary with keys column names and values user defined + # functions and return types + udf_map = { + 'tripduration': (trip_duration, StringType()), + 'start_station_name': (station_name, StringType()), + 'start_station_latitude': (convert_angle, StringType()), + 'start_station_longitude': (convert_angle, StringType()), + 'end_station_name': (station_name, StringType()), + 'end_station_latitude': (convert_angle, StringType()), + 'end_station_longitude': (convert_angle, StringType()), + 'usertype': (user_type, StringType()), + 'gender': (gender, StringType()), + } + + # Declare which columns to set some values to null randomly + null_columns = [ + 'tripduration', + 'starttime', + 'stoptime', + 'start_station_latitude', + 'start_station_longitude', + 'end_station_latitude', + 'end_station_longitude', ] - # Apply dirty transformations to df - names = df.schema.names - new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) - for udf, column, name in zip(udfs, df.columns, names)]) + # Dirty the columns + for name, udf in udf_map.items(): + df = df.withColumn(name, UserDefinedFunction(*udf)(name)) - new_df.sample(False, 0.0001).show(n=100) + # Randomly set about 5% of the values in some columns to null + for name in null_columns: + df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows - dup_df = new_df.sample(True, 0.0001) + dup_df = df.sample(True, 0.0001) # Create final dirty dataframe - df = new_df.union(dup_df) + df = df.union(dup_df) if upload: write_to_bigquery(df) - + else: + df.sample(True, 0.0001).show(n=500, truncate=False) if __name__ == 
'__main__': main() diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 2aa82535d79..8fb1938c843 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -106,7 +106,7 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\\| *{value}\\|", out) + return re.search(f"\\|{value} *\\|", out) def test_setup(): @@ -128,16 +128,16 @@ def test_setup(): out = blob.download_as_string().decode("utf-8") # tripDuration - assert re.search("[0-9] s", out) - assert re.search("[0-9] m", out) - assert re.search("[0-9] h", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) h", out) # station latitude & longitude - assert re.search(u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) # birth_year - assert re.search("19[0-9][0-9]\\|", out) - assert re.search("20[0-9][0-9]\\|", out) + assert is_in_table("19[0-9][0-9]", out) + assert is_in_table("20[0-9][0-9]", out) # gender assert is_in_table("M", out) From 540acaae2d61907de35fa31bce089c832f8603d4 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 16 Jun 2020 11:54:59 -0400 Subject: [PATCH 12/59] fix linting errors --- data-science-onramp/data-ingestion/setup.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 33c7c728733..7308d13a37e 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -4,8 +4,8 @@ from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction, when, expr -from pyspark.sql.types import IntegerType, StringType +from pyspark.sql.functions import expr, UserDefinedFunction, when +from pyspark.sql.types import StringType BUCKET_NAME = sys.argv[1] @@ -23,8 +23,8 @@ def trip_duration(duration): minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" return random.choices([seconds, minutes, hours, - str(random.randint(-1000, -1))], - weights=[0.3, 0.3, 0.3, 0.1])[0] + str(random.randint(-1000, -1))], + weights=[0.3, 0.3, 0.3, 0.1])[0] def station_name(name): @@ -117,7 +117,7 @@ def main(): 'usertype': (user_type, StringType()), 'gender': (gender, StringType()), } - + # Declare which columns to set some values to null randomly null_columns = [ 'tripduration', @@ -148,5 +148,6 @@ def main(): else: df.sample(True, 0.0001).show(n=500, truncate=False) + if __name__ == '__main__': main() From a7e29723f004b3cc3c4092b26eb90286b6ee7318 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 16 Jun 2020 18:21:20 -0400 Subject: [PATCH 13/59] fix minor style issues --- data-science-onramp/data-ingestion/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 7308d13a37e..b142aa8f37a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -10,7 +10,7 @@ BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -RAW_DATASET_NAME = "new_york_citibike_trips5" +RAW_DATASET_NAME = "new_york_citibike_trips" RAW_TABLE_NAME = "RAW_DATA" @@ -77,7 +77,7 @@ def 
write_to_bigquery(df): # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', dataset_id + f".{RAW_TABLE_NAME}") \ + .option('table', f"{dataset_id}.{RAW_TABLE_NAME}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() From 3e5ba3bb464a8aaa0bbd9d9edeb420261e01aebb Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 19 Jun 2020 17:47:23 -0400 Subject: [PATCH 14/59] remove pip from cluster config --- data-science-onramp/data-ingestion/setup_test.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 8fb1938c843..d8def350c8e 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -34,9 +34,6 @@ 'config': { 'gce_cluster_config': { 'zone_uri': '', - "metadata": { - "PIP_PACKAGES": "google-cloud-storage" - }, }, 'master_config': { 'num_instances': 1, @@ -46,12 +43,6 @@ 'num_instances': 6, 'machine_type_uri': 'n1-standard-8' }, - "initialization_actions": [ - { - "executable_file": ("gs://dataproc-initialization-actions/" - "python/pip-install.sh"), - } - ], "software_config": { "image_version": "1.5.4-debian10", "optional_components": [ From 21061531d2c710f05ebe9e17910724e18bfedbc9 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 26 Jun 2020 19:24:21 -0400 Subject: [PATCH 15/59] load external datasets from url --- data-science-onramp/data-ingestion/setup.py | 74 ++++++++++++++++--- .../data-ingestion/setup_test.py | 13 +++- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index b142aa8f37a..06b8ce00689 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,17 +1,37 @@ import random import sys +import pandas as pd from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession from pyspark.sql.functions import expr, UserDefinedFunction, when -from pyspark.sql.types import StringType +from pyspark.sql.types import FloatType, StringType, StructField, StructType BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -RAW_DATASET_NAME = "new_york_citibike_trips" -RAW_TABLE_NAME = "RAW_DATA" +DATASET_NAME = "data_science_onramp" +RAW_TABLE_NAME = "new_york_citibike_trips" +EXTERNAL_DATASETS = { + "gas_prices": { + "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", + "schema": StructType([ + StructField("Date", StringType(), True), + StructField("New_York_State_Average_USD_per_Gal", + FloatType(), True), + StructField("Albany_Average_USD_per_Gal", FloatType(), True), + StructField("Blinghamton_Average_USD_per_Gal", FloatType(), True), + StructField("Buffalo_Average_USD_per_Gal", FloatType(), True), + StructField("Nassau_Average_USD_per_Gal", FloatType(), True), + StructField("New_York_City_Average_USD_per_Gal", + FloatType(), True), + StructField("Rochester_Average_USD_per_Gal", FloatType(), True), + StructField("Syracuse_Average_USD_per_Gal", FloatType(), True), + StructField("Utica_Average_USD_per_Gal", FloatType(), True), + ]), + }, +} # START MAKING DATA DIRTY @@ -65,23 +85,39 @@ def convert_angle(angle): weights=[0.55, 0.45])[0] -def write_to_bigquery(df): - '''Write a dataframe to BigQuery''' - +def create_bigquery_dataset(): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f'{client.project}.{RAW_DATASET_NAME}' + 
dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) + +def write_to_bigquery(df, table_name): + '''Write a dataframe to BigQuery''' + client = bigquery.Client() + dataset_id = f'{client.project}.{DATASET_NAME}' + # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', f"{dataset_id}.{RAW_TABLE_NAME}") \ + .option('table', f"{dataset_id}.{table_name}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() - print("Table successfully written to BigQuery") + print(f"Table {table_name} successfully written to BigQuery") + + +def print_df(df, table_name): + '''Print 20 rows from dataframe and a random sample''' + # first 100 rows for smaller tables + df.show() + + # random sample for larger tables + # for small tables this will be empty + df.sample(True, 0.0001).show(n=500, truncate=False) + + print(f"Table {table_name} printed") def main(): @@ -91,12 +127,25 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if len(sys.argv) > 2: + if '--test' in sys.argv: upload = False print("Not uploading results to BigQuery") else: + create_bigquery_dataset() print("Results will be uploaded to BigQuery") + # Ingest External Datasets + + for table_name, data in EXTERNAL_DATASETS.items(): + print(f'Creating dataframe for {table_name}') + df = spark.createDataFrame(pd.read_csv(data["url"]), + schema=data["schema"]) + + if upload: + write_to_bigquery(df, table_name) + else: + print_df(df, table_name) + # Check if table exists try: df = spark.read.format('bigquery').option('table', TABLE).load() @@ -135,6 +184,7 @@ def main(): # Randomly set about 5% of the values in some columns to null for name in null_columns: + df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows @@ -144,9 +194,9 @@ def main(): df = df.union(dup_df) if upload: - write_to_bigquery(df) + write_to_bigquery(df, RAW_TABLE_NAME) else: - df.sample(True, 0.0001).show(n=500, truncate=False) + print_df(df, RAW_TABLE_NAME) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index d8def350c8e..baec10a79a5 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -13,6 +13,10 @@ BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' +TABLE_NAMES = [ + "new_york_citibike_trips", + "gas_prices", +] JOB_DETAILS = { # Job configuration 'placement': { 'cluster_name': CLUSTER_NAME @@ -97,9 +101,12 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\\|{value} *\\|", out) + return re.search(f"\\| *{value} *\\|", out) +def table_printed(table_name, out): + return re.search(f"Table {table_name} printed", out) + def test_setup(): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -118,6 +125,10 @@ def test_setup(): blob = get_blob_from_path(output_location) out = blob.download_as_string().decode("utf-8") + # check that tables were printed + for table_name in TABLE_NAMES: + assert table_printed(table_name, out) + # tripDuration assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) assert is_in_table("(\\d+(?:\\.\\d+)?) 
min", out) From 9febbad28873cfd3010b9a28dd037a71e4a36c4d Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Tue, 7 Jul 2020 12:54:45 -0400 Subject: [PATCH 16/59] added dry-run flag --- data-science-onramp/data-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 06b8ce00689..bfa22087c39 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -127,7 +127,7 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if '--test' in sys.argv: + if '--dry-run' in sys.argv: upload = False print("Not uploading results to BigQuery") else: From 5d56b9777771406bf73f7bcdf1579a4df732e9c5 Mon Sep 17 00:00:00 2001 From: Symmetries Date: Wed, 8 Jul 2020 12:29:46 -0400 Subject: [PATCH 17/59] dry-run flag --- data-science-onramp/data-ingestion/setup_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index baec10a79a5..a8fbe0d014d 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -25,7 +25,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, - "--test", + "--dry-run", ], "jar_file_uris": [ "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" From 22be5d3f8496da0f61f89107faa03e9536923e2b Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 9 Jul 2020 19:00:28 -0400 Subject: [PATCH 18/59] address some review comments --- data-science-onramp/data-ingestion/setup.py | 57 +++++++++---------- .../data-ingestion/setup_test.py | 17 ++++-- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index bfa22087c39..ecffd628b50 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,3 +1,12 @@ +"""Setup Dataproc job for Data Science Onramp Sample Application +This job ingests an external gas prices in NY dataset as well as +takes a New York Citibike dataset available on BigQuery and +"dirties" the dataset before uploading it back to BigQuery +It needs the following arguments +* the name of the Google Cloud Storage bucket to be used +* an optional --test flag to upload a subset of the dataset for testing +""" + import random import sys import pandas as pd @@ -37,11 +46,11 @@ # START MAKING DATA DIRTY def trip_duration(duration): '''Converts trip duration to other units''' - if duration is None: + if not duration: return None - seconds = str(duration) + " s" - minutes = str(float(duration) / 60) + " min" - hours = str(float(duration) / 3600) + " h" + seconds = f"{str(duration)} s" + minutes = f"{str(float(duration) / 60)} min" + hours = f"{str(float(duration) / 3600)} h" return random.choices([seconds, minutes, hours, str(random.randint(-1000, -1))], weights=[0.3, 0.3, 0.3, 0.1])[0] @@ -49,14 +58,14 @@ def trip_duration(duration): def station_name(name): '''Replaces '&' with '/' with a 50% chance''' - if name is None: + if not name: return None return random.choice([name, name.replace("&", "/")]) def user_type(user): '''Manipulates the user type string''' - if user is None: + if not user: return None return random.choice([user, user.upper(), user.lower(), "sub" if user == "Subscriber" else user, @@ -65,7 +74,7 @@ def 
user_type(user): def gender(s): '''Manipulates the gender string''' - if s is None: + if not s: return None return random.choice([s.upper(), s.lower(), s[0].upper() if len(s) > 0 else "", @@ -108,28 +117,16 @@ def write_to_bigquery(df, table_name): print(f"Table {table_name} successfully written to BigQuery") -def print_df(df, table_name): - '''Print 20 rows from dataframe and a random sample''' - # first 100 rows for smaller tables - df.show() - - # random sample for larger tables - # for small tables this will be empty - df.sample(True, 0.0001).show(n=500, truncate=False) - - print(f"Table {table_name} printed") - - def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() - upload = True # Whether to upload data to BigQuery + test = False # Whether we are running the job as a test - # Check whether or not results should be uploaded - if '--dry-run' in sys.argv: - upload = False - print("Not uploading results to BigQuery") + # Check whether or not the job is running as a test + if '--test' in sys.argv: + test = True + print("Subset of whole dataset will be uploaded to BigQuery") else: create_bigquery_dataset() print("Results will be uploaded to BigQuery") @@ -141,10 +138,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - if upload: - write_to_bigquery(df, table_name) - else: - print_df(df, table_name) + write_to_bigquery(df, table_name) # Check if table exists try: @@ -184,7 +178,6 @@ def main(): # Randomly set about 5% of the values in some columns to null for name in null_columns: - df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows @@ -193,10 +186,12 @@ def main(): # Create final dirty dataframe df = df.union(dup_df) - if upload: + if not test: write_to_bigquery(df, RAW_TABLE_NAME) else: - print_df(df, RAW_TABLE_NAME) + # df.sample(True, 0.0001).show(n=500, truncate=False) + # Upload 0.001% of the table (about 600 rows) + write_to_bigquery(df.sample(False, 0.00001)) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index a8fbe0d014d..a0ae6fb2814 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -1,3 +1,9 @@ +"""Test file for the setup job in the Data Science Onramp sample application +Creates a test Dataproc cluster and runs the job with a --test flag. +The job uploads a subset of the data to BigQuery. +Then, data is pulled from BigQuery and checks are made to see if the data is dirty. 
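For a rough sense of what "dirty" means here, an illustrative sketch only (the table reference and the pattern below are placeholders, not taken from this patch):

    import re
    from google.cloud import bigquery

    rows = bigquery.Client().query(
        "SELECT tripduration FROM `my-project.my_dataset.RAW_DATA`").result()
    # setup.py rewrites some durations into strings such as "42.5 min",
    # so at least one row is expected to match a pattern of this shape.
    assert any(row["tripduration"] and re.match(r"\A\d+(\.\d+)? min\Z", row["tripduration"])
               for row in rows)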
+""" + import os import re import uuid @@ -25,7 +31,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, - "--dry-run", + "--test", ], "jar_file_uris": [ "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" @@ -104,8 +110,9 @@ def is_in_table(value, out): return re.search(f"\\| *{value} *\\|", out) -def table_printed(table_name, out): - return re.search(f"Table {table_name} printed", out) +def table_uploaded(table_name, out): + return re.search(f"Table {table_name} successfully written to BigQuery", out) + def test_setup(): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -125,9 +132,9 @@ def test_setup(): blob = get_blob_from_path(output_location) out = blob.download_as_string().decode("utf-8") - # check that tables were printed + # Check if table upload success message was printed for table_name in TABLE_NAMES: - assert table_printed(table_name, out) + assert table_uploaded(table_name, out) # tripDuration assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) From f040542e5274af839924520ce65a01ec8443eab3 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 14 Jul 2020 17:22:26 -0400 Subject: [PATCH 19/59] optimize setup test --- .../data-ingestion/requirements.txt | 3 +- data-science-onramp/data-ingestion/setup.py | 34 ++--- data-science-onramp/data-ingestion/setup.sh | 2 +- .../data-ingestion/setup_test.py | 116 ++++++++++-------- 4 files changed, 86 insertions(+), 69 deletions(-) mode change 100644 => 100755 data-science-onramp/data-ingestion/setup.sh diff --git a/data-science-onramp/data-ingestion/requirements.txt b/data-science-onramp/data-ingestion/requirements.txt index f435423c623..e0328e4aec9 100644 --- a/data-science-onramp/data-ingestion/requirements.txt +++ b/data-science-onramp/data-ingestion/requirements.txt @@ -3,4 +3,5 @@ google-auth==1.16.0 google-auth-httplib2==0.0.3 google-cloud==0.34.0 google-cloud-storage==1.28.1 -google-cloud-dataproc==0.8.0 \ No newline at end of file +google-cloud-dataproc==0.8.0 +google-cloud-bigquery==1.25.0 diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index ecffd628b50..bdad93720d2 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -4,14 +4,15 @@ "dirties" the dataset before uploading it back to BigQuery It needs the following arguments * the name of the Google Cloud Storage bucket to be used +* the name of the BigQuery dataset to be created * an optional --test flag to upload a subset of the dataset for testing """ import random import sys -import pandas as pd from google.cloud import bigquery +import pandas as pd from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession from pyspark.sql.functions import expr, UserDefinedFunction, when @@ -19,10 +20,10 @@ BUCKET_NAME = sys.argv[1] +DATASET_NAME = sys.argv[2] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -DATASET_NAME = "data_science_onramp" -RAW_TABLE_NAME = "new_york_citibike_trips" -EXTERNAL_DATASETS = { +CITIBIKE_TABLE_NAME = "new_york_citibike_trips" +EXTERNAL_TABLES = { "gas_prices": { "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", "schema": StructType([ @@ -111,7 +112,6 @@ def write_to_bigquery(df, table_name): # Saving the data to BigQuery df.write.format('bigquery') \ .option('table', f"{dataset_id}.{table_name}") \ - .option("temporaryGcsBucket", BUCKET_NAME) \ .save() print(f"Table {table_name} successfully written to BigQuery") @@ -121,20 +121,22 @@ def main(): # Create a SparkSession under the 
name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() - test = False # Whether we are running the job as a test + spark.conf.set('temporaryGcsBucket', BUCKET_NAME) + + create_bigquery_dataset() + + # Whether we are running the job as a test + test = False # Check whether or not the job is running as a test if '--test' in sys.argv: test = True - print("Subset of whole dataset will be uploaded to BigQuery") + print("A subset of the whole dataset will be uploaded to BigQuery") else: - create_bigquery_dataset() print("Results will be uploaded to BigQuery") # Ingest External Datasets - - for table_name, data in EXTERNAL_DATASETS.items(): - print(f'Creating dataframe for {table_name}') + for table_name, data in EXTERNAL_TABLES.items(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) @@ -143,6 +145,8 @@ def main(): # Check if table exists try: df = spark.read.format('bigquery').option('table', TABLE).load() + if test: + df = df.sample(False, 0.00001) except Py4JJavaError: print(f"{TABLE} does not exist. ") return @@ -186,12 +190,8 @@ def main(): # Create final dirty dataframe df = df.union(dup_df) - if not test: - write_to_bigquery(df, RAW_TABLE_NAME) - else: - # df.sample(True, 0.0001).show(n=500, truncate=False) - # Upload 0.001% of the table (about 600 rows) - write_to_bigquery(df.sample(False, 0.00001)) + print('Uploading citibike dataset...') + write_to_bigquery(df, CITIBIKE_TABLE_NAME) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh old mode 100644 new mode 100755 index f78c8cd120b..336f3da729d --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} + setup.py -- ${BUCKET_NAME} data_science_onramp_test_six --test diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index a0ae6fb2814..7b0f0bc6be5 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -10,13 +10,17 @@ from google.cloud import dataproc_v1 as dataproc from google.cloud import storage +from google.cloud import bigquery import pytest # Set global variables +ID = uuid.uuid4() + PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" -CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' -BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' +CLUSTER_NAME = f'setup-test-{ID}' +BUCKET_NAME = f'setup-test-{ID}' +DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ @@ -31,6 +35,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, + DATASET_NAME, "--test", ], "jar_file_uris": [ @@ -99,6 +104,17 @@ def setup_and_teardown_bucket(): bucket.delete(force=True) +@pytest.fixture(autouse=True) +def setup_and_teardown_bq_dataset(): + # Dataset is created by the client + bq_client = bigquery.Client(project=PROJECT) + + yield + + # Delete Dataset + bq_client.delete_dataset(DATASET_NAME, delete_contents=True) + + def get_blob_from_path(path): bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] bucket = storage.Client().get_bucket(bucket_name) @@ -106,8 +122,14 @@ def get_blob_from_path(path): return 
bucket.blob(output_location) -def is_in_table(value, out): - return re.search(f"\\| *{value} *\\|", out) +def get_dataproc_job_output(result): + output_location = result.driver_output_resource_uri + ".000000000" + blob = get_blob_from_path(output_location) + return blob.download_as_string().decode("utf-8") + + +# def is_in_table(value, out): +# return re.search(f"\\| *{value} *\\|", out) def table_uploaded(table_name, out): @@ -128,49 +150,43 @@ def test_setup(): result = response.result() # Get job output - output_location = result.driver_output_resource_uri + ".000000000" - blob = get_blob_from_path(output_location) - out = blob.download_as_string().decode("utf-8") - - # Check if table upload success message was printed - for table_name in TABLE_NAMES: - assert table_uploaded(table_name, out) - - # tripDuration - assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) - assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) - assert is_in_table("(\\d+(?:\\.\\d+)?) h", out) - - # station latitude & longitude - assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) - - # birth_year - assert is_in_table("19[0-9][0-9]", out) - assert is_in_table("20[0-9][0-9]", out) - - # gender - assert is_in_table("M", out) - assert is_in_table("m", out) - assert is_in_table("male", out) - assert is_in_table("MALE", out) - assert is_in_table("F", out) - assert is_in_table("f", out) - assert is_in_table("female", out) - assert is_in_table("FEMALE", out) - assert is_in_table("U", out) - assert is_in_table("u", out) - assert is_in_table("unknown", out) - assert is_in_table("UNKNOWN", out) - - # customer_plan - assert is_in_table("Subscriber", out) - assert is_in_table("subscriber", out) - assert is_in_table("SUBSCRIBER", out) - assert is_in_table("sub", out) - assert is_in_table("Customer", out) - assert is_in_table("customer", out) - assert is_in_table("CUSTOMER", out) - assert is_in_table("cust", out) - - # Missing data - assert is_in_table("null", out) + out = get_dataproc_job_output(result) + + # # tripDuration + # assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) + # assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) + # assert is_in_table("(\\d+(?:\\.\\d+)?) 
h", out) + + # # station latitude & longitude + # assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + + # # birth_year + # assert is_in_table("19[0-9][0-9]", out) + # assert is_in_table("20[0-9][0-9]", out) + + # # gender + # assert is_in_table("M", out) + # assert is_in_table("m", out) + # assert is_in_table("male", out) + # assert is_in_table("MALE", out) + # assert is_in_table("F", out) + # assert is_in_table("f", out) + # assert is_in_table("female", out) + # assert is_in_table("FEMALE", out) + # assert is_in_table("U", out) + # assert is_in_table("u", out) + # assert is_in_table("unknown", out) + # assert is_in_table("UNKNOWN", out) + + # # customer_plan + # assert is_in_table("Subscriber", out) + # assert is_in_table("subscriber", out) + # assert is_in_table("SUBSCRIBER", out) + # assert is_in_table("sub", out) + # assert is_in_table("Customer", out) + # assert is_in_table("customer", out) + # assert is_in_table("CUSTOMER", out) + # assert is_in_table("cust", out) + + # # Missing data + # assert is_in_table("null", out) From 55354df6dd010e17198c9fe144b2c212f3b6c2e7 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 15 Jul 2020 18:40:18 -0400 Subject: [PATCH 20/59] query data in test --- data-science-onramp/data-ingestion/setup.sh | 2 +- .../data-ingestion/setup_test.py | 107 +++++++++++------- 2 files changed, 67 insertions(+), 42 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index 336f3da729d..a69cda6a134 100755 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} data_science_onramp_test_six --test + setup.py -- ${BUCKET_NAME} data_science_onramp diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 7b0f0bc6be5..ad9e756f8d1 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -21,6 +21,7 @@ CLUSTER_NAME = f'setup-test-{ID}' BUCKET_NAME = f'setup-test-{ID}' DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") +CITIBIKE_TABLE = "new_york_citibike_trips" DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ @@ -123,6 +124,7 @@ def get_blob_from_path(path): def get_dataproc_job_output(result): + """Get the dataproc job logs in plain text""" output_location = result.driver_output_resource_uri + ".000000000" blob = get_blob_from_path(output_location) return blob.download_as_string().decode("utf-8") @@ -132,12 +134,50 @@ def get_dataproc_job_output(result): # return re.search(f"\\| *{value} *\\|", out) -def table_uploaded(table_name, out): - return re.search(f"Table {table_name} successfully written to BigQuery", out) +def assert_table_success_message(table_name, out): + """Check table upload success message was printed in job logs.""" + assert re.search(f"Table {table_name} successfully written to BigQuery", out), \ + f"Table {table_name} sucess message not printed in job logs" + + + +def assert_regexes_in_table(regex_dict, query_result): + """Assert that at least one row satisfies each regex. + The arguments are + - regex_dict: a dictionary where the keys are column + names and values are lists of regexes; + - query_result: the bigquery query result of the whole table. 
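    A usage illustration (the column and patterns here are simplified stand-ins;
    test_setup below builds the real regex_dict and query from this file's constants):

        regex_dict = {"tripduration": [r"(\d+(?:\.\d+)?) s", r"(\d+(?:\.\d+)?) min"]}
        query_result = bigquery.Client().query(
            f"SELECT * FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`").result()
        assert_regexes_in_table(regex_dict, query_result)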
+ """ + + # Create dictionary with keys column names and values dictionaries + # The dictionaries stored have keys regexes and values booleans + # `regex_found_dict[column][regex]` hold the truth value of + # whether the there is at least one row of column with name `column` + # which satisfies the regular expression `regex`. + regex_found_dict = {} + for column, regexes in regex_dict.items(): + regex_found_dict[column] = {} + for regex in regexes: + regex_found_dict[column][regex] = False + + # Outer loop is over `query_result` since this is + # an iterator which can only iterate once + for row in query_result: + for column_name, regexes in regex_dict.items(): + for regex in regexes: + if row[column_name] and re.match(f"\\A{regex}\\Z", row[column_name]): + regex_found_dict[column_name][regex] = True + + # Assert that all entries in regex_found_dict are true + for column_name in regex_found_dict: + for regex, found in regex_found_dict[column_name].items(): + assert found, \ + f"No matches to regular expression \"{regex}\" found in column {column_name}" def test_setup(): - '''Tests setup.py by submitting it to a dataproc cluster''' + """Test setup.py by submitting it to a dataproc cluster + Check table upload success message as well as data in the table itself""" # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ @@ -151,42 +191,27 @@ def test_setup(): # Get job output out = get_dataproc_job_output(result) + + # Check logs to see if tables were uploaded + for table_name in TABLE_NAMES: + assert_table_success_message(table_name, out) + + # Query BigQuery Table + client = bigquery.Client() + query = f"SELECT * FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" + query_job = client.query(query) + + result = query_job.result() + + regex_dict = { + "tripduration": ["(\\d+(?:\\.\\d+)?) s", "(\\d+(?:\\.\\d+)?) min", "(\\d+(?:\\.\\d+)?) h"], + "gender": ['f', 'F', 'm', 'M', 'u', 'U', 'male', 'MALE', 'female', 'FEMALE', 'unknown', 'UNKNOWN'], + "start_station_latitude": ["[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\""], + "start_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "end_station_latitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "end_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "usertype": ["Subscriber", "subscriber", "SUBSCRIBER", "sub", "Customer", "customer", "CUSTOMER", "cust"], + } + + assert_regexes_in_table(regex_dict, result) - # # tripDuration - # assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) - # assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) - # assert is_in_table("(\\d+(?:\\.\\d+)?) 
h", out) - - # # station latitude & longitude - # assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) - - # # birth_year - # assert is_in_table("19[0-9][0-9]", out) - # assert is_in_table("20[0-9][0-9]", out) - - # # gender - # assert is_in_table("M", out) - # assert is_in_table("m", out) - # assert is_in_table("male", out) - # assert is_in_table("MALE", out) - # assert is_in_table("F", out) - # assert is_in_table("f", out) - # assert is_in_table("female", out) - # assert is_in_table("FEMALE", out) - # assert is_in_table("U", out) - # assert is_in_table("u", out) - # assert is_in_table("unknown", out) - # assert is_in_table("UNKNOWN", out) - - # # customer_plan - # assert is_in_table("Subscriber", out) - # assert is_in_table("subscriber", out) - # assert is_in_table("SUBSCRIBER", out) - # assert is_in_table("sub", out) - # assert is_in_table("Customer", out) - # assert is_in_table("customer", out) - # assert is_in_table("CUSTOMER", out) - # assert is_in_table("cust", out) - - # # Missing data - # assert is_in_table("null", out) From 5f80974d5407cf64021f1bae73b0b65c904566ea Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 17 Jul 2020 14:03:26 -0400 Subject: [PATCH 21/59] address live session comments --- .../data-ingestion/setup_test.py | 64 ++++++------------- 1 file changed, 18 insertions(+), 46 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index ad9e756f8d1..5ee77d5e1a3 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -8,9 +8,9 @@ import re import uuid +from google.cloud import bigquery from google.cloud import dataproc_v1 as dataproc from google.cloud import storage -from google.cloud import bigquery import pytest # Set global variables @@ -130,51 +130,12 @@ def get_dataproc_job_output(result): return blob.download_as_string().decode("utf-8") -# def is_in_table(value, out): -# return re.search(f"\\| *{value} *\\|", out) - - def assert_table_success_message(table_name, out): """Check table upload success message was printed in job logs.""" assert re.search(f"Table {table_name} successfully written to BigQuery", out), \ f"Table {table_name} sucess message not printed in job logs" - -def assert_regexes_in_table(regex_dict, query_result): - """Assert that at least one row satisfies each regex. - The arguments are - - regex_dict: a dictionary where the keys are column - names and values are lists of regexes; - - query_result: the bigquery query result of the whole table. - """ - - # Create dictionary with keys column names and values dictionaries - # The dictionaries stored have keys regexes and values booleans - # `regex_found_dict[column][regex]` hold the truth value of - # whether the there is at least one row of column with name `column` - # which satisfies the regular expression `regex`. 
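#   (Illustrative aside, not part of this diff: the \A...\Z wrapping used by this
#   helper, and by the per-column check that replaces it later in this patch, makes
#   re.match behave as a full-string match.)
#       import re
#       assert re.match(r"\A19[0-9][0-9]\Z", "1987")          # whole value matches
#       assert not re.match(r"\A19[0-9][0-9]\Z", "1987-05")   # trailing text rejected
#       # equivalent to re.fullmatch(r"19[0-9][0-9]", "1987")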
- regex_found_dict = {} - for column, regexes in regex_dict.items(): - regex_found_dict[column] = {} - for regex in regexes: - regex_found_dict[column][regex] = False - - # Outer loop is over `query_result` since this is - # an iterator which can only iterate once - for row in query_result: - for column_name, regexes in regex_dict.items(): - for regex in regexes: - if row[column_name] and re.match(f"\\A{regex}\\Z", row[column_name]): - regex_found_dict[column_name][regex] = True - - # Assert that all entries in regex_found_dict are true - for column_name in regex_found_dict: - for regex, found in regex_found_dict[column_name].items(): - assert found, \ - f"No matches to regular expression \"{regex}\" found in column {column_name}" - - def test_setup(): """Test setup.py by submitting it to a dataproc cluster Check table upload success message as well as data in the table itself""" @@ -191,17 +152,13 @@ def test_setup(): # Get job output out = get_dataproc_job_output(result) - + # Check logs to see if tables were uploaded for table_name in TABLE_NAMES: assert_table_success_message(table_name, out) # Query BigQuery Table client = bigquery.Client() - query = f"SELECT * FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" - query_job = client.query(query) - - result = query_job.result() regex_dict = { "tripduration": ["(\\d+(?:\\.\\d+)?) s", "(\\d+(?:\\.\\d+)?) min", "(\\d+(?:\\.\\d+)?) h"], @@ -213,5 +170,20 @@ def test_setup(): "usertype": ["Subscriber", "subscriber", "SUBSCRIBER", "sub", "Customer", "customer", "CUSTOMER", "cust"], } - assert_regexes_in_table(regex_dict, result) + for column_name, regexes in regex_dict.items(): + query = f"SELECT {column_name} FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" + query_job = client.query(query) + result = query_job.result() + + rows = [] + for row in result: + rows.append(row[column_name]) + + for regex in regexes: + found = False + for row in rows: + if row and re.match(f"\\A{regex}\\Z", row): + found = True + assert found, \ + f"No matches to regular expression \"{regex}\" found in column {column_name}" From e8837654c2e0bb29785c9aa6aaf03c23e8cc0745 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 20 Jul 2020 11:36:25 -0400 Subject: [PATCH 22/59] add break statement --- data-science-onramp/data-ingestion/setup_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 5ee77d5e1a3..978a5376480 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -185,5 +185,6 @@ def test_setup(): for row in rows: if row and re.match(f"\\A{regex}\\Z", row): found = True + break assert found, \ f"No matches to regular expression \"{regex}\" found in column {column_name}" From 2ec8b30d60250b7eab737e0a6087ff734c8a50fd Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 23 Jul 2020 16:48:55 -0400 Subject: [PATCH 23/59] revert breaking table and dataset name change --- data-science-onramp/data-ingestion/setup.py | 2 +- data-science-onramp/data-ingestion/setup.sh | 2 +- data-science-onramp/data-ingestion/setup_test.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index bdad93720d2..a1f13dfa5ef 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -22,7 +22,7 @@ BUCKET_NAME = sys.argv[1] DATASET_NAME = sys.argv[2] TABLE = 
"bigquery-public-data.new_york_citibike.citibike_trips" -CITIBIKE_TABLE_NAME = "new_york_citibike_trips" +CITIBIKE_TABLE_NAME = "RAW_DATA" EXTERNAL_TABLES = { "gas_prices": { "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index a69cda6a134..2c4773f7272 100755 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} data_science_onramp + setup.py -- ${BUCKET_NAME} new_york_citibike_trips diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 978a5376480..b1395af9793 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -21,11 +21,11 @@ CLUSTER_NAME = f'setup-test-{ID}' BUCKET_NAME = f'setup-test-{ID}' DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") -CITIBIKE_TABLE = "new_york_citibike_trips" +CITIBIKE_TABLE = "RAW_DATA" DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ - "new_york_citibike_trips", + CITIBIKE_TABLE, "gas_prices", ] JOB_DETAILS = { # Job configuration From b5ea09e00ca6cd6a9ef5150f4eea36285c8099be Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 4 Aug 2020 19:57:05 -0400 Subject: [PATCH 24/59] fix datetime formatting in setup job --- data-science-onramp/data-ingestion/setup.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index a1f13dfa5ef..8205f551c51 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -15,7 +15,7 @@ import pandas as pd from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import expr, UserDefinedFunction, when +from pyspark.sql.functions import expr, UserDefinedFunction, when, date_format from pyspark.sql.types import FloatType, StringType, StructField, StructType @@ -101,7 +101,7 @@ def create_bigquery_dataset(): dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" - dataset = client.create_dataset(dataset) + #dataset = client.create_dataset(dataset) def write_to_bigquery(df, table_name): @@ -140,7 +140,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - write_to_bigquery(df, table_name) + #write_to_bigquery(df, table_name) # Check if table exists try: @@ -180,6 +180,10 @@ def main(): for name, udf in udf_map.items(): df = df.withColumn(name, UserDefinedFunction(*udf)(name)) + # Format the datetimes correctly + for name in ['starttime', 'stoptime']: + df = df.withColumn(name, date_format(name, "yyyy-MM-dd'T'HH:mm:ss")) + # Randomly set about 5% of the values in some columns to null for name in null_columns: df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) From 213dfcac25a8f4bba6dcee24a00a73b1c9a4f853 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 6 Aug 2020 14:03:13 -0400 Subject: [PATCH 25/59] uncomment commented dataset creation and writing --- data-science-onramp/data-ingestion/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 8205f551c51..352d8b029b4 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -101,7 +101,7 @@ def create_bigquery_dataset(): dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" - #dataset = client.create_dataset(dataset) + dataset = client.create_dataset(dataset) def write_to_bigquery(df, table_name): @@ -140,7 +140,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - #write_to_bigquery(df, table_name) + write_to_bigquery(df, table_name) # Check if table exists try: From 589568a55f1cb8df1b5ad20fb26202c7399ca8c8 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Fri, 5 Jun 2020 15:35:13 -0400 Subject: [PATCH 26/59] add data ingestion code --- .../data-ingestion/requirements-test.txt | 1 + .../data-ingestion/requirements.txt | 6 + .../data-ingestion/setup-test.py | 210 ++++++++++++++++++ data-science-onramp/data-ingestion/setup.py | 149 +++++++++++++ data-science-onramp/data-ingestion/setup.sh | 6 + 5 files changed, 372 insertions(+) create mode 100644 data-science-onramp/data-ingestion/requirements-test.txt create mode 100644 data-science-onramp/data-ingestion/requirements.txt create mode 100644 data-science-onramp/data-ingestion/setup-test.py create mode 100644 data-science-onramp/data-ingestion/setup.py create mode 100644 data-science-onramp/data-ingestion/setup.sh diff --git a/data-science-onramp/data-ingestion/requirements-test.txt b/data-science-onramp/data-ingestion/requirements-test.txt new file mode 100644 index 00000000000..781d4326c94 --- /dev/null +++ b/data-science-onramp/data-ingestion/requirements-test.txt @@ -0,0 +1 @@ +pytest==5.3.2 diff --git a/data-science-onramp/data-ingestion/requirements.txt b/data-science-onramp/data-ingestion/requirements.txt new file mode 100644 index 00000000000..f435423c623 --- /dev/null +++ b/data-science-onramp/data-ingestion/requirements.txt @@ -0,0 +1,6 @@ +grpcio==1.29.0 +google-auth==1.16.0 +google-auth-httplib2==0.0.3 +google-cloud==0.34.0 +google-cloud-storage==1.28.1 +google-cloud-dataproc==0.8.0 \ No newline at end of file diff --git a/data-science-onramp/data-ingestion/setup-test.py b/data-science-onramp/data-ingestion/setup-test.py new file mode 100644 index 00000000000..d827c805818 --- /dev/null +++ b/data-science-onramp/data-ingestion/setup-test.py @@ -0,0 +1,210 @@ +import os +import re + +import uuid + +from google.api_core.exceptions import GoogleAPICallError + +from google.cloud import dataproc_v1 as dataproc +from google.cloud import storage +from google.cloud.exceptions import NotFound + +import pytest + +waiting_cluster_callback = False + +# Set global variables +project = os.environ['GCLOUD_PROJECT'] +region = "us-central1" +zone = "us-central1-a" +cluster_name = 'setup-test-{}'.format(str(uuid.uuid4())) +bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4())) + + +@pytest.fixture(autouse=True) +def teardown(): + yield + + # Delete cluster + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': f'{region}-dataproc.googleapis.com:443' + }) + + try: + operation = cluster_client.delete_cluster(project, region, + cluster_name) + operation.result() + except GoogleAPICallError: + pass + + # Delete GCS bucket + storage_client = storage.Client() + try: + bucket = storage_client.get_bucket(bucket_name) + bucket.delete(force=True) + except 
NotFound: + pass + + +def test_setup(capsys): + '''Tests setup.py by submitting it to a dataproc cluster''' + + # Create GCS Bucket + storage_client = storage.Client() + bucket = storage_client.create_bucket(bucket_name) + + # Upload file + destination_blob_name = "setup.py" + blob = bucket.blob(destination_blob_name) + blob.upload_from_filename("setup.py") + + job_file_name = "gs://" + bucket_name + "/setup.py" + + # Create cluster configuration + zone_uri = \ + 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( + project, zone) + cluster_data = { + 'project_id': project, + 'cluster_name': cluster_name, + 'config': { + 'gce_cluster_config': { + 'zone_uri': zone_uri, + "metadata": { + "PIP_PACKAGES": "google-cloud-storage" + }, + }, + 'master_config': { + 'num_instances': 1, + 'machine_type_uri': 'n1-standard-8' + }, + 'worker_config': { + 'num_instances': 6, + 'machine_type_uri': 'n1-standard-8' + }, + "initialization_actions": [ + { + "executable_file": ("gs://dataproc-initialization-actions/" + "python/pip-install.sh"), + } + ], + "software_config": { + "image_version": "1.5.4-debian10", + "optional_components": [ + "ANACONDA" + ], + } + } + } + + # Create cluster using cluster client + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + }) + + cluster = cluster_client.create_cluster(project, region, cluster_data) + cluster.add_done_callback(callback) + + # Wait for cluster to provision + global waiting_cluster_callback + waiting_cluster_callback = True + + wait_for_cluster_creation() + + # Create job configuration + job_details = { + 'placement': { + 'cluster_name': cluster_name + }, + 'pyspark_job': { + 'main_python_file_uri': job_file_name, + 'args': [ + bucket_name, + "--test", + ], + "jar_file_uris": [ + "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" + ], + }, + } + + # Submit job to dataproc cluster + job_client = dataproc.JobControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + }) + + result = job_client.submit_job(project_id=project, region=region, + job=job_details) + + job_id = result.reference.job_id + print('Submitted job \"{}\".'.format(job_id)) + + # Wait for job to complete + wait_for_job(job_client, job_id) + + # Get job output + cluster_info = cluster_client.get_cluster(project, region, cluster_name) + bucket = storage_client.get_bucket(cluster_info.config.config_bucket) + output_blob = ( + 'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000' + .format(cluster_info.cluster_uuid, job_id)) + out = bucket.blob(output_blob).download_as_string().decode("utf-8") + + # tripDuration + assert re.search("[0-9] s", out) + assert re.search("[0-9] m", out) + assert re.search("[0-9] h", out) + + # station latitude & longitude + assert re.search(u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + + # birth_year + assert re.search("19[0-9][0-9]\\|", out) + assert re.search("20[0-9][0-9]\\|", out) + + # gender + assert "M" in out + assert "male" in out + assert "MALE" in out + assert "F" in out + assert "female" in out + assert "FEMALE" in out + assert "u" in out + assert "unknown" in out + assert "UNKNOWN" in out + + # customer_plan + assert "Subscriber" in out + assert "subscriber" in out + assert "SUBSCRIBER" in out + assert "sub" in out + assert "Customer" in out + assert "customer" in out + assert "CUSTOMER" in out + assert "cust" in out + + # Missing data + assert "null" in out + + +def callback(operation_future): + '''Sets 
a flag to stop waiting''' + global waiting_cluster_callback + waiting_cluster_callback = False + + +def wait_for_cluster_creation(): + '''Waits for cluster to create''' + while True: + if not waiting_cluster_callback: + break + + +def wait_for_job(job_client, job_id): + '''Waits for job to finish''' + while True: + job = job_client.get_job(project, region, job_id) + assert job.status.State.Name(job.status.state) != "ERROR" + + if job.status.State.Name(job.status.state) == "DONE": + return diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py new file mode 100644 index 00000000000..91a740b34d0 --- /dev/null +++ b/data-science-onramp/data-ingestion/setup.py @@ -0,0 +1,149 @@ +from random import choice, choices, randint, seed +import sys + +from time import time_ns + +from google.cloud import bigquery + +from py4j.protocol import Py4JJavaError +from pyspark.sql import SparkSession + +from pyspark.sql.functions import UserDefinedFunction +from pyspark.sql.types import IntegerType, StringType + + +# Create a SparkSession under the name "setup". Viewable via the Spark UI +spark = SparkSession.builder.appName("setup").getOrCreate() + +bucket_name = sys.argv[1] +upload = True # Whether to upload data to BigQuery + +# Check whether or not results should be uploaded +try: + sys.argv[2] + upload = False +except IndexError: + print("Results will be uploaded to BigQuery") + +table = "bigquery-public-data.new_york_citibike.citibike_trips" + +# Check if table exists +try: + df = spark.read.format('bigquery').option('table', table).load() +except Py4JJavaError: + print(f"{table} does not exist. ") + sys.exit(0) + +# START MAKING DATA DIRTY + + +def random_select(items, cum_weights): + '''Picks an item according to the cumulative weights''' + return choices(items, cum_weights=cum_weights, k=1)[0] + + +def tripduration(duration): + '''Converts trip duration to other units''' + seconds = str(duration) + " s" + minutes = str(float(duration) / 60) + " min" + hours = str(float(duration) / 3600) + " h" + return random_select([seconds, minutes, hours, str(randint(-1000, -1))], + [0.3, 0.6, 0.9, 1]) + + +def station_name(name): + '''Replaces '&' with '/' with a 50% chance''' + return choice([name, name.replace("&", "/")]) + + +def usertype(user): + '''Manipulates the user type string''' + return choice([user, user.upper(), user.lower(), + "sub" if user == "Subscriber" else user, + "cust" if user == "Customer" else user]) + + +def gender(s): + '''Manipulates the gender string''' + return choice([s, s.upper(), s.lower(), + s[0] if len(s) > 0 else "", + s[0].lower() if len(s) > 0 else ""]) + + +def convertAngle(angle): + '''Converts long and lat to DMS notation''' + degrees = int(angle) + minutes = int((angle - degrees) * 60) + seconds = int((angle - degrees - minutes/60) * 3600) + new_angle = str(degrees) + u"\u00B0" + \ + str(minutes) + "'" + str(seconds) + '"' + return random_select([str(angle), new_angle], cum_weights=[0.55, 1]) + + +def dirty_data(proc_func, allow_none): + '''Master function returns a user defined function + that transforms the column data''' + def udf(col_value): + seed(hash(col_value) + time_ns()) + if col_value is None: + return col_value + elif allow_none: + return random_select([None, proc_func(col_value)], + cum_weights=[0.05, 1]) + else: + return proc_func(col_value) + return udf + + +def id(x): + return x + + +# Declare data transformations for each column in dataframe +udfs = [ + (dirty_data(tripduration, True), StringType()), # 
tripduration + (dirty_data(id, True), StringType()), # starttime + (dirty_data(id, True), StringType()), # stoptime + (id, IntegerType()), # start_station_id + (dirty_data(station_name, False), StringType()), # start_station_name + (dirty_data(convertAngle, True), StringType()), # start_station_latitude + (dirty_data(convertAngle, True), StringType()), # start_station_longitude + (id, IntegerType()), # end_station_id + (dirty_data(station_name, False), StringType()), # end_station_name + (dirty_data(convertAngle, True), StringType()), # end_station_latitude + (dirty_data(convertAngle, True), StringType()), # end_station_longitude + (id, IntegerType()), # bikeid + (dirty_data(usertype, False), StringType()), # usertype + (id, IntegerType()), # birth_year + (dirty_data(gender, False), StringType()), # gender + (id, StringType()), # customer_plan +] + +# Apply dirty transformations to df +names = df.schema.names +new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) + for udf, column, name in zip(udfs, df.columns, names)]) + +# Duplicate about 0.01% of the rows +dup_df = new_df.sample(False, 0.0001, seed=42) + +# Create final dirty dataframe +df = new_df.union(dup_df) +df.sample(False, 0.0001, seed=50).show(n=200) +print("Dataframe sample printed") + +# Write to BigQuery +if upload: + # Create BigQuery Dataset + client = bigquery.Client() + dataset_id = '{}.new_york_citibike_trips'.format(client.project) + dataset = bigquery.Dataset(dataset_id) + dataset.location = "US" + dataset = client.create_dataset(dataset) + + # Saving the data to BigQuery + spark.conf.set('temporaryGcsBucket', bucket_name) + + df.write.format('bigquery') \ + .option('table', dataset_id + ".RAW_DATA") \ + .save() diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh new file mode 100644 index 00000000000..12730a3a6fe --- /dev/null +++ b/data-science-onramp/data-ingestion/setup.sh @@ -0,0 +1,6 @@ +# Submit a PySpark job via the Cloud Dataproc Jobs API +gcloud dataproc jobs submit pyspark \ + --cluster ${CLUSTER_NAME} \ + --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ + --driver-log-levels root=FATAL \ + setup.py -- ${BUCKET_NAME} From 9148f5b1c1ee987833a425b2dfb683ec9fe8e95f Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 8 Jun 2020 10:24:56 -0400 Subject: [PATCH 27/59] begin addressing comments --- data-science-onramp/data-ingestion/setup.py | 57 +++---- data-science-onramp/data-ingestion/setup.sh | 3 + .../{setup-test.py => setup_test.py} | 145 +++++++----------- 3 files changed, 90 insertions(+), 115 deletions(-) rename data-science-onramp/data-ingestion/{setup-test.py => setup_test.py} (59%) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 91a740b34d0..dc869903c84 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,4 +1,4 @@ -from random import choice, choices, randint, seed +import random import sys from time import time_ns @@ -19,10 +19,10 @@ upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded -try: - sys.argv[2] +if len(sys.arv) > 1: upload = False -except IndexError: + print("Not uploading results to BigQuery") +else: print("Results will be uploaded to BigQuery") table = "bigquery-public-data.new_york_citibike.citibike_trips" @@ -37,59 +37,60 @@ # START MAKING DATA DIRTY -def random_select(items, cum_weights): +def random_select(items, weights): '''Picks an item 
according to the cumulative weights''' - return choices(items, cum_weights=cum_weights, k=1)[0] + return random.choices(items, weights=weights, k=1)[0] -def tripduration(duration): +def trip_duration(duration): '''Converts trip duration to other units''' seconds = str(duration) + " s" minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" - return random_select([seconds, minutes, hours, str(randint(-1000, -1))], - [0.3, 0.6, 0.9, 1]) + return random_select([seconds, minutes, hours, + str(random.randint(-1000, -1))], + [0.3, 0.3, 0.3, 0.1]) def station_name(name): '''Replaces '&' with '/' with a 50% chance''' - return choice([name, name.replace("&", "/")]) + return random.choice([name, name.replace("&", "/")]) -def usertype(user): +def user_type(user): '''Manipulates the user type string''' - return choice([user, user.upper(), user.lower(), - "sub" if user == "Subscriber" else user, - "cust" if user == "Customer" else user]) + return random.choice([user, user.upper(), user.lower(), + "sub" if user == "Subscriber" else user, + "cust" if user == "Customer" else user]) def gender(s): '''Manipulates the gender string''' - return choice([s, s.upper(), s.lower(), - s[0] if len(s) > 0 else "", - s[0].lower() if len(s) > 0 else ""]) + return random.choice([s, s.upper(), s.lower(), + s[0] if len(s) > 0 else "", + s[0].lower() if len(s) > 0 else ""]) -def convertAngle(angle): +def convert_angle(angle): '''Converts long and lat to DMS notation''' degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) new_angle = str(degrees) + u"\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' - return random_select([str(angle), new_angle], cum_weights=[0.55, 1]) + return random_select([str(angle), new_angle], [0.55, 0.45]) def dirty_data(proc_func, allow_none): '''Master function returns a user defined function that transforms the column data''' def udf(col_value): - seed(hash(col_value) + time_ns()) + random.seed(hash(col_value) + time_ns()) if col_value is None: return col_value elif allow_none: return random_select([None, proc_func(col_value)], - cum_weights=[0.05, 1]) + [0.05, 0.95]) else: return proc_func(col_value) return udf @@ -101,19 +102,19 @@ def id(x): # Declare data transformations for each column in dataframe udfs = [ - (dirty_data(tripduration, True), StringType()), # tripduration + (dirty_data(trip_duration, True), StringType()), # tripduration (dirty_data(id, True), StringType()), # starttime (dirty_data(id, True), StringType()), # stoptime (id, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convertAngle, True), StringType()), # start_station_latitude - (dirty_data(convertAngle, True), StringType()), # start_station_longitude + (dirty_data(convert_angle, True), StringType()), # start_station_latitude + (dirty_data(convert_angle, True), StringType()), # start_station_longitude (id, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convertAngle, True), StringType()), # end_station_latitude - (dirty_data(convertAngle, True), StringType()), # end_station_longitude + (dirty_data(convert_angle, True), StringType()), # end_station_latitude + (dirty_data(convert_angle, True), StringType()), # end_station_longitude (id, IntegerType()), # bikeid - (dirty_data(usertype, False), StringType()), # usertype + (dirty_data(user_type, False), StringType()), # usertype (id, IntegerType()), # 
birth_year (dirty_data(gender, False), StringType()), # gender (id, StringType()), # customer_plan @@ -136,7 +137,7 @@ def id(x): if upload: # Create BigQuery Dataset client = bigquery.Client() - dataset_id = '{}.new_york_citibike_trips'.format(client.project) + dataset_id = f'{client.project}.new_york_citibike_trips' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index 12730a3a6fe..f78c8cd120b 100644 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -1,4 +1,7 @@ # Submit a PySpark job via the Cloud Dataproc Jobs API +# Requires having CLUSTER_NAME and BUCKET_NAME set as +# environment variables + gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ diff --git a/data-science-onramp/data-ingestion/setup-test.py b/data-science-onramp/data-ingestion/setup_test.py similarity index 59% rename from data-science-onramp/data-ingestion/setup-test.py rename to data-science-onramp/data-ingestion/setup_test.py index d827c805818..54f3d20e902 100644 --- a/data-science-onramp/data-ingestion/setup-test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -11,62 +11,25 @@ import pytest -waiting_cluster_callback = False # Set global variables -project = os.environ['GCLOUD_PROJECT'] -region = "us-central1" -zone = "us-central1-a" -cluster_name = 'setup-test-{}'.format(str(uuid.uuid4())) -bucket_name = 'setup-test-code-{}'.format(str(uuid.uuid4())) +PROJECT = os.environ['GCLOUD_PROJECT'] +REGION = "us-central1" +ZONE = "us-central1-a" +CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' +BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' +BUCKET = None -@pytest.fixture(autouse=True) -def teardown(): - yield - - # Delete cluster - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{region}-dataproc.googleapis.com:443' - }) - - try: - operation = cluster_client.delete_cluster(project, region, - cluster_name) - operation.result() - except GoogleAPICallError: - pass - - # Delete GCS bucket - storage_client = storage.Client() - try: - bucket = storage_client.get_bucket(bucket_name) - bucket.delete(force=True) - except NotFound: - pass - - -def test_setup(capsys): - '''Tests setup.py by submitting it to a dataproc cluster''' - - # Create GCS Bucket - storage_client = storage.Client() - bucket = storage_client.create_bucket(bucket_name) - - # Upload file - destination_blob_name = "setup.py" - blob = bucket.blob(destination_blob_name) - blob.upload_from_filename("setup.py") - - job_file_name = "gs://" + bucket_name + "/setup.py" +@pytest.fixture(autouse=True) +def setup_and_teardown_cluster(): # Create cluster configuration zone_uri = \ - 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( - project, zone) + f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}' cluster_data = { - 'project_id': project, - 'cluster_name': cluster_name, + 'project_id': PROJECT, + 'cluster_name': CLUSTER_NAME, 'config': { 'gce_cluster_config': { 'zone_uri': zone_uri, @@ -99,27 +62,59 @@ def test_setup(capsys): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - cluster = 
cluster_client.create_cluster(project, region, cluster_data) - cluster.add_done_callback(callback) + operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) # Wait for cluster to provision - global waiting_cluster_callback - waiting_cluster_callback = True + operation.result() - wait_for_cluster_creation() + yield + + # Delete cluster + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' + }) + + operation = cluster_client.delete_cluster(PROJECT, REGION, + CLUSTER_NAME) + operation.result() + + +@pytest.fixture(autouse=True) +def setup_and_teardown_bucket(): + global BUCKET + # Create GCS Bucket + storage_client = storage.Client() + BUCKET = storage_client.create_bucket(BUCKET_NAME) + + yield + + # Delete GCS bucket + storage_client = storage.Client() + bucket = storage_client.get_bucket(BUCKET_NAME) + bucket.delete(force=True) + +def test_setup(capsys): + '''Tests setup.py by submitting it to a dataproc cluster''' + + # Upload file + destination_blob_name = "setup.py" + blob = BUCKET.blob(destination_blob_name) + blob.upload_from_filename("setup.py") + + job_file_name = "gs://" + BUCKET_NAME + "/setup.py" # Create job configuration job_details = { 'placement': { - 'cluster_name': cluster_name + 'cluster_name': CLUSTER_NAME }, 'pyspark_job': { 'main_python_file_uri': job_file_name, 'args': [ - bucket_name, + BUCKET_NAME, "--test", ], "jar_file_uris": [ @@ -130,25 +125,21 @@ def test_setup(capsys): # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - result = job_client.submit_job(project_id=project, region=region, + response = job_client.submit_job(project_id=PROJECT, region=REGION, job=job_details) - job_id = result.reference.job_id + job_id = response.reference.job_id print('Submitted job \"{}\".'.format(job_id)) # Wait for job to complete - wait_for_job(job_client, job_id) + result = response.add_done_callback(callback) # Get job output - cluster_info = cluster_client.get_cluster(project, region, cluster_name) - bucket = storage_client.get_bucket(cluster_info.config.config_bucket) - output_blob = ( - 'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000' - .format(cluster_info.cluster_uuid, job_id)) - out = bucket.blob(output_blob).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri() + ".000000000" + output = BUCKET.blob(output_location).download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -186,25 +177,5 @@ def test_setup(capsys): # Missing data assert "null" in out - def callback(operation_future): - '''Sets a flag to stop waiting''' - global waiting_cluster_callback - waiting_cluster_callback = False - - -def wait_for_cluster_creation(): - '''Waits for cluster to create''' - while True: - if not waiting_cluster_callback: - break - - -def wait_for_job(job_client, job_id): - '''Waits for job to finish''' - while True: - job = job_client.get_job(project, region, job_id) - assert job.status.State.Name(job.status.state) != "ERROR" - - if job.status.State.Name(job.status.state) == "DONE": - return + return operation_future.result() From 1abf664ed47ec3656a293c46a9ff7385b748bbab Mon Sep 17 00:00:00 2001 From: vuppalli Date: Mon, 8 Jun 2020 11:26:33 -0400 Subject: [PATCH 28/59] change submit job --- data-science-onramp/data-ingestion/noxfile.py | 
225 ++++++++++++++++++ .../data-ingestion/setup_test.py | 9 +- 2 files changed, 228 insertions(+), 6 deletions(-) create mode 100644 data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/noxfile.py b/data-science-onramp/data-ingestion/noxfile.py new file mode 100644 index 00000000000..b23055f14a6 --- /dev/null +++ b/data-science-onramp/data-ingestion/noxfile.py @@ -0,0 +1,225 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +from pathlib import Path +import sys + +import nox + + +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING +# DO NOT EDIT THIS FILE EVER! +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING + +# Copy `noxfile_config.py` to your directory and modify it instead. + + +# `TEST_CONFIG` dict is a configuration hook that allows users to +# modify the test configurations. The values here should be in sync +# with `noxfile_config.py`. Users will copy `noxfile_config.py` into +# their directory and modify it. + +TEST_CONFIG = { + # You can opt out from the test for specific Python versions. + 'ignored_versions': ["2.7"], + + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + 'gcloud_project_env': 'GCLOUD_PROJECT', + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + 'envs': {}, +} + + +try: + # Ensure we can import noxfile_config in the project's directory. + sys.path.append('.') + from noxfile_config import TEST_CONFIG_OVERRIDE +except ImportError as e: + print("No user noxfile_config found: detail: {}".format(e)) + TEST_CONFIG_OVERRIDE = {} + +# Update the TEST_CONFIG with the user supplied values. +TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) + + +def get_pytest_env_vars(): + """Returns a dict for pytest invocation.""" + ret = {} + + # Override the GCLOUD_PROJECT and the alias. + env_key = TEST_CONFIG['gcloud_project_env'] + # This should error out if not set. + ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] + ret['GCLOUD_PROJECT'] = os.environ[env_key] + + # Apply user supplied envs. + ret.update(TEST_CONFIG['envs']) + return ret + + +# DO NOT EDIT - automatically generated. +# All versions used to tested samples. +ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] + +# Any default versions that should be ignored. 
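# A quick illustration, assuming the default TEST_CONFIG shown above: the two
# assignments below reduce to
#   IGNORED_VERSIONS = ["2.7"]
#   TESTED_VERSIONS = ["3.6", "3.7", "3.8"]
# i.e. every entry of ALL_VERSIONS except the versions listed under
# 'ignored_versions'.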
+IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] + +TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) + +INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) +# +# Style Checks +# + + +def _determine_local_import_names(start_dir): + """Determines all import names that should be considered "local". + + This is used when running the linter to insure that import order is + properly checked. + """ + file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] + return [ + basename + for basename, extension in file_ext_pairs + if extension == ".py" + or os.path.isdir(os.path.join(start_dir, basename)) + and basename not in ("__pycache__") + ] + + +# Linting with flake8. +# +# We ignore the following rules: +# E203: whitespace before ‘:’ +# E266: too many leading ‘#’ for block comment +# E501: line too long +# I202: Additional newline in a section of imports +# +# We also need to specify the rules which are ignored by default: +# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] +FLAKE8_COMMON_ARGS = [ + "--show-source", + "--builtin=gettext", + "--max-complexity=20", + "--import-order-style=google", + "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", + "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", + "--max-line-length=88", +] + + +@nox.session +def lint(session): + session.install("flake8", "flake8-import-order") + + local_names = _determine_local_import_names(".") + args = FLAKE8_COMMON_ARGS + [ + "--application-import-names", + ",".join(local_names), + "." + ] + session.run("flake8", *args) + + +# +# Sample Tests +# + + +PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] + + +def _session_tests(session, post_install=None): + """Runs py.test for a particular project.""" + if os.path.exists("requirements.txt"): + session.install("-r", "requirements.txt") + + if os.path.exists("requirements-test.txt"): + session.install("-r", "requirements-test.txt") + + if INSTALL_LIBRARY_FROM_SOURCE: + session.install("-e", _get_repo_root()) + + if post_install: + post_install(session) + + session.run( + "pytest", + *(PYTEST_COMMON_ARGS + session.posargs), + # Pytest will return 5 when no tests are collected. This can happen + # on travis where slow and flaky tests are excluded. + # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html + success_codes=[0, 5], + env=get_pytest_env_vars() + ) + + +@nox.session(python=ALL_VERSIONS) +def py(session): + """Runs py.test for a sample using the specified version of Python.""" + if session.python in TESTED_VERSIONS: + _session_tests(session) + else: + session.skip("SKIPPED: {} tests are disabled for this sample.".format( + session.python + )) + + +# +# Readmegen +# + + +def _get_repo_root(): + """ Returns the root folder of the project. """ + # Get root of this repository. Assume we don't have directories nested deeper than 10 items. 
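# (The loop below walks up at most 10 parent directories, returning the first
# one that contains a .git folder; if none is found, the function raises the
# "Unable to detect repository root." exception at the bottom.)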
+ p = Path(os.getcwd()) + for i in range(10): + if p is None: + break + if Path(p / ".git").exists(): + return str(p) + p = p.parent + raise Exception("Unable to detect repository root.") + + +GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) + + +@nox.session +@nox.parametrize("path", GENERATED_READMES) +def readmegen(session, path): + """(Re-)generates the readme for a sample.""" + session.install("jinja2", "pyyaml") + dir_ = os.path.dirname(path) + + if os.path.exists(os.path.join(dir_, "requirements.txt")): + session.install("-r", os.path.join(dir_, "requirements.txt")) + + in_file = os.path.join(dir_, "README.rst.in") + session.run( + "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file + ) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 54f3d20e902..919dcc4f35c 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -128,17 +128,14 @@ def test_setup(capsys): 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) }) - response = job_client.submit_job(project_id=PROJECT, region=REGION, + response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, job=job_details) - job_id = response.reference.job_id - print('Submitted job \"{}\".'.format(job_id)) - # Wait for job to complete - result = response.add_done_callback(callback) + result = response.result() # Get job output - output_location = result.driver_output_resource_uri() + ".000000000" + output_location = result.driver_output_resource_uri + ".000000000" output = BUCKET.blob(output_location).download_as_string().decode("utf-8") # tripDuration From c6007249ebff5d5f827fa8482c8c488c9eb46a06 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 8 Jun 2020 14:43:48 -0400 Subject: [PATCH 29/59] address code structure and global variable issues --- data-science-onramp/data-ingestion/noxfile.py | 225 ------------------ data-science-onramp/data-ingestion/setup.py | 125 +++++----- .../data-ingestion/setup_test.py | 17 +- 3 files changed, 78 insertions(+), 289 deletions(-) delete mode 100644 data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/noxfile.py b/data-science-onramp/data-ingestion/noxfile.py deleted file mode 100644 index b23055f14a6..00000000000 --- a/data-science-onramp/data-ingestion/noxfile.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright 2019 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -from pathlib import Path -import sys - -import nox - - -# WARNING - WARNING - WARNING - WARNING - WARNING -# WARNING - WARNING - WARNING - WARNING - WARNING -# DO NOT EDIT THIS FILE EVER! -# WARNING - WARNING - WARNING - WARNING - WARNING -# WARNING - WARNING - WARNING - WARNING - WARNING - -# Copy `noxfile_config.py` to your directory and modify it instead. 
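# A minimal sketch of the noxfile_config.py override mentioned in the comment
# above; the keys mirror the default TEST_CONFIG in this noxfile, and the
# values here are illustrative, not settings taken from this change.
TEST_CONFIG_OVERRIDE = {
    # Opt out of testing on specific Python versions.
    'ignored_versions': ["2.7"],
    # Use a build-specific Cloud project, as the TEST_CONFIG comment suggests.
    'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',
    # Values injected into the test environment; never put secrets here.
    'envs': {},
}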
- - -# `TEST_CONFIG` dict is a configuration hook that allows users to -# modify the test configurations. The values here should be in sync -# with `noxfile_config.py`. Users will copy `noxfile_config.py` into -# their directory and modify it. - -TEST_CONFIG = { - # You can opt out from the test for specific Python versions. - 'ignored_versions': ["2.7"], - - # An envvar key for determining the project id to use. Change it - # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a - # build specific Cloud project. You can also use your own string - # to use your own Cloud project. - 'gcloud_project_env': 'GCLOUD_PROJECT', - # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', - - # A dictionary you want to inject into your test. Don't put any - # secrets here. These values will override predefined values. - 'envs': {}, -} - - -try: - # Ensure we can import noxfile_config in the project's directory. - sys.path.append('.') - from noxfile_config import TEST_CONFIG_OVERRIDE -except ImportError as e: - print("No user noxfile_config found: detail: {}".format(e)) - TEST_CONFIG_OVERRIDE = {} - -# Update the TEST_CONFIG with the user supplied values. -TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) - - -def get_pytest_env_vars(): - """Returns a dict for pytest invocation.""" - ret = {} - - # Override the GCLOUD_PROJECT and the alias. - env_key = TEST_CONFIG['gcloud_project_env'] - # This should error out if not set. - ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] - ret['GCLOUD_PROJECT'] = os.environ[env_key] - - # Apply user supplied envs. - ret.update(TEST_CONFIG['envs']) - return ret - - -# DO NOT EDIT - automatically generated. -# All versions used to tested samples. -ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] - -# Any default versions that should be ignored. -IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] - -TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) - -INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) -# -# Style Checks -# - - -def _determine_local_import_names(start_dir): - """Determines all import names that should be considered "local". - - This is used when running the linter to insure that import order is - properly checked. - """ - file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] - return [ - basename - for basename, extension in file_ext_pairs - if extension == ".py" - or os.path.isdir(os.path.join(start_dir, basename)) - and basename not in ("__pycache__") - ] - - -# Linting with flake8. -# -# We ignore the following rules: -# E203: whitespace before ‘:’ -# E266: too many leading ‘#’ for block comment -# E501: line too long -# I202: Additional newline in a section of imports -# -# We also need to specify the rules which are ignored by default: -# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] -FLAKE8_COMMON_ARGS = [ - "--show-source", - "--builtin=gettext", - "--max-complexity=20", - "--import-order-style=google", - "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", - "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", - "--max-line-length=88", -] - - -@nox.session -def lint(session): - session.install("flake8", "flake8-import-order") - - local_names = _determine_local_import_names(".") - args = FLAKE8_COMMON_ARGS + [ - "--application-import-names", - ",".join(local_names), - "." 
- ] - session.run("flake8", *args) - - -# -# Sample Tests -# - - -PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] - - -def _session_tests(session, post_install=None): - """Runs py.test for a particular project.""" - if os.path.exists("requirements.txt"): - session.install("-r", "requirements.txt") - - if os.path.exists("requirements-test.txt"): - session.install("-r", "requirements-test.txt") - - if INSTALL_LIBRARY_FROM_SOURCE: - session.install("-e", _get_repo_root()) - - if post_install: - post_install(session) - - session.run( - "pytest", - *(PYTEST_COMMON_ARGS + session.posargs), - # Pytest will return 5 when no tests are collected. This can happen - # on travis where slow and flaky tests are excluded. - # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html - success_codes=[0, 5], - env=get_pytest_env_vars() - ) - - -@nox.session(python=ALL_VERSIONS) -def py(session): - """Runs py.test for a sample using the specified version of Python.""" - if session.python in TESTED_VERSIONS: - _session_tests(session) - else: - session.skip("SKIPPED: {} tests are disabled for this sample.".format( - session.python - )) - - -# -# Readmegen -# - - -def _get_repo_root(): - """ Returns the root folder of the project. """ - # Get root of this repository. Assume we don't have directories nested deeper than 10 items. - p = Path(os.getcwd()) - for i in range(10): - if p is None: - break - if Path(p / ".git").exists(): - return str(p) - p = p.parent - raise Exception("Unable to detect repository root.") - - -GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) - - -@nox.session -@nox.parametrize("path", GENERATED_READMES) -def readmegen(session, path): - """(Re-)generates the readme for a sample.""" - session.install("jinja2", "pyyaml") - dir_ = os.path.dirname(path) - - if os.path.exists(os.path.join(dir_, "requirements.txt")): - session.install("-r", os.path.join(dir_, "requirements.txt")) - - in_file = os.path.join(dir_, "README.rst.in") - session.run( - "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file - ) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index dc869903c84..da162e1c91d 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -12,31 +12,11 @@ from pyspark.sql.types import IntegerType, StringType -# Create a SparkSession under the name "setup". Viewable via the Spark UI -spark = SparkSession.builder.appName("setup").getOrCreate() +BUCKET_NAME = sys.argv[1] +TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -bucket_name = sys.argv[1] -upload = True # Whether to upload data to BigQuery - -# Check whether or not results should be uploaded -if len(sys.arv) > 1: - upload = False - print("Not uploading results to BigQuery") -else: - print("Results will be uploaded to BigQuery") - -table = "bigquery-public-data.new_york_citibike.citibike_trips" - -# Check if table exists -try: - df = spark.read.format('bigquery').option('table', table).load() -except Py4JJavaError: - print(f"{table} does not exist. 
") - sys.exit(0) # START MAKING DATA DIRTY - - def random_select(items, weights): '''Picks an item according to the cumulative weights''' return random.choices(items, weights=weights, k=1)[0] @@ -81,6 +61,8 @@ def convert_angle(angle): return random_select([str(angle), new_angle], [0.55, 0.45]) +# This function is nested since a UserDefinedFunction is +# expected to take a single argument def dirty_data(proc_func, allow_none): '''Master function returns a user defined function that transforms the column data''' @@ -99,42 +81,9 @@ def udf(col_value): def id(x): return x +def write_to_bigquery(df): + '''Write a dataframe to BigQuery''' -# Declare data transformations for each column in dataframe -udfs = [ - (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(id, True), StringType()), # starttime - (dirty_data(id, True), StringType()), # stoptime - (id, IntegerType()), # start_station_id - (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convert_angle, True), StringType()), # start_station_latitude - (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (id, IntegerType()), # end_station_id - (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convert_angle, True), StringType()), # end_station_latitude - (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (id, IntegerType()), # bikeid - (dirty_data(user_type, False), StringType()), # usertype - (id, IntegerType()), # birth_year - (dirty_data(gender, False), StringType()), # gender - (id, StringType()), # customer_plan -] - -# Apply dirty transformations to df -names = df.schema.names -new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) - for udf, column, name in zip(udfs, df.columns, names)]) - -# Duplicate about 0.01% of the rows -dup_df = new_df.sample(False, 0.0001, seed=42) - -# Create final dirty dataframe -df = new_df.union(dup_df) -df.sample(False, 0.0001, seed=50).show(n=200) -print("Dataframe sample printed") - -# Write to BigQuery -if upload: # Create BigQuery Dataset client = bigquery.Client() dataset_id = f'{client.project}.new_york_citibike_trips' @@ -143,8 +92,68 @@ def id(x): dataset = client.create_dataset(dataset) # Saving the data to BigQuery - spark.conf.set('temporaryGcsBucket', bucket_name) + spark.conf.set('temporaryGcsBucket', BUCKET_NAME) df.write.format('bigquery') \ .option('table', dataset_id + ".RAW_DATA") \ .save() + +def main(): + # Create a SparkSession under the name "setup". Viewable via the Spark UI + spark = SparkSession.builder.appName("setup").getOrCreate() + + upload = True # Whether to upload data to BigQuery + + # Check whether or not results should be uploaded + if len(sys.argv) > 1: + upload = False + print("Not uploading results to BigQuery") + else: + print("Results will be uploaded to BigQuery") + + # Check if table exists + try: + df = spark.read.format('bigquery').option('table', TABLE).load() + except Py4JJavaError: + print(f"{TABLE} does not exist. 
") + sys.exit(0) + + # Declare data transformations for each column in dataframe + udfs = [ + (dirty_data(trip_duration, True), StringType()), # tripduration + (dirty_data(id, True), StringType()), # starttime + (dirty_data(id, True), StringType()), # stoptime + (id, IntegerType()), # start_station_id + (dirty_data(station_name, False), StringType()), # start_station_name + (dirty_data(convert_angle, True), StringType()), # start_station_latitude + (dirty_data(convert_angle, True), StringType()), # start_station_longitude + (id, IntegerType()), # end_station_id + (dirty_data(station_name, False), StringType()), # end_station_name + (dirty_data(convert_angle, True), StringType()), # end_station_latitude + (dirty_data(convert_angle, True), StringType()), # end_station_longitude + (id, IntegerType()), # bikeid + (dirty_data(user_type, False), StringType()), # usertype + (id, IntegerType()), # birth_year + (dirty_data(gender, False), StringType()), # gender + (id, StringType()), # customer_plan + ] + + # Apply dirty transformations to df + names = df.schema.names + new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) + for udf, column, name in zip(udfs, df.columns, names)]) + + # Duplicate about 0.01% of the rows + dup_df = new_df.sample(False, 0.0001, seed=42) + + # Create final dirty dataframe + df = new_df.union(dup_df) + df.sample(False, 0.0001, seed=50).show(n=200) + print("Dataframe sample printed") + + if upload: + write_to_bigquery(df) + + +if __name__ == '__main__': + main() diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 919dcc4f35c..e62d2cc1355 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -52,7 +52,7 @@ def setup_and_teardown_cluster(): } ], "software_config": { - "image_version": "1.5.4-debian10", + "image_version": "1.4-debian10", "optional_components": [ "ANACONDA" ], @@ -134,9 +134,17 @@ def test_setup(capsys): # Wait for job to complete result = response.result() + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + }) + + cluster_info = cluster_client.get_cluster(PROJECT, REGION, CLUSTER_NAME) + # Get job output - output_location = result.driver_output_resource_uri + ".000000000" - output = BUCKET.blob(output_location).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri + "000000000" # + "driveroutput.000000000" + storage_client = storage.Client() + bucket = storage_client.get_bucket(cluster_info.config.config_bucket) + output = bucket.blob(output_location).download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -173,6 +181,3 @@ def test_setup(capsys): # Missing data assert "null" in out - -def callback(operation_future): - return operation_future.result() From ce04a6f8a578d17d6d7c592ce916bd700684666c Mon Sep 17 00:00:00 2001 From: vuppalli Date: Mon, 8 Jun 2020 19:12:09 -0400 Subject: [PATCH 30/59] get dataproc job output and fix linting --- .gitignore | 1 + data-science-onramp/data-ingestion/setup.py | 14 ++++++---- .../data-ingestion/setup_test.py | 28 +++++++++---------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index c827e035649..369e7983b52 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ credentials.dat .DS_store env/ .idea +data-science-onramp/data-ingestion/noxfile.py diff --git 
a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index da162e1c91d..09046b75879 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -12,7 +12,7 @@ from pyspark.sql.types import IntegerType, StringType -BUCKET_NAME = sys.argv[1] +BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" @@ -81,7 +81,8 @@ def udf(col_value): def id(x): return x -def write_to_bigquery(df): + +def write_to_bigquery(spark, df): '''Write a dataframe to BigQuery''' # Create BigQuery Dataset @@ -98,6 +99,7 @@ def write_to_bigquery(df): .option('table', dataset_id + ".RAW_DATA") \ .save() + def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() @@ -143,16 +145,16 @@ def main(): new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) for udf, column, name in zip(udfs, df.columns, names)]) + new_df.sample(False, 0.0001, seed=50).show(n=100) + # Duplicate about 0.01% of the rows - dup_df = new_df.sample(False, 0.0001, seed=42) + dup_df = new_df.sample(True, 0.0001, seed=42) # Create final dirty dataframe df = new_df.union(dup_df) - df.sample(False, 0.0001, seed=50).show(n=200) - print("Dataframe sample printed") if upload: - write_to_bigquery(df) + write_to_bigquery(spark, df) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index e62d2cc1355..f55a155bc75 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -3,11 +3,8 @@ import uuid -from google.api_core.exceptions import GoogleAPICallError - from google.cloud import dataproc_v1 as dataproc from google.cloud import storage -from google.cloud.exceptions import NotFound import pytest @@ -52,7 +49,7 @@ def setup_and_teardown_cluster(): } ], "software_config": { - "image_version": "1.4-debian10", + "image_version": "1.5.4-debian10", "optional_components": [ "ANACONDA" ], @@ -96,6 +93,7 @@ def setup_and_teardown_bucket(): bucket = storage_client.get_bucket(BUCKET_NAME) bucket.delete(force=True) + def test_setup(capsys): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -129,22 +127,15 @@ def test_setup(capsys): }) response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, - job=job_details) + job=job_details) # Wait for job to complete result = response.result() - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) - }) - - cluster_info = cluster_client.get_cluster(PROJECT, REGION, CLUSTER_NAME) - # Get job output - output_location = result.driver_output_resource_uri + "000000000" # + "driveroutput.000000000" - storage_client = storage.Client() - bucket = storage_client.get_bucket(cluster_info.config.config_bucket) - output = bucket.blob(output_location).download_as_string().decode("utf-8") + output_location = result.driver_output_resource_uri + ".000000000" + blob = get_blob_from_path(output_location) + out = blob.download_as_string().decode("utf-8") # tripDuration assert re.search("[0-9] s", out) @@ -181,3 +172,10 @@ def test_setup(capsys): # Missing data assert "null" in out + + +def get_blob_from_path(path): + bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] + bucket = storage.Client().get_bucket(bucket_name) + output_location = 
re.search("google-cloud-dataproc.+", path).group(0) + return bucket.blob(output_location) From ef2d2b3514c5c83ccc403e93baad4b4951168ea3 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 9 Jun 2020 15:32:02 -0400 Subject: [PATCH 31/59] fix PR comments --- .gitignore | 30 ----- data-science-onramp/data-ingestion/setup.py | 28 ++-- .../data-ingestion/setup_test.py | 125 ++++++++---------- 3 files changed, 73 insertions(+), 110 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 369e7983b52..00000000000 --- a/.gitignore +++ /dev/null @@ -1,30 +0,0 @@ -.coveralls.yml -*.pyc -.coverage -.tox -.pytest_cache -.ipynb_checkpoints -.executed_notebooks -coverage.xml -python-docs-samples.json -service-account.json -client-secrets.json -__pycache__ -*db\.sqlite3 -managed_vms/django_tutorial/static/* -**/migrations/* -lib -testing/resources/test-env.sh -testing/resources/service-account.json -testing/resources/client-secrets.json -secrets.tar -.cache -junit.xml -credentials.dat -.nox -.vscode/ -*sponge_log.xml -.DS_store -env/ -.idea -data-science-onramp/data-ingestion/noxfile.py diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 09046b75879..cf61f81562a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -46,8 +46,8 @@ def user_type(user): def gender(s): '''Manipulates the gender string''' - return random.choice([s, s.upper(), s.lower(), - s[0] if len(s) > 0 else "", + return random.choice([s.upper(), s.lower(), + s[0].upper() if len(s) > 0 else "", s[0].lower() if len(s) > 0 else ""]) @@ -78,7 +78,9 @@ def udf(col_value): return udf -def id(x): +# This function is required because we need to apply a +# function for every column and some columns do not change +def identity(x): return x @@ -118,26 +120,26 @@ def main(): df = spark.read.format('bigquery').option('table', TABLE).load() except Py4JJavaError: print(f"{TABLE} does not exist. 
") - sys.exit(0) + return # Declare data transformations for each column in dataframe udfs = [ (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(id, True), StringType()), # starttime - (dirty_data(id, True), StringType()), # stoptime - (id, IntegerType()), # start_station_id + (dirty_data(identity, True), StringType()), # starttime + (dirty_data(identity, True), StringType()), # stoptime + (identity, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name (dirty_data(convert_angle, True), StringType()), # start_station_latitude (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (id, IntegerType()), # end_station_id + (identity, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name (dirty_data(convert_angle, True), StringType()), # end_station_latitude (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (id, IntegerType()), # bikeid + (identity, IntegerType()), # bikeid (dirty_data(user_type, False), StringType()), # usertype - (id, IntegerType()), # birth_year + (identity, IntegerType()), # birth_year (dirty_data(gender, False), StringType()), # gender - (id, StringType()), # customer_plan + (identity, StringType()), # customer_plan ] # Apply dirty transformations to df @@ -145,10 +147,10 @@ def main(): new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) for udf, column, name in zip(udfs, df.columns, names)]) - new_df.sample(False, 0.0001, seed=50).show(n=100) + new_df.sample(False, 0.0001).show(n=100) # Duplicate about 0.01% of the rows - dup_df = new_df.sample(True, 0.0001, seed=42) + dup_df = new_df.sample(True, 0.0001) # Create final dirty dataframe df = new_df.union(dup_df) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index f55a155bc75..aab08230028 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -12,24 +12,36 @@ # Set global variables PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" -ZONE = "us-central1-a" CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' - -BUCKET = None +DESTINATION_BLOB_NAME = "setup.py" +JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' +JOB_DETAILS = { # Job configuration + 'placement': { + 'cluster_name': CLUSTER_NAME + }, + 'pyspark_job': { + 'main_python_file_uri': JOB_FILE_NAME, + 'args': [ + BUCKET_NAME, + "--test", + ], + "jar_file_uris": [ + "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" + ], + }, +} @pytest.fixture(autouse=True) def setup_and_teardown_cluster(): # Create cluster configuration - zone_uri = \ - f'https://www.googleapis.com/compute/v1/projects/{PROJECT}/zones/{ZONE}' cluster_data = { 'project_id': PROJECT, 'cluster_name': CLUSTER_NAME, 'config': { 'gce_cluster_config': { - 'zone_uri': zone_uri, + 'zone_uri': '', "metadata": { "PIP_PACKAGES": "google-cloud-storage" }, @@ -59,9 +71,8 @@ def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) # Wait for cluster to provision @@ -70,10 +81,6 @@ def setup_and_teardown_cluster(): yield # Delete cluster - cluster_client = 
dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' - }) - operation = cluster_client.delete_cluster(PROJECT, REGION, CLUSTER_NAME) operation.result() @@ -81,53 +88,41 @@ def setup_and_teardown_cluster(): @pytest.fixture(autouse=True) def setup_and_teardown_bucket(): - global BUCKET # Create GCS Bucket storage_client = storage.Client() - BUCKET = storage_client.create_bucket(BUCKET_NAME) + bucket = storage_client.create_bucket(BUCKET_NAME) + + # Upload file + blob = bucket.blob(DESTINATION_BLOB_NAME) + blob.upload_from_filename("setup.py") yield # Delete GCS bucket - storage_client = storage.Client() bucket = storage_client.get_bucket(BUCKET_NAME) bucket.delete(force=True) -def test_setup(capsys): - '''Tests setup.py by submitting it to a dataproc cluster''' +def get_blob_from_path(path): + bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] + bucket = storage.Client().get_bucket(bucket_name) + output_location = re.search("google-cloud-dataproc.+", path).group(0) + return bucket.blob(output_location) - # Upload file - destination_blob_name = "setup.py" - blob = BUCKET.blob(destination_blob_name) - blob.upload_from_filename("setup.py") - job_file_name = "gs://" + BUCKET_NAME + "/setup.py" - - # Create job configuration - job_details = { - 'placement': { - 'cluster_name': CLUSTER_NAME - }, - 'pyspark_job': { - 'main_python_file_uri': job_file_name, - 'args': [ - BUCKET_NAME, - "--test", - ], - "jar_file_uris": [ - "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" - ], - }, - } +def is_in_table(value, out): + return re.search(f"\| *{value}\|", out) + + +def test_setup(): + '''Tests setup.py by submitting it to a dataproc cluster''' # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ - 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, - job=job_details) + job=JOB_DETAILS) # Wait for job to complete result = response.result() @@ -150,32 +145,28 @@ def test_setup(capsys): assert re.search("20[0-9][0-9]\\|", out) # gender - assert "M" in out - assert "male" in out - assert "MALE" in out - assert "F" in out - assert "female" in out - assert "FEMALE" in out - assert "u" in out - assert "unknown" in out - assert "UNKNOWN" in out + assert is_in_table("M", out) + assert is_in_table("m", out) + assert is_in_table("male", out) + assert is_in_table("MALE", out) + assert is_in_table("F", out) + assert is_in_table("f", out) + assert is_in_table("female", out) + assert is_in_table("FEMALE", out) + assert is_in_table("U", out) + assert is_in_table("u", out) + assert is_in_table("unknown", out) + assert is_in_table("UNKNOWN", out) # customer_plan - assert "Subscriber" in out - assert "subscriber" in out - assert "SUBSCRIBER" in out - assert "sub" in out - assert "Customer" in out - assert "customer" in out - assert "CUSTOMER" in out - assert "cust" in out + assert is_in_table("Subscriber", out) + assert is_in_table("subscriber", out) + assert is_in_table("SUBSCRIBER", out) + assert is_in_table("sub", out) + assert is_in_table("Customer", out) + assert is_in_table("customer", out) + assert is_in_table("CUSTOMER", out) + assert is_in_table("cust", out) # Missing data - assert "null" in out - - -def get_blob_from_path(path): - bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] - bucket = storage.Client().get_bucket(bucket_name) - 
output_location = re.search("google-cloud-dataproc.+", path).group(0) - return bucket.blob(output_location) + assert is_in_table("null", out) From 93394a3aa0e8bfa9c0264b094d4e64033decd3a6 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 9 Jun 2020 16:01:52 -0400 Subject: [PATCH 32/59] linting and global vars --- .../data-ingestion/setup_test.py | 68 +++++++++---------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index aab08230028..e9358de912c 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -31,49 +31,47 @@ ], }, } - - -@pytest.fixture(autouse=True) -def setup_and_teardown_cluster(): - # Create cluster configuration - cluster_data = { - 'project_id': PROJECT, - 'cluster_name': CLUSTER_NAME, - 'config': { - 'gce_cluster_config': { - 'zone_uri': '', - "metadata": { - "PIP_PACKAGES": "google-cloud-storage" - }, +CLUSTER_DATA = { # Create cluster configuration + 'project_id': PROJECT, + 'cluster_name': CLUSTER_NAME, + 'config': { + 'gce_cluster_config': { + 'zone_uri': '', + "metadata": { + "PIP_PACKAGES": "google-cloud-storage" }, - 'master_config': { - 'num_instances': 1, - 'machine_type_uri': 'n1-standard-8' - }, - 'worker_config': { - 'num_instances': 6, - 'machine_type_uri': 'n1-standard-8' - }, - "initialization_actions": [ - { - "executable_file": ("gs://dataproc-initialization-actions/" - "python/pip-install.sh"), - } - ], - "software_config": { - "image_version": "1.5.4-debian10", - "optional_components": [ - "ANACONDA" - ], + }, + 'master_config': { + 'num_instances': 1, + 'machine_type_uri': 'n1-standard-8' + }, + 'worker_config': { + 'num_instances': 6, + 'machine_type_uri': 'n1-standard-8' + }, + "initialization_actions": [ + { + "executable_file": ("gs://dataproc-initialization-actions/" + "python/pip-install.sh"), } + ], + "software_config": { + "image_version": "1.5.4-debian10", + "optional_components": [ + "ANACONDA" + ], } } +} + +@pytest.fixture(autouse=True) +def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient(client_options={ 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' }) - operation = cluster_client.create_cluster(PROJECT, REGION, cluster_data) + operation = cluster_client.create_cluster(PROJECT, REGION, CLUSTER_DATA) # Wait for cluster to provision operation.result() @@ -111,7 +109,7 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\| *{value}\|", out) + return re.search(f"\\| *{value}\\|", out) def test_setup(): From a6fc6e644316422da6d2c7e602676465920d46b6 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Wed, 10 Jun 2020 11:27:11 -0400 Subject: [PATCH 33/59] address Brad PR comments --- data-science-onramp/data-ingestion/setup.py | 34 +++++++------------ .../data-ingestion/setup_test.py | 3 -- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index cf61f81562a..7f5efa28e0a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,13 +1,10 @@ import random import sys - from time import time_ns from google.cloud import bigquery - from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession - from pyspark.sql.functions import UserDefinedFunction from pyspark.sql.types import IntegerType, StringType @@ 
-56,7 +53,7 @@ def convert_angle(angle): degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) - new_angle = str(degrees) + u"\u00B0" + \ + new_angle = str(degrees) + "\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' return random_select([str(angle), new_angle], [0.55, 0.45]) @@ -78,13 +75,7 @@ def udf(col_value): return udf -# This function is required because we need to apply a -# function for every column and some columns do not change -def identity(x): - return x - - -def write_to_bigquery(spark, df): +def write_to_bigquery(df): '''Write a dataframe to BigQuery''' # Create BigQuery Dataset @@ -95,10 +86,9 @@ def write_to_bigquery(spark, df): dataset = client.create_dataset(dataset) # Saving the data to BigQuery - spark.conf.set('temporaryGcsBucket', BUCKET_NAME) - df.write.format('bigquery') \ .option('table', dataset_id + ".RAW_DATA") \ + .option("temporaryGcsBucket", BUCKET_NAME) \ .save() @@ -109,7 +99,7 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if len(sys.argv) > 1: + if len(sys.argv) > 2: upload = False print("Not uploading results to BigQuery") else: @@ -125,21 +115,21 @@ def main(): # Declare data transformations for each column in dataframe udfs = [ (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(identity, True), StringType()), # starttime - (dirty_data(identity, True), StringType()), # stoptime - (identity, IntegerType()), # start_station_id + (dirty_data(lambda x: x, True), StringType()), # starttime + (dirty_data(lambda x: x, True), StringType()), # stoptime + (lambda x: x, IntegerType()), # start_station_id (dirty_data(station_name, False), StringType()), # start_station_name (dirty_data(convert_angle, True), StringType()), # start_station_latitude (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (identity, IntegerType()), # end_station_id + (lambda x: x, IntegerType()), # end_station_id (dirty_data(station_name, False), StringType()), # end_station_name (dirty_data(convert_angle, True), StringType()), # end_station_latitude (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (identity, IntegerType()), # bikeid + (lambda x: x, IntegerType()), # bikeid (dirty_data(user_type, False), StringType()), # usertype - (identity, IntegerType()), # birth_year + (lambda x: x, IntegerType()), # birth_year (dirty_data(gender, False), StringType()), # gender - (identity, StringType()), # customer_plan + (lambda x: x, StringType()), # customer_plan ] # Apply dirty transformations to df @@ -156,7 +146,7 @@ def main(): df = new_df.union(dup_df) if upload: - write_to_bigquery(spark, df) + write_to_bigquery(df) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index e9358de912c..2aa82535d79 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -1,14 +1,11 @@ import os import re - import uuid from google.cloud import dataproc_v1 as dataproc from google.cloud import storage - import pytest - # Set global variables PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" From 1c9f52694d286b7646af538193efdcb51d9b24a2 Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Thu, 11 Jun 2020 11:45:10 -0400 Subject: [PATCH 34/59] broken clean.py --- data-science-onramp/data-processing/clean.py | 44 ++++++++++++++++++++ 1 file changed, 
44 insertions(+) create mode 100644 data-science-onramp/data-processing/clean.py diff --git a/data-science-onramp/data-processing/clean.py b/data-science-onramp/data-processing/clean.py new file mode 100644 index 00000000000..0bca32d3299 --- /dev/null +++ b/data-science-onramp/data-processing/clean.py @@ -0,0 +1,44 @@ +import os +import sys + +from py4j.protocol import Py4JJavaError +from pyspark.sql import SparkSession +from pyspark.sql.functions import UserDefinedFunction, lit +from pyspark.sql.types import IntegerType, StringType + + +PROJECT_ID = sys.argv[1] +BUCKET_NAME = sys.argv[2] +TABLE = f'{PROJECT_ID}.new_york_citibike_trips.RAW_DATA' + +def station_name(name): + if name: + return name.replace('/', '&') + else: + return '' + +def main(): + '''...''' + # Create a SparkSession under the name 'clean'. Viewable via the Spark UI + spark = SparkSession.builder.appName('clean').getOrCreate() + + # Check if table exists + + try: + df = spark.read.format('bigquery').option('table', TABLE).load() + except Py4JJavaError: + print(f"{TABLE} does not exist. ") + return + + udf_map = { + 'start_station_name': (station_name, StringType()) + } + + for name, (func, col_type) in udf_map.items(): + df = df.withColumn(name, UserDefinedFunction(func, col_type)(name).alias(name)) + + df = spark.createDataframe + df.show(n=100) + +if __name__ == '__main__': + main() \ No newline at end of file From 327cf5b678a4bba2358281bcdb8be2b26aaf5ca1 Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Thu, 11 Jun 2020 11:49:49 -0400 Subject: [PATCH 35/59] Revert "broken clean.py" This reverts commit 580c8e1078e9480ef30d3083522fd2c467c4f1b1. --- data-science-onramp/data-processing/clean.py | 44 -------------------- 1 file changed, 44 deletions(-) delete mode 100644 data-science-onramp/data-processing/clean.py diff --git a/data-science-onramp/data-processing/clean.py b/data-science-onramp/data-processing/clean.py deleted file mode 100644 index 0bca32d3299..00000000000 --- a/data-science-onramp/data-processing/clean.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import sys - -from py4j.protocol import Py4JJavaError -from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction, lit -from pyspark.sql.types import IntegerType, StringType - - -PROJECT_ID = sys.argv[1] -BUCKET_NAME = sys.argv[2] -TABLE = f'{PROJECT_ID}.new_york_citibike_trips.RAW_DATA' - -def station_name(name): - if name: - return name.replace('/', '&') - else: - return '' - -def main(): - '''...''' - # Create a SparkSession under the name 'clean'. Viewable via the Spark UI - spark = SparkSession.builder.appName('clean').getOrCreate() - - # Check if table exists - - try: - df = spark.read.format('bigquery').option('table', TABLE).load() - except Py4JJavaError: - print(f"{TABLE} does not exist. 
") - return - - udf_map = { - 'start_station_name': (station_name, StringType()) - } - - for name, (func, col_type) in udf_map.items(): - df = df.withColumn(name, UserDefinedFunction(func, col_type)(name).alias(name)) - - df = spark.createDataframe - df.show(n=100) - -if __name__ == '__main__': - main() \ No newline at end of file From 4bf07ee52c06da34a387f65b8e63d83911364a8d Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 16 Jun 2020 11:29:46 -0400 Subject: [PATCH 36/59] optimize data ingestion --- data-science-onramp/data-ingestion/setup.py | 109 +++++++++--------- .../data-ingestion/setup_test.py | 14 +-- 2 files changed, 61 insertions(+), 62 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 7f5efa28e0a..33c7c728733 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,41 +1,43 @@ import random import sys -from time import time_ns from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction +from pyspark.sql.functions import UserDefinedFunction, when, expr from pyspark.sql.types import IntegerType, StringType BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" +RAW_DATASET_NAME = "new_york_citibike_trips5" +RAW_TABLE_NAME = "RAW_DATA" # START MAKING DATA DIRTY -def random_select(items, weights): - '''Picks an item according to the cumulative weights''' - return random.choices(items, weights=weights, k=1)[0] - - def trip_duration(duration): '''Converts trip duration to other units''' + if duration is None: + return None seconds = str(duration) + " s" minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" - return random_select([seconds, minutes, hours, + return random.choices([seconds, minutes, hours, str(random.randint(-1000, -1))], - [0.3, 0.3, 0.3, 0.1]) + weights=[0.3, 0.3, 0.3, 0.1])[0] def station_name(name): '''Replaces '&' with '/' with a 50% chance''' + if name is None: + return None return random.choice([name, name.replace("&", "/")]) def user_type(user): '''Manipulates the user type string''' + if user is None: + return None return random.choice([user, user.upper(), user.lower(), "sub" if user == "Subscriber" else user, "cust" if user == "Customer" else user]) @@ -43,6 +45,8 @@ def user_type(user): def gender(s): '''Manipulates the gender string''' + if s is None: + return None return random.choice([s.upper(), s.lower(), s[0].upper() if len(s) > 0 else "", s[0].lower() if len(s) > 0 else ""]) @@ -50,29 +54,15 @@ def gender(s): def convert_angle(angle): '''Converts long and lat to DMS notation''' + if angle is None: + return None degrees = int(angle) minutes = int((angle - degrees) * 60) seconds = int((angle - degrees - minutes/60) * 3600) new_angle = str(degrees) + "\u00B0" + \ str(minutes) + "'" + str(seconds) + '"' - return random_select([str(angle), new_angle], [0.55, 0.45]) - - -# This function is nested since a UserDefinedFunction is -# expected to take a single argument -def dirty_data(proc_func, allow_none): - '''Master function returns a user defined function - that transforms the column data''' - def udf(col_value): - random.seed(hash(col_value) + time_ns()) - if col_value is None: - return col_value - elif allow_none: - return random_select([None, proc_func(col_value)], - [0.05, 0.95]) - else: - return proc_func(col_value) - return udf + return 
random.choices([str(angle), new_angle], + weights=[0.55, 0.45])[0] def write_to_bigquery(df): @@ -80,17 +70,19 @@ def write_to_bigquery(df): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f'{client.project}.new_york_citibike_trips' + dataset_id = f'{client.project}.{RAW_DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', dataset_id + ".RAW_DATA") \ + .option('table', dataset_id + f".{RAW_TABLE_NAME}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() + print("Table successfully written to BigQuery") + def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI @@ -112,42 +104,49 @@ def main(): print(f"{TABLE} does not exist. ") return - # Declare data transformations for each column in dataframe - udfs = [ - (dirty_data(trip_duration, True), StringType()), # tripduration - (dirty_data(lambda x: x, True), StringType()), # starttime - (dirty_data(lambda x: x, True), StringType()), # stoptime - (lambda x: x, IntegerType()), # start_station_id - (dirty_data(station_name, False), StringType()), # start_station_name - (dirty_data(convert_angle, True), StringType()), # start_station_latitude - (dirty_data(convert_angle, True), StringType()), # start_station_longitude - (lambda x: x, IntegerType()), # end_station_id - (dirty_data(station_name, False), StringType()), # end_station_name - (dirty_data(convert_angle, True), StringType()), # end_station_latitude - (dirty_data(convert_angle, True), StringType()), # end_station_longitude - (lambda x: x, IntegerType()), # bikeid - (dirty_data(user_type, False), StringType()), # usertype - (lambda x: x, IntegerType()), # birth_year - (dirty_data(gender, False), StringType()), # gender - (lambda x: x, StringType()), # customer_plan + # Declare dictionary with keys column names and values user defined + # functions and return types + udf_map = { + 'tripduration': (trip_duration, StringType()), + 'start_station_name': (station_name, StringType()), + 'start_station_latitude': (convert_angle, StringType()), + 'start_station_longitude': (convert_angle, StringType()), + 'end_station_name': (station_name, StringType()), + 'end_station_latitude': (convert_angle, StringType()), + 'end_station_longitude': (convert_angle, StringType()), + 'usertype': (user_type, StringType()), + 'gender': (gender, StringType()), + } + + # Declare which columns to set some values to null randomly + null_columns = [ + 'tripduration', + 'starttime', + 'stoptime', + 'start_station_latitude', + 'start_station_longitude', + 'end_station_latitude', + 'end_station_longitude', ] - # Apply dirty transformations to df - names = df.schema.names - new_df = df.select(*[UserDefinedFunction(*udf)(column).alias(name) - for udf, column, name in zip(udfs, df.columns, names)]) + # Dirty the columns + for name, udf in udf_map.items(): + df = df.withColumn(name, UserDefinedFunction(*udf)(name)) - new_df.sample(False, 0.0001).show(n=100) + # Randomly set about 5% of the values in some columns to null + for name in null_columns: + df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows - dup_df = new_df.sample(True, 0.0001) + dup_df = df.sample(True, 0.0001) # Create final dirty dataframe - df = new_df.union(dup_df) + df = df.union(dup_df) if upload: write_to_bigquery(df) - + else: + df.sample(True, 0.0001).show(n=500, truncate=False) if __name__ == 
'__main__': main() diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 2aa82535d79..8fb1938c843 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -106,7 +106,7 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\\| *{value}\\|", out) + return re.search(f"\\|{value} *\\|", out) def test_setup(): @@ -128,16 +128,16 @@ def test_setup(): out = blob.download_as_string().decode("utf-8") # tripDuration - assert re.search("[0-9] s", out) - assert re.search("[0-9] m", out) - assert re.search("[0-9] h", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) + assert is_in_table("(\\d+(?:\\.\\d+)?) h", out) # station latitude & longitude - assert re.search(u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) # birth_year - assert re.search("19[0-9][0-9]\\|", out) - assert re.search("20[0-9][0-9]\\|", out) + assert is_in_table("19[0-9][0-9]", out) + assert is_in_table("20[0-9][0-9]", out) # gender assert is_in_table("M", out) From 8dbd3bc4a359cf4959c668aa802fa5c2e5dd2195 Mon Sep 17 00:00:00 2001 From: vuppalli Date: Tue, 16 Jun 2020 11:54:59 -0400 Subject: [PATCH 37/59] fix linting errors --- data-science-onramp/data-ingestion/setup.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 33c7c728733..7308d13a37e 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -4,8 +4,8 @@ from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction, when, expr -from pyspark.sql.types import IntegerType, StringType +from pyspark.sql.functions import expr, UserDefinedFunction, when +from pyspark.sql.types import StringType BUCKET_NAME = sys.argv[1] @@ -23,8 +23,8 @@ def trip_duration(duration): minutes = str(float(duration) / 60) + " min" hours = str(float(duration) / 3600) + " h" return random.choices([seconds, minutes, hours, - str(random.randint(-1000, -1))], - weights=[0.3, 0.3, 0.3, 0.1])[0] + str(random.randint(-1000, -1))], + weights=[0.3, 0.3, 0.3, 0.1])[0] def station_name(name): @@ -117,7 +117,7 @@ def main(): 'usertype': (user_type, StringType()), 'gender': (gender, StringType()), } - + # Declare which columns to set some values to null randomly null_columns = [ 'tripduration', @@ -148,5 +148,6 @@ def main(): else: df.sample(True, 0.0001).show(n=500, truncate=False) + if __name__ == '__main__': main() From 4cdd733184fc26fd380d978d21425f046385efc2 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 16 Jun 2020 18:21:20 -0400 Subject: [PATCH 38/59] fix minor style issues --- data-science-onramp/data-ingestion/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 7308d13a37e..b142aa8f37a 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -10,7 +10,7 @@ BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -RAW_DATASET_NAME = "new_york_citibike_trips5" +RAW_DATASET_NAME = "new_york_citibike_trips" RAW_TABLE_NAME = "RAW_DATA" @@ -77,7 +77,7 @@ def 
write_to_bigquery(df): # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', dataset_id + f".{RAW_TABLE_NAME}") \ + .option('table', f"{dataset_id}.{RAW_TABLE_NAME}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() From 0769754bfeb70dc114bc4c176bfc8919517d9ed0 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 19 Jun 2020 17:47:23 -0400 Subject: [PATCH 39/59] remove pip from cluster config --- data-science-onramp/data-ingestion/setup_test.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 8fb1938c843..d8def350c8e 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -34,9 +34,6 @@ 'config': { 'gce_cluster_config': { 'zone_uri': '', - "metadata": { - "PIP_PACKAGES": "google-cloud-storage" - }, }, 'master_config': { 'num_instances': 1, @@ -46,12 +43,6 @@ 'num_instances': 6, 'machine_type_uri': 'n1-standard-8' }, - "initialization_actions": [ - { - "executable_file": ("gs://dataproc-initialization-actions/" - "python/pip-install.sh"), - } - ], "software_config": { "image_version": "1.5.4-debian10", "optional_components": [ From 52da79a9d687b4e39a575bb70552c966930e50ea Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 26 Jun 2020 19:24:21 -0400 Subject: [PATCH 40/59] load external datasets from url --- data-science-onramp/data-ingestion/setup.py | 74 ++++++++++++++++--- .../data-ingestion/setup_test.py | 13 +++- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index b142aa8f37a..06b8ce00689 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,17 +1,37 @@ import random import sys +import pandas as pd from google.cloud import bigquery from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession from pyspark.sql.functions import expr, UserDefinedFunction, when -from pyspark.sql.types import StringType +from pyspark.sql.types import FloatType, StringType, StructField, StructType BUCKET_NAME = sys.argv[1] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -RAW_DATASET_NAME = "new_york_citibike_trips" -RAW_TABLE_NAME = "RAW_DATA" +DATASET_NAME = "data_science_onramp" +RAW_TABLE_NAME = "new_york_citibike_trips" +EXTERNAL_DATASETS = { + "gas_prices": { + "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", + "schema": StructType([ + StructField("Date", StringType(), True), + StructField("New_York_State_Average_USD_per_Gal", + FloatType(), True), + StructField("Albany_Average_USD_per_Gal", FloatType(), True), + StructField("Blinghamton_Average_USD_per_Gal", FloatType(), True), + StructField("Buffalo_Average_USD_per_Gal", FloatType(), True), + StructField("Nassau_Average_USD_per_Gal", FloatType(), True), + StructField("New_York_City_Average_USD_per_Gal", + FloatType(), True), + StructField("Rochester_Average_USD_per_Gal", FloatType(), True), + StructField("Syracuse_Average_USD_per_Gal", FloatType(), True), + StructField("Utica_Average_USD_per_Gal", FloatType(), True), + ]), + }, +} # START MAKING DATA DIRTY @@ -65,23 +85,39 @@ def convert_angle(angle): weights=[0.55, 0.45])[0] -def write_to_bigquery(df): - '''Write a dataframe to BigQuery''' - +def create_bigquery_dataset(): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f'{client.project}.{RAW_DATASET_NAME}' + 
dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) + +def write_to_bigquery(df, table_name): + '''Write a dataframe to BigQuery''' + client = bigquery.Client() + dataset_id = f'{client.project}.{DATASET_NAME}' + # Saving the data to BigQuery df.write.format('bigquery') \ - .option('table', f"{dataset_id}.{RAW_TABLE_NAME}") \ + .option('table', f"{dataset_id}.{table_name}") \ .option("temporaryGcsBucket", BUCKET_NAME) \ .save() - print("Table successfully written to BigQuery") + print(f"Table {table_name} successfully written to BigQuery") + + +def print_df(df, table_name): + '''Print 20 rows from dataframe and a random sample''' + # first 100 rows for smaller tables + df.show() + + # random sample for larger tables + # for small tables this will be empty + df.sample(True, 0.0001).show(n=500, truncate=False) + + print(f"Table {table_name} printed") def main(): @@ -91,12 +127,25 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if len(sys.argv) > 2: + if '--test' in sys.argv: upload = False print("Not uploading results to BigQuery") else: + create_bigquery_dataset() print("Results will be uploaded to BigQuery") + # Ingest External Datasets + + for table_name, data in EXTERNAL_DATASETS.items(): + print(f'Creating dataframe for {table_name}') + df = spark.createDataFrame(pd.read_csv(data["url"]), + schema=data["schema"]) + + if upload: + write_to_bigquery(df, table_name) + else: + print_df(df, table_name) + # Check if table exists try: df = spark.read.format('bigquery').option('table', TABLE).load() @@ -135,6 +184,7 @@ def main(): # Randomly set about 5% of the values in some columns to null for name in null_columns: + df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows @@ -144,9 +194,9 @@ def main(): df = df.union(dup_df) if upload: - write_to_bigquery(df) + write_to_bigquery(df, RAW_TABLE_NAME) else: - df.sample(True, 0.0001).show(n=500, truncate=False) + print_df(df, RAW_TABLE_NAME) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index d8def350c8e..baec10a79a5 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -13,6 +13,10 @@ BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' +TABLE_NAMES = [ + "new_york_citibike_trips", + "gas_prices", +] JOB_DETAILS = { # Job configuration 'placement': { 'cluster_name': CLUSTER_NAME @@ -97,9 +101,12 @@ def get_blob_from_path(path): def is_in_table(value, out): - return re.search(f"\\|{value} *\\|", out) + return re.search(f"\\| *{value} *\\|", out) +def table_printed(table_name, out): + return re.search(f"Table {table_name} printed", out) + def test_setup(): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -118,6 +125,10 @@ def test_setup(): blob = get_blob_from_path(output_location) out = blob.download_as_string().decode("utf-8") + # check that tables were printed + for table_name in TABLE_NAMES: + assert table_printed(table_name, out) + # tripDuration assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) assert is_in_table("(\\d+(?:\\.\\d+)?) 
min", out) From 2ac38ab67d4fecaad7b6a6db05a11c1db8258b21 Mon Sep 17 00:00:00 2001 From: Tushar Khan Date: Tue, 7 Jul 2020 12:54:45 -0400 Subject: [PATCH 41/59] added dry-run flag --- data-science-onramp/data-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 06b8ce00689..bfa22087c39 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -127,7 +127,7 @@ def main(): upload = True # Whether to upload data to BigQuery # Check whether or not results should be uploaded - if '--test' in sys.argv: + if '--dry-run' in sys.argv: upload = False print("Not uploading results to BigQuery") else: From 5ead6b239f955516f0b3e80242ef9278d1ff1779 Mon Sep 17 00:00:00 2001 From: Symmetries Date: Wed, 8 Jul 2020 12:29:46 -0400 Subject: [PATCH 42/59] dry-run flag --- data-science-onramp/data-ingestion/setup_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index baec10a79a5..a8fbe0d014d 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -25,7 +25,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, - "--test", + "--dry-run", ], "jar_file_uris": [ "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" From 3bb0f79c3727b2310208bed55ee9e8607f6e508b Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 9 Jul 2020 19:00:28 -0400 Subject: [PATCH 43/59] address some review comments --- data-science-onramp/data-ingestion/setup.py | 57 +++++++++---------- .../data-ingestion/setup_test.py | 17 ++++-- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index bfa22087c39..ecffd628b50 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -1,3 +1,12 @@ +"""Setup Dataproc job for Data Science Onramp Sample Application +This job ingests an external gas prices in NY dataset as well as +takes a New York Citibike dataset available on BigQuery and +"dirties" the dataset before uploading it back to BigQuery +It needs the following arguments +* the name of the Google Cloud Storage bucket to be used +* an optional --test flag to upload a subset of the dataset for testing +""" + import random import sys import pandas as pd @@ -37,11 +46,11 @@ # START MAKING DATA DIRTY def trip_duration(duration): '''Converts trip duration to other units''' - if duration is None: + if not duration: return None - seconds = str(duration) + " s" - minutes = str(float(duration) / 60) + " min" - hours = str(float(duration) / 3600) + " h" + seconds = f"{str(duration)} s" + minutes = f"{str(float(duration) / 60)} min" + hours = f"{str(float(duration) / 3600)} h" return random.choices([seconds, minutes, hours, str(random.randint(-1000, -1))], weights=[0.3, 0.3, 0.3, 0.1])[0] @@ -49,14 +58,14 @@ def trip_duration(duration): def station_name(name): '''Replaces '&' with '/' with a 50% chance''' - if name is None: + if not name: return None return random.choice([name, name.replace("&", "/")]) def user_type(user): '''Manipulates the user type string''' - if user is None: + if not user: return None return random.choice([user, user.upper(), user.lower(), "sub" if user == "Subscriber" else user, @@ -65,7 +74,7 @@ def 
user_type(user): def gender(s): '''Manipulates the gender string''' - if s is None: + if not s: return None return random.choice([s.upper(), s.lower(), s[0].upper() if len(s) > 0 else "", @@ -108,28 +117,16 @@ def write_to_bigquery(df, table_name): print(f"Table {table_name} successfully written to BigQuery") -def print_df(df, table_name): - '''Print 20 rows from dataframe and a random sample''' - # first 100 rows for smaller tables - df.show() - - # random sample for larger tables - # for small tables this will be empty - df.sample(True, 0.0001).show(n=500, truncate=False) - - print(f"Table {table_name} printed") - - def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() - upload = True # Whether to upload data to BigQuery + test = False # Whether we are running the job as a test - # Check whether or not results should be uploaded - if '--dry-run' in sys.argv: - upload = False - print("Not uploading results to BigQuery") + # Check whether or not the job is running as a test + if '--test' in sys.argv: + test = True + print("Subset of whole dataset will be uploaded to BigQuery") else: create_bigquery_dataset() print("Results will be uploaded to BigQuery") @@ -141,10 +138,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - if upload: - write_to_bigquery(df, table_name) - else: - print_df(df, table_name) + write_to_bigquery(df, table_name) # Check if table exists try: @@ -184,7 +178,6 @@ def main(): # Randomly set about 5% of the values in some columns to null for name in null_columns: - df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) # Duplicate about 0.01% of the rows @@ -193,10 +186,12 @@ def main(): # Create final dirty dataframe df = df.union(dup_df) - if upload: + if not test: write_to_bigquery(df, RAW_TABLE_NAME) else: - print_df(df, RAW_TABLE_NAME) + # df.sample(True, 0.0001).show(n=500, truncate=False) + # Upload 0.001% of the table (about 600 rows) + write_to_bigquery(df.sample(False, 0.00001)) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index a8fbe0d014d..a0ae6fb2814 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -1,3 +1,9 @@ +"""Test file for the setup job in the Data Science Onramp sample application +Creates a test Dataproc cluster and runs the job with a --test flag. +The job uploads a subset of the data to BigQuery. +Then, data is pulled from BigQuery and checks are made to see if the data is dirty. 
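A minimal sketch of the kind of dirtiness check this refers to (illustrative only; the project, dataset, table and column names here are assumptions, not necessarily the exact ones used in this test):

    import re
    from google.cloud import bigquery

    client = bigquery.Client()
    # Pull one column back out of BigQuery and assert that at least one row
    # looks "dirtied", e.g. a trip duration rewritten as "<n> min".
    rows = client.query(
        "SELECT tripduration FROM `my-project.my_dataset.RAW_DATA`"  # hypothetical table
    ).result()
    assert any(
        value and re.match(r"\d+(\.\d+)? min", value)
        for value in (row["tripduration"] for row in rows)
    )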
+""" + import os import re import uuid @@ -25,7 +31,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, - "--dry-run", + "--test", ], "jar_file_uris": [ "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" @@ -104,8 +110,9 @@ def is_in_table(value, out): return re.search(f"\\| *{value} *\\|", out) -def table_printed(table_name, out): - return re.search(f"Table {table_name} printed", out) +def table_uploaded(table_name, out): + return re.search(f"Table {table_name} successfully written to BigQuery", out) + def test_setup(): '''Tests setup.py by submitting it to a dataproc cluster''' @@ -125,9 +132,9 @@ def test_setup(): blob = get_blob_from_path(output_location) out = blob.download_as_string().decode("utf-8") - # check that tables were printed + # Check if table upload success message was printed for table_name in TABLE_NAMES: - assert table_printed(table_name, out) + assert table_uploaded(table_name, out) # tripDuration assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) From c753ed723a5b7a7a53b12d81498493373db16d3e Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 14 Jul 2020 17:22:26 -0400 Subject: [PATCH 44/59] optimize setup test --- .../data-ingestion/requirements.txt | 3 +- data-science-onramp/data-ingestion/setup.py | 34 ++--- data-science-onramp/data-ingestion/setup.sh | 2 +- .../data-ingestion/setup_test.py | 116 ++++++++++-------- 4 files changed, 86 insertions(+), 69 deletions(-) mode change 100644 => 100755 data-science-onramp/data-ingestion/setup.sh diff --git a/data-science-onramp/data-ingestion/requirements.txt b/data-science-onramp/data-ingestion/requirements.txt index f435423c623..e0328e4aec9 100644 --- a/data-science-onramp/data-ingestion/requirements.txt +++ b/data-science-onramp/data-ingestion/requirements.txt @@ -3,4 +3,5 @@ google-auth==1.16.0 google-auth-httplib2==0.0.3 google-cloud==0.34.0 google-cloud-storage==1.28.1 -google-cloud-dataproc==0.8.0 \ No newline at end of file +google-cloud-dataproc==0.8.0 +google-cloud-bigquery==1.25.0 diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index ecffd628b50..bdad93720d2 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -4,14 +4,15 @@ "dirties" the dataset before uploading it back to BigQuery It needs the following arguments * the name of the Google Cloud Storage bucket to be used +* the name of the BigQuery dataset to be created * an optional --test flag to upload a subset of the dataset for testing """ import random import sys -import pandas as pd from google.cloud import bigquery +import pandas as pd from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession from pyspark.sql.functions import expr, UserDefinedFunction, when @@ -19,10 +20,10 @@ BUCKET_NAME = sys.argv[1] +DATASET_NAME = sys.argv[2] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" -DATASET_NAME = "data_science_onramp" -RAW_TABLE_NAME = "new_york_citibike_trips" -EXTERNAL_DATASETS = { +CITIBIKE_TABLE_NAME = "new_york_citibike_trips" +EXTERNAL_TABLES = { "gas_prices": { "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", "schema": StructType([ @@ -111,7 +112,6 @@ def write_to_bigquery(df, table_name): # Saving the data to BigQuery df.write.format('bigquery') \ .option('table', f"{dataset_id}.{table_name}") \ - .option("temporaryGcsBucket", BUCKET_NAME) \ .save() print(f"Table {table_name} successfully written to BigQuery") @@ -121,20 +121,22 @@ def main(): # Create a SparkSession under the 
name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() - test = False # Whether we are running the job as a test + spark.conf.set('temporaryGcsBucket', BUCKET_NAME) + + create_bigquery_dataset() + + # Whether we are running the job as a test + test = False # Check whether or not the job is running as a test if '--test' in sys.argv: test = True - print("Subset of whole dataset will be uploaded to BigQuery") + print("A subset of the whole dataset will be uploaded to BigQuery") else: - create_bigquery_dataset() print("Results will be uploaded to BigQuery") # Ingest External Datasets - - for table_name, data in EXTERNAL_DATASETS.items(): - print(f'Creating dataframe for {table_name}') + for table_name, data in EXTERNAL_TABLES.items(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) @@ -143,6 +145,8 @@ def main(): # Check if table exists try: df = spark.read.format('bigquery').option('table', TABLE).load() + if test: + df = df.sample(False, 0.00001) except Py4JJavaError: print(f"{TABLE} does not exist. ") return @@ -186,12 +190,8 @@ def main(): # Create final dirty dataframe df = df.union(dup_df) - if not test: - write_to_bigquery(df, RAW_TABLE_NAME) - else: - # df.sample(True, 0.0001).show(n=500, truncate=False) - # Upload 0.001% of the table (about 600 rows) - write_to_bigquery(df.sample(False, 0.00001)) + print('Uploading citibike dataset...') + write_to_bigquery(df, CITIBIKE_TABLE_NAME) if __name__ == '__main__': diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh old mode 100644 new mode 100755 index f78c8cd120b..336f3da729d --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} + setup.py -- ${BUCKET_NAME} data_science_onramp_test_six --test diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index a0ae6fb2814..7b0f0bc6be5 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -10,13 +10,17 @@ from google.cloud import dataproc_v1 as dataproc from google.cloud import storage +from google.cloud import bigquery import pytest # Set global variables +ID = uuid.uuid4() + PROJECT = os.environ['GCLOUD_PROJECT'] REGION = "us-central1" -CLUSTER_NAME = f'setup-test-{uuid.uuid4()}' -BUCKET_NAME = f'setup-test-code-{uuid.uuid4()}' +CLUSTER_NAME = f'setup-test-{ID}' +BUCKET_NAME = f'setup-test-{ID}' +DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ @@ -31,6 +35,7 @@ 'main_python_file_uri': JOB_FILE_NAME, 'args': [ BUCKET_NAME, + DATASET_NAME, "--test", ], "jar_file_uris": [ @@ -99,6 +104,17 @@ def setup_and_teardown_bucket(): bucket.delete(force=True) +@pytest.fixture(autouse=True) +def setup_and_teardown_bq_dataset(): + # Dataset is created by the client + bq_client = bigquery.Client(project=PROJECT) + + yield + + # Delete Dataset + bq_client.delete_dataset(DATASET_NAME, delete_contents=True) + + def get_blob_from_path(path): bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] bucket = storage.Client().get_bucket(bucket_name) @@ -106,8 +122,14 @@ def get_blob_from_path(path): return 
bucket.blob(output_location) -def is_in_table(value, out): - return re.search(f"\\| *{value} *\\|", out) +def get_dataproc_job_output(result): + output_location = result.driver_output_resource_uri + ".000000000" + blob = get_blob_from_path(output_location) + return blob.download_as_string().decode("utf-8") + + +# def is_in_table(value, out): +# return re.search(f"\\| *{value} *\\|", out) def table_uploaded(table_name, out): @@ -128,49 +150,43 @@ def test_setup(): result = response.result() # Get job output - output_location = result.driver_output_resource_uri + ".000000000" - blob = get_blob_from_path(output_location) - out = blob.download_as_string().decode("utf-8") - - # Check if table upload success message was printed - for table_name in TABLE_NAMES: - assert table_uploaded(table_name, out) - - # tripDuration - assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) - assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) - assert is_in_table("(\\d+(?:\\.\\d+)?) h", out) - - # station latitude & longitude - assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) - - # birth_year - assert is_in_table("19[0-9][0-9]", out) - assert is_in_table("20[0-9][0-9]", out) - - # gender - assert is_in_table("M", out) - assert is_in_table("m", out) - assert is_in_table("male", out) - assert is_in_table("MALE", out) - assert is_in_table("F", out) - assert is_in_table("f", out) - assert is_in_table("female", out) - assert is_in_table("FEMALE", out) - assert is_in_table("U", out) - assert is_in_table("u", out) - assert is_in_table("unknown", out) - assert is_in_table("UNKNOWN", out) - - # customer_plan - assert is_in_table("Subscriber", out) - assert is_in_table("subscriber", out) - assert is_in_table("SUBSCRIBER", out) - assert is_in_table("sub", out) - assert is_in_table("Customer", out) - assert is_in_table("customer", out) - assert is_in_table("CUSTOMER", out) - assert is_in_table("cust", out) - - # Missing data - assert is_in_table("null", out) + out = get_dataproc_job_output(result) + + # # tripDuration + # assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) + # assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) + # assert is_in_table("(\\d+(?:\\.\\d+)?) 
h", out) + + # # station latitude & longitude + # assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) + + # # birth_year + # assert is_in_table("19[0-9][0-9]", out) + # assert is_in_table("20[0-9][0-9]", out) + + # # gender + # assert is_in_table("M", out) + # assert is_in_table("m", out) + # assert is_in_table("male", out) + # assert is_in_table("MALE", out) + # assert is_in_table("F", out) + # assert is_in_table("f", out) + # assert is_in_table("female", out) + # assert is_in_table("FEMALE", out) + # assert is_in_table("U", out) + # assert is_in_table("u", out) + # assert is_in_table("unknown", out) + # assert is_in_table("UNKNOWN", out) + + # # customer_plan + # assert is_in_table("Subscriber", out) + # assert is_in_table("subscriber", out) + # assert is_in_table("SUBSCRIBER", out) + # assert is_in_table("sub", out) + # assert is_in_table("Customer", out) + # assert is_in_table("customer", out) + # assert is_in_table("CUSTOMER", out) + # assert is_in_table("cust", out) + + # # Missing data + # assert is_in_table("null", out) From e0ffb41cc7494c22465b371b23085aa54b133147 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 15 Jul 2020 18:40:18 -0400 Subject: [PATCH 45/59] query data in test --- data-science-onramp/data-ingestion/setup.sh | 2 +- .../data-ingestion/setup_test.py | 107 +++++++++++------- 2 files changed, 67 insertions(+), 42 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index 336f3da729d..a69cda6a134 100755 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} data_science_onramp_test_six --test + setup.py -- ${BUCKET_NAME} data_science_onramp diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 7b0f0bc6be5..ad9e756f8d1 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -21,6 +21,7 @@ CLUSTER_NAME = f'setup-test-{ID}' BUCKET_NAME = f'setup-test-{ID}' DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") +CITIBIKE_TABLE = "new_york_citibike_trips" DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ @@ -123,6 +124,7 @@ def get_blob_from_path(path): def get_dataproc_job_output(result): + """Get the dataproc job logs in plain text""" output_location = result.driver_output_resource_uri + ".000000000" blob = get_blob_from_path(output_location) return blob.download_as_string().decode("utf-8") @@ -132,12 +134,50 @@ def get_dataproc_job_output(result): # return re.search(f"\\| *{value} *\\|", out) -def table_uploaded(table_name, out): - return re.search(f"Table {table_name} successfully written to BigQuery", out) +def assert_table_success_message(table_name, out): + """Check table upload success message was printed in job logs.""" + assert re.search(f"Table {table_name} successfully written to BigQuery", out), \ + f"Table {table_name} sucess message not printed in job logs" + + + +def assert_regexes_in_table(regex_dict, query_result): + """Assert that at least one row satisfies each regex. + The arguments are + - regex_dict: a dictionary where the keys are column + names and values are lists of regexes; + - query_result: the bigquery query result of the whole table. 
+ """ + + # Create dictionary with keys column names and values dictionaries + # The dictionaries stored have keys regexes and values booleans + # `regex_found_dict[column][regex]` hold the truth value of + # whether the there is at least one row of column with name `column` + # which satisfies the regular expression `regex`. + regex_found_dict = {} + for column, regexes in regex_dict.items(): + regex_found_dict[column] = {} + for regex in regexes: + regex_found_dict[column][regex] = False + + # Outer loop is over `query_result` since this is + # an iterator which can only iterate once + for row in query_result: + for column_name, regexes in regex_dict.items(): + for regex in regexes: + if row[column_name] and re.match(f"\\A{regex}\\Z", row[column_name]): + regex_found_dict[column_name][regex] = True + + # Assert that all entries in regex_found_dict are true + for column_name in regex_found_dict: + for regex, found in regex_found_dict[column_name].items(): + assert found, \ + f"No matches to regular expression \"{regex}\" found in column {column_name}" def test_setup(): - '''Tests setup.py by submitting it to a dataproc cluster''' + """Test setup.py by submitting it to a dataproc cluster + Check table upload success message as well as data in the table itself""" # Submit job to dataproc cluster job_client = dataproc.JobControllerClient(client_options={ @@ -151,42 +191,27 @@ def test_setup(): # Get job output out = get_dataproc_job_output(result) + + # Check logs to see if tables were uploaded + for table_name in TABLE_NAMES: + assert_table_success_message(table_name, out) + + # Query BigQuery Table + client = bigquery.Client() + query = f"SELECT * FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" + query_job = client.query(query) + + result = query_job.result() + + regex_dict = { + "tripduration": ["(\\d+(?:\\.\\d+)?) s", "(\\d+(?:\\.\\d+)?) min", "(\\d+(?:\\.\\d+)?) h"], + "gender": ['f', 'F', 'm', 'M', 'u', 'U', 'male', 'MALE', 'female', 'FEMALE', 'unknown', 'UNKNOWN'], + "start_station_latitude": ["[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\""], + "start_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "end_station_latitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "end_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], + "usertype": ["Subscriber", "subscriber", "SUBSCRIBER", "sub", "Customer", "customer", "CUSTOMER", "cust"], + } + + assert_regexes_in_table(regex_dict, result) - # # tripDuration - # assert is_in_table("(\\d+(?:\\.\\d+)?) s", out) - # assert is_in_table("(\\d+(?:\\.\\d+)?) min", out) - # assert is_in_table("(\\d+(?:\\.\\d+)?) 
h", out) - - # # station latitude & longitude - # assert is_in_table("[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\"", out) - - # # birth_year - # assert is_in_table("19[0-9][0-9]", out) - # assert is_in_table("20[0-9][0-9]", out) - - # # gender - # assert is_in_table("M", out) - # assert is_in_table("m", out) - # assert is_in_table("male", out) - # assert is_in_table("MALE", out) - # assert is_in_table("F", out) - # assert is_in_table("f", out) - # assert is_in_table("female", out) - # assert is_in_table("FEMALE", out) - # assert is_in_table("U", out) - # assert is_in_table("u", out) - # assert is_in_table("unknown", out) - # assert is_in_table("UNKNOWN", out) - - # # customer_plan - # assert is_in_table("Subscriber", out) - # assert is_in_table("subscriber", out) - # assert is_in_table("SUBSCRIBER", out) - # assert is_in_table("sub", out) - # assert is_in_table("Customer", out) - # assert is_in_table("customer", out) - # assert is_in_table("CUSTOMER", out) - # assert is_in_table("cust", out) - - # # Missing data - # assert is_in_table("null", out) From b0d334be5aabd792bc8f904fff05d6ef5b1ecfe8 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Fri, 17 Jul 2020 14:03:26 -0400 Subject: [PATCH 46/59] address live session comments --- .../data-ingestion/setup_test.py | 64 ++++++------------- 1 file changed, 18 insertions(+), 46 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index ad9e756f8d1..5ee77d5e1a3 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -8,9 +8,9 @@ import re import uuid +from google.cloud import bigquery from google.cloud import dataproc_v1 as dataproc from google.cloud import storage -from google.cloud import bigquery import pytest # Set global variables @@ -130,51 +130,12 @@ def get_dataproc_job_output(result): return blob.download_as_string().decode("utf-8") -# def is_in_table(value, out): -# return re.search(f"\\| *{value} *\\|", out) - - def assert_table_success_message(table_name, out): """Check table upload success message was printed in job logs.""" assert re.search(f"Table {table_name} successfully written to BigQuery", out), \ f"Table {table_name} sucess message not printed in job logs" - -def assert_regexes_in_table(regex_dict, query_result): - """Assert that at least one row satisfies each regex. - The arguments are - - regex_dict: a dictionary where the keys are column - names and values are lists of regexes; - - query_result: the bigquery query result of the whole table. - """ - - # Create dictionary with keys column names and values dictionaries - # The dictionaries stored have keys regexes and values booleans - # `regex_found_dict[column][regex]` hold the truth value of - # whether the there is at least one row of column with name `column` - # which satisfies the regular expression `regex`. 
- regex_found_dict = {} - for column, regexes in regex_dict.items(): - regex_found_dict[column] = {} - for regex in regexes: - regex_found_dict[column][regex] = False - - # Outer loop is over `query_result` since this is - # an iterator which can only iterate once - for row in query_result: - for column_name, regexes in regex_dict.items(): - for regex in regexes: - if row[column_name] and re.match(f"\\A{regex}\\Z", row[column_name]): - regex_found_dict[column_name][regex] = True - - # Assert that all entries in regex_found_dict are true - for column_name in regex_found_dict: - for regex, found in regex_found_dict[column_name].items(): - assert found, \ - f"No matches to regular expression \"{regex}\" found in column {column_name}" - - def test_setup(): """Test setup.py by submitting it to a dataproc cluster Check table upload success message as well as data in the table itself""" @@ -191,17 +152,13 @@ def test_setup(): # Get job output out = get_dataproc_job_output(result) - + # Check logs to see if tables were uploaded for table_name in TABLE_NAMES: assert_table_success_message(table_name, out) # Query BigQuery Table client = bigquery.Client() - query = f"SELECT * FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" - query_job = client.query(query) - - result = query_job.result() regex_dict = { "tripduration": ["(\\d+(?:\\.\\d+)?) s", "(\\d+(?:\\.\\d+)?) min", "(\\d+(?:\\.\\d+)?) h"], @@ -213,5 +170,20 @@ def test_setup(): "usertype": ["Subscriber", "subscriber", "SUBSCRIBER", "sub", "Customer", "customer", "CUSTOMER", "cust"], } - assert_regexes_in_table(regex_dict, result) + for column_name, regexes in regex_dict.items(): + query = f"SELECT {column_name} FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" + query_job = client.query(query) + result = query_job.result() + + rows = [] + for row in result: + rows.append(row[column_name]) + + for regex in regexes: + found = False + for row in rows: + if row and re.match(f"\\A{regex}\\Z", row): + found = True + assert found, \ + f"No matches to regular expression \"{regex}\" found in column {column_name}" From 33afd6c9677f2865e222b72e0d9ab14408b72b27 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Mon, 20 Jul 2020 11:36:25 -0400 Subject: [PATCH 47/59] add break statement --- data-science-onramp/data-ingestion/setup_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 5ee77d5e1a3..978a5376480 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -185,5 +185,6 @@ def test_setup(): for row in rows: if row and re.match(f"\\A{regex}\\Z", row): found = True + break assert found, \ f"No matches to regular expression \"{regex}\" found in column {column_name}" From 9acb94ef8c00a5a9fd7a61fb702422197761329c Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 23 Jul 2020 16:48:55 -0400 Subject: [PATCH 48/59] revert breaking table and dataset name change --- data-science-onramp/data-ingestion/setup.py | 2 +- data-science-onramp/data-ingestion/setup.sh | 2 +- data-science-onramp/data-ingestion/setup_test.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index bdad93720d2..a1f13dfa5ef 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -22,7 +22,7 @@ BUCKET_NAME = sys.argv[1] DATASET_NAME = sys.argv[2] TABLE = 
"bigquery-public-data.new_york_citibike.citibike_trips" -CITIBIKE_TABLE_NAME = "new_york_citibike_trips" +CITIBIKE_TABLE_NAME = "RAW_DATA" EXTERNAL_TABLES = { "gas_prices": { "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", diff --git a/data-science-onramp/data-ingestion/setup.sh b/data-science-onramp/data-ingestion/setup.sh index a69cda6a134..2c4773f7272 100755 --- a/data-science-onramp/data-ingestion/setup.sh +++ b/data-science-onramp/data-ingestion/setup.sh @@ -6,4 +6,4 @@ gcloud dataproc jobs submit pyspark \ --cluster ${CLUSTER_NAME} \ --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ --driver-log-levels root=FATAL \ - setup.py -- ${BUCKET_NAME} data_science_onramp + setup.py -- ${BUCKET_NAME} new_york_citibike_trips diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 978a5376480..b1395af9793 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -21,11 +21,11 @@ CLUSTER_NAME = f'setup-test-{ID}' BUCKET_NAME = f'setup-test-{ID}' DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") -CITIBIKE_TABLE = "new_york_citibike_trips" +CITIBIKE_TABLE = "RAW_DATA" DESTINATION_BLOB_NAME = "setup.py" JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' TABLE_NAMES = [ - "new_york_citibike_trips", + CITIBIKE_TABLE, "gas_prices", ] JOB_DETAILS = { # Job configuration From c97d45497dcd8ee32c96167c3ba0a6e401c9f841 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Tue, 4 Aug 2020 19:57:05 -0400 Subject: [PATCH 49/59] fix datetime formatting in setup job --- data-science-onramp/data-ingestion/setup.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index a1f13dfa5ef..8205f551c51 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -15,7 +15,7 @@ import pandas as pd from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import expr, UserDefinedFunction, when +from pyspark.sql.functions import expr, UserDefinedFunction, when, date_format from pyspark.sql.types import FloatType, StringType, StructField, StructType @@ -101,7 +101,7 @@ def create_bigquery_dataset(): dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" - dataset = client.create_dataset(dataset) + #dataset = client.create_dataset(dataset) def write_to_bigquery(df, table_name): @@ -140,7 +140,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - write_to_bigquery(df, table_name) + #write_to_bigquery(df, table_name) # Check if table exists try: @@ -180,6 +180,10 @@ def main(): for name, udf in udf_map.items(): df = df.withColumn(name, UserDefinedFunction(*udf)(name)) + # Format the datetimes correctly + for name in ['starttime', 'stoptime']: + df = df.withColumn(name, date_format(name, "yyyy-MM-dd'T'HH:mm:ss")) + # Randomly set about 5% of the values in some columns to null for name in null_columns: df = df.withColumn(name, when(expr("rand() < 0.05"), None).otherwise(df[name])) From 41406f9b44ad3e9d292d6f540ec754794d94c147 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 6 Aug 2020 14:03:13 -0400 Subject: [PATCH 50/59] uncomment commented dataset creation and writing --- data-science-onramp/data-ingestion/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 8205f551c51..352d8b029b4 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -101,7 +101,7 @@ def create_bigquery_dataset(): dataset_id = f'{client.project}.{DATASET_NAME}' dataset = bigquery.Dataset(dataset_id) dataset.location = "US" - #dataset = client.create_dataset(dataset) + dataset = client.create_dataset(dataset) def write_to_bigquery(df, table_name): @@ -140,7 +140,7 @@ def main(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - #write_to_bigquery(df, table_name) + write_to_bigquery(df, table_name) # Check if table exists try: From ca3c592b36192527757618d374f5037660d99a22 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 6 Aug 2020 23:34:51 -0400 Subject: [PATCH 51/59] fix import order --- data-science-onramp/data-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 352d8b029b4..135026e6e3b 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -15,7 +15,7 @@ import pandas as pd from py4j.protocol import Py4JJavaError from pyspark.sql import SparkSession -from pyspark.sql.functions import expr, UserDefinedFunction, when, date_format +from pyspark.sql.functions import date_format, expr, UserDefinedFunction, when from pyspark.sql.types import FloatType, StringType, StructField, StructType From cf3aae393fbebb27153de556c0e3509dcc8f010a Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 6 Aug 2020 23:36:17 -0400 Subject: [PATCH 52/59] use GOOGLE_CLOUD_PROJECT environment variable --- data-science-onramp/data-ingestion/setup_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index b1395af9793..298576f6039 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -16,7 +16,7 @@ # Set global variables ID = uuid.uuid4() -PROJECT = os.environ['GCLOUD_PROJECT'] +PROJECT = os.environ['GOOGLE_CLOUD_PROJECT'] REGION = "us-central1" CLUSTER_NAME = f'setup-test-{ID}' BUCKET_NAME = f'setup-test-{ID}' From dc11440a5f4efcf51831f66e3f41a6f1d84052be Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 12 Aug 2020 13:13:52 -0400 Subject: [PATCH 53/59] blacken and add f-strings to dms notation --- data-science-onramp/data-ingestion/setup.py | 136 +++++++++--------- .../data-ingestion/setup_test.py | 128 +++++++++-------- 2 files changed, 142 insertions(+), 122 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index 135026e6e3b..cc5245dfdcf 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -26,93 +26,101 @@ EXTERNAL_TABLES = { "gas_prices": { "url": "https://data.ny.gov/api/views/wuxr-ni2i/rows.csv", - "schema": StructType([ - StructField("Date", StringType(), True), - StructField("New_York_State_Average_USD_per_Gal", - FloatType(), True), - StructField("Albany_Average_USD_per_Gal", FloatType(), True), - StructField("Blinghamton_Average_USD_per_Gal", FloatType(), True), - StructField("Buffalo_Average_USD_per_Gal", FloatType(), True), - StructField("Nassau_Average_USD_per_Gal", FloatType(), True), - 
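# Illustrative sketch (not part of the original patches): how an EXTERNAL_TABLES
# entry like the one above is consumed — pandas fetches the CSV over HTTP and
# Spark applies the explicit StructType so column names and types do not depend
# on inference. The URL and the two columns here are placeholders.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, StringType, StructField, StructType

spark = SparkSession.builder.appName("external-table-demo").getOrCreate()

example_schema = StructType(
    [
        StructField("Date", StringType(), True),
        StructField("Average_USD_per_Gal", FloatType(), True),
    ]
)
pdf = pd.read_csv("https://example.com/prices.csv")  # placeholder URL
df = spark.createDataFrame(pdf, schema=example_schema)
df.printSchema()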
StructField("New_York_City_Average_USD_per_Gal", - FloatType(), True), - StructField("Rochester_Average_USD_per_Gal", FloatType(), True), - StructField("Syracuse_Average_USD_per_Gal", FloatType(), True), - StructField("Utica_Average_USD_per_Gal", FloatType(), True), - ]), + "schema": StructType( + [ + StructField("Date", StringType(), True), + StructField("New_York_State_Average_USD_per_Gal", FloatType(), True), + StructField("Albany_Average_USD_per_Gal", FloatType(), True), + StructField("Blinghamton_Average_USD_per_Gal", FloatType(), True), + StructField("Buffalo_Average_USD_per_Gal", FloatType(), True), + StructField("Nassau_Average_USD_per_Gal", FloatType(), True), + StructField("New_York_City_Average_USD_per_Gal", FloatType(), True), + StructField("Rochester_Average_USD_per_Gal", FloatType(), True), + StructField("Syracuse_Average_USD_per_Gal", FloatType(), True), + StructField("Utica_Average_USD_per_Gal", FloatType(), True), + ] + ), }, } # START MAKING DATA DIRTY def trip_duration(duration): - '''Converts trip duration to other units''' + """Converts trip duration to other units""" if not duration: return None seconds = f"{str(duration)} s" minutes = f"{str(float(duration) / 60)} min" hours = f"{str(float(duration) / 3600)} h" - return random.choices([seconds, minutes, hours, - str(random.randint(-1000, -1))], - weights=[0.3, 0.3, 0.3, 0.1])[0] + return random.choices( + [seconds, minutes, hours, str(random.randint(-1000, -1))], + weights=[0.3, 0.3, 0.3, 0.1], + )[0] def station_name(name): - '''Replaces '&' with '/' with a 50% chance''' + """Replaces '&' with '/' with a 50% chance""" if not name: return None return random.choice([name, name.replace("&", "/")]) def user_type(user): - '''Manipulates the user type string''' + """Manipulates the user type string""" if not user: return None - return random.choice([user, user.upper(), user.lower(), - "sub" if user == "Subscriber" else user, - "cust" if user == "Customer" else user]) + return random.choice( + [ + user, + user.upper(), + user.lower(), + "sub" if user == "Subscriber" else user, + "cust" if user == "Customer" else user, + ] + ) def gender(s): - '''Manipulates the gender string''' + """Manipulates the gender string""" if not s: return None - return random.choice([s.upper(), s.lower(), - s[0].upper() if len(s) > 0 else "", - s[0].lower() if len(s) > 0 else ""]) + return random.choice( + [ + s.upper(), + s.lower(), + s[0].upper() if len(s) > 0 else "", + s[0].lower() if len(s) > 0 else "", + ] + ) def convert_angle(angle): - '''Converts long and lat to DMS notation''' + """Converts long and lat to DMS notation""" if angle is None: return None degrees = int(angle) minutes = int((angle - degrees) * 60) - seconds = int((angle - degrees - minutes/60) * 3600) - new_angle = str(degrees) + "\u00B0" + \ - str(minutes) + "'" + str(seconds) + '"' - return random.choices([str(angle), new_angle], - weights=[0.55, 0.45])[0] + seconds = int((angle - degrees - minutes / 60) * 3600) + new_angle = f"{degrees}\u00B0{minutes}'{seconds}\"" + return random.choices([str(angle), new_angle], weights=[0.55, 0.45])[0] def create_bigquery_dataset(): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f'{client.project}.{DATASET_NAME}' + dataset_id = f"{client.project}.{DATASET_NAME}" dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) def write_to_bigquery(df, table_name): - '''Write a dataframe to BigQuery''' + """Write a dataframe to BigQuery""" client = bigquery.Client() - dataset_id 
= f'{client.project}.{DATASET_NAME}' + dataset_id = f"{client.project}.{DATASET_NAME}" # Saving the data to BigQuery - df.write.format('bigquery') \ - .option('table', f"{dataset_id}.{table_name}") \ - .save() + df.write.format("bigquery").option("table", f"{dataset_id}.{table_name}").save() print(f"Table {table_name} successfully written to BigQuery") @@ -121,7 +129,7 @@ def main(): # Create a SparkSession under the name "setup". Viewable via the Spark UI spark = SparkSession.builder.appName("setup").getOrCreate() - spark.conf.set('temporaryGcsBucket', BUCKET_NAME) + spark.conf.set("temporaryGcsBucket", BUCKET_NAME) create_bigquery_dataset() @@ -129,7 +137,7 @@ def main(): test = False # Check whether or not the job is running as a test - if '--test' in sys.argv: + if "--test" in sys.argv: test = True print("A subset of the whole dataset will be uploaded to BigQuery") else: @@ -137,14 +145,14 @@ def main(): # Ingest External Datasets for table_name, data in EXTERNAL_TABLES.items(): - df = spark.createDataFrame(pd.read_csv(data["url"]), - schema=data["schema"]) + df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) write_to_bigquery(df, table_name) # Check if table exists try: - df = spark.read.format('bigquery').option('table', TABLE).load() + df = spark.read.format("bigquery").option("table", TABLE).load() + # if we are running a test, perform computations on a subset of the data if test: df = df.sample(False, 0.00001) except Py4JJavaError: @@ -152,28 +160,28 @@ def main(): return # Declare dictionary with keys column names and values user defined - # functions and return types + # functions and return types udf_map = { - 'tripduration': (trip_duration, StringType()), - 'start_station_name': (station_name, StringType()), - 'start_station_latitude': (convert_angle, StringType()), - 'start_station_longitude': (convert_angle, StringType()), - 'end_station_name': (station_name, StringType()), - 'end_station_latitude': (convert_angle, StringType()), - 'end_station_longitude': (convert_angle, StringType()), - 'usertype': (user_type, StringType()), - 'gender': (gender, StringType()), + "tripduration": (trip_duration, StringType()), + "start_station_name": (station_name, StringType()), + "start_station_latitude": (convert_angle, StringType()), + "start_station_longitude": (convert_angle, StringType()), + "end_station_name": (station_name, StringType()), + "end_station_latitude": (convert_angle, StringType()), + "end_station_longitude": (convert_angle, StringType()), + "usertype": (user_type, StringType()), + "gender": (gender, StringType()), } # Declare which columns to set some values to null randomly null_columns = [ - 'tripduration', - 'starttime', - 'stoptime', - 'start_station_latitude', - 'start_station_longitude', - 'end_station_latitude', - 'end_station_longitude', + "tripduration", + "starttime", + "stoptime", + "start_station_latitude", + "start_station_longitude", + "end_station_latitude", + "end_station_longitude", ] # Dirty the columns @@ -181,7 +189,7 @@ def main(): df = df.withColumn(name, UserDefinedFunction(*udf)(name)) # Format the datetimes correctly - for name in ['starttime', 'stoptime']: + for name in ["starttime", "stoptime"]: df = df.withColumn(name, date_format(name, "yyyy-MM-dd'T'HH:mm:ss")) # Randomly set about 5% of the values in some columns to null @@ -194,9 +202,9 @@ def main(): # Create final dirty dataframe df = df.union(dup_df) - print('Uploading citibike dataset...') + print("Uploading citibike dataset...") write_to_bigquery(df, 
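# Illustrative sketch (not part of the original patches): the two pieces the
# BigQuery write above relies on — a temporary GCS staging bucket for the
# spark-bigquery connector and a fully qualified "project.dataset.table" target.
# Bucket, project and table names are placeholders; the connector jar must be
# supplied at submit time, as setup.sh does.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("bq-write-demo").getOrCreate()
spark.conf.set("temporaryGcsBucket", "my-staging-bucket")  # placeholder bucket

df = spark.createDataFrame([(1, "a")], ["id", "value"])
(
    df.write.format("bigquery")
    .option("table", "my-project.my_dataset.my_table")  # placeholder target
    .save()
)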
CITIBIKE_TABLE_NAME) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 298576f6039..20dd0d9ad85 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -16,65 +16,47 @@ # Set global variables ID = uuid.uuid4() -PROJECT = os.environ['GOOGLE_CLOUD_PROJECT'] +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] REGION = "us-central1" -CLUSTER_NAME = f'setup-test-{ID}' -BUCKET_NAME = f'setup-test-{ID}' -DATASET_NAME = f'setup-test-{ID}'.replace("-", "_") +CLUSTER_NAME = f"setup-test-{ID}" +BUCKET_NAME = f"setup-test-{ID}" +DATASET_NAME = f"setup-test-{ID}".replace("-", "_") CITIBIKE_TABLE = "RAW_DATA" DESTINATION_BLOB_NAME = "setup.py" -JOB_FILE_NAME = f'gs://{BUCKET_NAME}/setup.py' +JOB_FILE_NAME = f"gs://{BUCKET_NAME}/setup.py" TABLE_NAMES = [ CITIBIKE_TABLE, "gas_prices", ] JOB_DETAILS = { # Job configuration - 'placement': { - 'cluster_name': CLUSTER_NAME - }, - 'pyspark_job': { - 'main_python_file_uri': JOB_FILE_NAME, - 'args': [ - BUCKET_NAME, - DATASET_NAME, - "--test", - ], - "jar_file_uris": [ - "gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar" - ], + "placement": {"cluster_name": CLUSTER_NAME}, + "pyspark_job": { + "main_python_file_uri": JOB_FILE_NAME, + "args": [BUCKET_NAME, DATASET_NAME, "--test",], + "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], }, } CLUSTER_DATA = { # Create cluster configuration - 'project_id': PROJECT, - 'cluster_name': CLUSTER_NAME, - 'config': { - 'gce_cluster_config': { - 'zone_uri': '', - }, - 'master_config': { - 'num_instances': 1, - 'machine_type_uri': 'n1-standard-8' - }, - 'worker_config': { - 'num_instances': 6, - 'machine_type_uri': 'n1-standard-8' - }, + "project_id": PROJECT, + "cluster_name": CLUSTER_NAME, + "config": { + "gce_cluster_config": {"zone_uri": "",}, + "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-8"}, + "worker_config": {"num_instances": 6, "machine_type_uri": "n1-standard-8"}, "software_config": { "image_version": "1.5.4-debian10", - "optional_components": [ - "ANACONDA" - ], - } - } + "optional_components": ["ANACONDA"], + }, + }, } @pytest.fixture(autouse=True) def setup_and_teardown_cluster(): # Create cluster using cluster client - cluster_client = dataproc.ClusterControllerClient(client_options={ - 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' - }) + cluster_client = dataproc.ClusterControllerClient( + client_options={"api_endpoint": f"{REGION}-dataproc.googleapis.com:443"} + ) operation = cluster_client.create_cluster(PROJECT, REGION, CLUSTER_DATA) # Wait for cluster to provision @@ -83,8 +65,7 @@ def setup_and_teardown_cluster(): yield # Delete cluster - operation = cluster_client.delete_cluster(PROJECT, REGION, - CLUSTER_NAME) + operation = cluster_client.delete_cluster(PROJECT, REGION, CLUSTER_NAME) operation.result() @@ -132,8 +113,9 @@ def get_dataproc_job_output(result): def assert_table_success_message(table_name, out): """Check table upload success message was printed in job logs.""" - assert re.search(f"Table {table_name} successfully written to BigQuery", out), \ - f"Table {table_name} sucess message not printed in job logs" + assert re.search( + f"Table {table_name} successfully written to BigQuery", out + ), f"Table {table_name} sucess message not printed in job logs" def test_setup(): @@ -141,11 +123,12 @@ def test_setup(): Check table upload success message 
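# Illustrative sketch (not part of the original patches): the shape of the
# autouse fixtures above — each one provisions a resource, yields to the test,
# then tears the resource down so a failing test still cleans up. Shown here for
# the GCS bucket only; the file name is a placeholder.
import uuid

from google.cloud import storage
import pytest

BUCKET_NAME = f"setup-test-{uuid.uuid4()}"


@pytest.fixture(autouse=True)
def setup_and_teardown_bucket():
    storage_client = storage.Client()
    bucket = storage_client.create_bucket(BUCKET_NAME)
    bucket.blob("setup.py").upload_from_filename("setup.py")

    yield  # the test body runs here

    # force=True also deletes any objects the job left behind
    storage_client.get_bucket(BUCKET_NAME).delete(force=True)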
as well as data in the table itself""" # Submit job to dataproc cluster - job_client = dataproc.JobControllerClient(client_options={ - 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' - }) - response = job_client.submit_job_as_operation(project_id=PROJECT, region=REGION, - job=JOB_DETAILS) + job_client = dataproc.JobControllerClient( + client_options={"api_endpoint": f"{REGION}-dataproc.googleapis.com:443"} + ) + response = job_client.submit_job_as_operation( + project_id=PROJECT, region=REGION, job=JOB_DETAILS + ) # Wait for job to complete result = response.result() @@ -160,14 +143,42 @@ def test_setup(): # Query BigQuery Table client = bigquery.Client() + dms_regex = "-?[0-9]+\u00B0-?[0-9]+'-?[0-9]+\"" + regex_dict = { - "tripduration": ["(\\d+(?:\\.\\d+)?) s", "(\\d+(?:\\.\\d+)?) min", "(\\d+(?:\\.\\d+)?) h"], - "gender": ['f', 'F', 'm', 'M', 'u', 'U', 'male', 'MALE', 'female', 'FEMALE', 'unknown', 'UNKNOWN'], - "start_station_latitude": ["[0-9]+" + u"\u00B0" + "[0-9]+\'[0-9]+\""], - "start_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], - "end_station_latitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], - "end_station_longitude": ["-?[0-9]+" + u"\u00B0" + "-?[0-9]+\'-?[0-9]+\""], - "usertype": ["Subscriber", "subscriber", "SUBSCRIBER", "sub", "Customer", "customer", "CUSTOMER", "cust"], + "tripduration": [ + "(\\d+(?:\\.\\d+)?) s", + "(\\d+(?:\\.\\d+)?) min", + "(\\d+(?:\\.\\d+)?) h", + ], + "gender": [ + "f", + "F", + "m", + "M", + "u", + "U", + "male", + "MALE", + "female", + "FEMALE", + "unknown", + "UNKNOWN", + ], + "start_station_latitude": [dms_regex], + "start_station_longitude": [dms_regex], + "end_station_latitude": [dms_regex], + "end_station_longitude": [dms_regex], + "usertype": [ + "Subscriber", + "subscriber", + "SUBSCRIBER", + "sub", + "Customer", + "customer", + "CUSTOMER", + "cust", + ], } for column_name, regexes in regex_dict.items(): @@ -186,5 +197,6 @@ def test_setup(): if row and re.match(f"\\A{regex}\\Z", row): found = True break - assert found, \ - f"No matches to regular expression \"{regex}\" found in column {column_name}" + assert ( + found + ), f'No matches to regular expression "{regex}" found in column {column_name}' From d35b85506f0754ac3dcbf6b686f44a75cb28bb22 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 12 Aug 2020 17:28:21 -0400 Subject: [PATCH 54/59] change test variables names to match data cleaning --- .../data-ingestion/setup_test.py | 77 ++++++++++--------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 20dd0d9ad85..039b22a88d2 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -1,5 +1,4 @@ -"""Test file for the setup job in the Data Science Onramp sample application -Creates a test Dataproc cluster and runs the job with a --test flag. +"""Test file for the setup job in the Data Science Onramp sample application Creates a test Dataproc cluster and runs the job with a --test flag. The job uploads a subset of the data to BigQuery. Then, data is pulled from BigQuery and checks are made to see if the data is dirty. 
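# Illustrative sketch (not part of the original patches): the submit-and-read-
# driver-output flow used by test_setup(), condensed into one place. Project,
# region, cluster and file names are placeholders.
from google.cloud import dataproc_v1 as dataproc
from google.cloud import storage

region = "us-central1"
job_client = dataproc.JobControllerClient(
    client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)
job_config = {
    "placement": {"cluster_name": "my-cluster"},
    "pyspark_job": {"main_python_file_uri": "gs://my-bucket/setup.py"},
}
operation = job_client.submit_job_as_operation(
    project_id="my-project", region=region, job=job_config
)
job = operation.result()  # blocks until the Dataproc job finishes

# The driver log lives in GCS; the first chunk carries the ".000000000" suffix
bucket_name, blob_path = job.driver_output_resource_uri[len("gs://"):].split("/", 1)
out = storage.Client().bucket(bucket_name).blob(blob_path + ".000000000")
print(out.download_as_string().decode("utf-8"))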
""" @@ -13,51 +12,55 @@ from google.cloud import storage import pytest -# Set global variables -ID = uuid.uuid4() - -PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] -REGION = "us-central1" -CLUSTER_NAME = f"setup-test-{ID}" -BUCKET_NAME = f"setup-test-{ID}" -DATASET_NAME = f"setup-test-{ID}".replace("-", "_") -CITIBIKE_TABLE = "RAW_DATA" -DESTINATION_BLOB_NAME = "setup.py" -JOB_FILE_NAME = f"gs://{BUCKET_NAME}/setup.py" -TABLE_NAMES = [ - CITIBIKE_TABLE, +# GCP Project +PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"] +TEST_ID = uuid.uuid4() + +# Google Cloud Storage constants +BUCKET_NAME = f"setup-test-{TEST_ID}" +BUCKET_BLOB = "setup.py" + +BQ_DATASET = f"setup-test-{TEST_ID}".replace("-", "_") +BQ_CITIBIKE_TABLE = "RAW_DATA" +BQ_TABLES = [ + BQ_CITIBIKE_TABLE, "gas_prices", ] -JOB_DETAILS = { # Job configuration - "placement": {"cluster_name": CLUSTER_NAME}, - "pyspark_job": { - "main_python_file_uri": JOB_FILE_NAME, - "args": [BUCKET_NAME, DATASET_NAME, "--test",], - "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], - }, -} -CLUSTER_DATA = { # Create cluster configuration - "project_id": PROJECT, - "cluster_name": CLUSTER_NAME, + +# Dataproc constants +DATAPROC_CLUSTER = f"setup-test-{TEST_ID}" +CLUSTER_REGION = "us-central1" +CLUSTER_IMAGE = "1.5.4-debian10" +CLUSTER_CONFIG = { # Dataproc cluster configuration + "project_id": PROJECT_ID, + "cluster_name": DATAPROC_CLUSTER, "config": { "gce_cluster_config": {"zone_uri": "",}, "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-8"}, "worker_config": {"num_instances": 6, "machine_type_uri": "n1-standard-8"}, "software_config": { - "image_version": "1.5.4-debian10", + "image_version": CLUSTER_IMAGE, "optional_components": ["ANACONDA"], }, }, } +DATAPROC_JOB = { # Dataproc job configuration + "placement": {"cluster_name": DATAPROC_CLUSTER}, + "pyspark_job": { + "main_python_file_uri": f"gs://{BUCKET_NAME}/{BUCKET_BLOB}", + "args": [BUCKET_NAME, BQ_DATASET, "--test",], + "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], + }, +} @pytest.fixture(autouse=True) def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient( - client_options={"api_endpoint": f"{REGION}-dataproc.googleapis.com:443"} + client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) - operation = cluster_client.create_cluster(PROJECT, REGION, CLUSTER_DATA) + operation = cluster_client.create_cluster(PROJECT_ID, CLUSTER_REGION, CLUSTER_CONFIG) # Wait for cluster to provision operation.result() @@ -65,7 +68,7 @@ def setup_and_teardown_cluster(): yield # Delete cluster - operation = cluster_client.delete_cluster(PROJECT, REGION, CLUSTER_NAME) + operation = cluster_client.delete_cluster(PROJECT_ID, CLUSTER_REGION, DATAPROC_CLUSTER) operation.result() @@ -76,7 +79,7 @@ def setup_and_teardown_bucket(): bucket = storage_client.create_bucket(BUCKET_NAME) # Upload file - blob = bucket.blob(DESTINATION_BLOB_NAME) + blob = bucket.blob(BUCKET_BLOB) blob.upload_from_filename("setup.py") yield @@ -89,12 +92,12 @@ def setup_and_teardown_bucket(): @pytest.fixture(autouse=True) def setup_and_teardown_bq_dataset(): # Dataset is created by the client - bq_client = bigquery.Client(project=PROJECT) + bq_client = bigquery.Client(project=PROJECT_ID) yield # Delete Dataset - bq_client.delete_dataset(DATASET_NAME, delete_contents=True) + bq_client.delete_dataset(BQ_DATASET, delete_contents=True) def get_blob_from_path(path): @@ -124,10 +127,10 
@@ def test_setup(): # Submit job to dataproc cluster job_client = dataproc.JobControllerClient( - client_options={"api_endpoint": f"{REGION}-dataproc.googleapis.com:443"} + client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) response = job_client.submit_job_as_operation( - project_id=PROJECT, region=REGION, job=JOB_DETAILS + project_id=PROJECT_ID, region=CLUSTER_REGION, job=JOB_DETAILS ) # Wait for job to complete @@ -137,7 +140,7 @@ def test_setup(): out = get_dataproc_job_output(result) # Check logs to see if tables were uploaded - for table_name in TABLE_NAMES: + for table_name in BQ_TABLES: assert_table_success_message(table_name, out) # Query BigQuery Table @@ -182,7 +185,7 @@ def test_setup(): } for column_name, regexes in regex_dict.items(): - query = f"SELECT {column_name} FROM `{PROJECT}.{DATASET_NAME}.{CITIBIKE_TABLE}`" + query = f"SELECT {column_name} FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_CITIBIKE_TABLE}`" query_job = client.query(query) result = query_job.result() From 6105f79f052f6738d2416eec981f18c933191db0 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 12 Aug 2020 17:30:16 -0400 Subject: [PATCH 55/59] blacken setup_test file --- data-science-onramp/data-ingestion/setup_test.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 039b22a88d2..fd5f3ce75fc 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -44,7 +44,7 @@ }, }, } -DATAPROC_JOB = { # Dataproc job configuration +DATAPROC_JOB = { # Dataproc job configuration "placement": {"cluster_name": DATAPROC_CLUSTER}, "pyspark_job": { "main_python_file_uri": f"gs://{BUCKET_NAME}/{BUCKET_BLOB}", @@ -60,7 +60,9 @@ def setup_and_teardown_cluster(): cluster_client = dataproc.ClusterControllerClient( client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) - operation = cluster_client.create_cluster(PROJECT_ID, CLUSTER_REGION, CLUSTER_CONFIG) + operation = cluster_client.create_cluster( + PROJECT_ID, CLUSTER_REGION, CLUSTER_CONFIG + ) # Wait for cluster to provision operation.result() @@ -68,7 +70,9 @@ def setup_and_teardown_cluster(): yield # Delete cluster - operation = cluster_client.delete_cluster(PROJECT_ID, CLUSTER_REGION, DATAPROC_CLUSTER) + operation = cluster_client.delete_cluster( + PROJECT_ID, CLUSTER_REGION, DATAPROC_CLUSTER + ) operation.result() @@ -185,7 +189,9 @@ def test_setup(): } for column_name, regexes in regex_dict.items(): - query = f"SELECT {column_name} FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_CITIBIKE_TABLE}`" + query = ( + f"SELECT {column_name} FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_CITIBIKE_TABLE}`" + ) query_job = client.query(query) result = query_job.result() From 35ec8cb10582077fca453e03a8e194f1a4e53655 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Wed, 12 Aug 2020 17:59:11 -0400 Subject: [PATCH 56/59] fix unchanged variable name --- data-science-onramp/data-ingestion/setup_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index fd5f3ce75fc..8e7ab8a40d7 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -134,7 +134,7 @@ def test_setup(): client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) response = 
job_client.submit_job_as_operation( - project_id=PROJECT_ID, region=CLUSTER_REGION, job=JOB_DETAILS + project_id=PROJECT_ID, region=CLUSTER_REGION, job=DATAPROC_JOB ) # Wait for job to complete From 9561f35ee29130cd9df9c0438e3e0657a7b23c5a Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 13 Aug 2020 11:31:10 -0400 Subject: [PATCH 57/59] WIP: address PR comments --- .../data-ingestion/requirements-test.txt | 2 +- .../data-ingestion/requirements.txt | 9 +++---- data-science-onramp/data-ingestion/setup.py | 25 ++++++++++--------- .../data-ingestion/setup_test.py | 11 ++++---- 4 files changed, 24 insertions(+), 23 deletions(-) diff --git a/data-science-onramp/data-ingestion/requirements-test.txt b/data-science-onramp/data-ingestion/requirements-test.txt index 781d4326c94..2018c08113a 100644 --- a/data-science-onramp/data-ingestion/requirements-test.txt +++ b/data-science-onramp/data-ingestion/requirements-test.txt @@ -1 +1 @@ -pytest==5.3.2 +pytest==6.0.0 diff --git a/data-science-onramp/data-ingestion/requirements.txt b/data-science-onramp/data-ingestion/requirements.txt index e0328e4aec9..b5edbdf1ad7 100644 --- a/data-science-onramp/data-ingestion/requirements.txt +++ b/data-science-onramp/data-ingestion/requirements.txt @@ -1,7 +1,6 @@ -grpcio==1.29.0 -google-auth==1.16.0 -google-auth-httplib2==0.0.3 -google-cloud==0.34.0 +#grpcio==1.29.0 +#google-auth==1.16.0 +#google-auth-httplib2==0.0.3 google-cloud-storage==1.28.1 -google-cloud-dataproc==0.8.0 +google-cloud-dataproc==2.0.0 google-cloud-bigquery==1.25.0 diff --git a/data-science-onramp/data-ingestion/setup.py b/data-science-onramp/data-ingestion/setup.py index cc5245dfdcf..6921947ddca 100644 --- a/data-science-onramp/data-ingestion/setup.py +++ b/data-science-onramp/data-ingestion/setup.py @@ -18,9 +18,6 @@ from pyspark.sql.functions import date_format, expr, UserDefinedFunction, when from pyspark.sql.types import FloatType, StringType, StructField, StructType - -BUCKET_NAME = sys.argv[1] -DATASET_NAME = sys.argv[2] TABLE = "bigquery-public-data.new_york_citibike.citibike_trips" CITIBIKE_TABLE_NAME = "RAW_DATA" EXTERNAL_TABLES = { @@ -96,7 +93,7 @@ def gender(s): def convert_angle(angle): """Converts long and lat to DMS notation""" - if angle is None: + if not angle: return None degrees = int(angle) minutes = int((angle - degrees) * 60) @@ -105,19 +102,19 @@ def convert_angle(angle): return random.choices([str(angle), new_angle], weights=[0.55, 0.45])[0] -def create_bigquery_dataset(): +def create_bigquery_dataset(dataset_name): # Create BigQuery Dataset client = bigquery.Client() - dataset_id = f"{client.project}.{DATASET_NAME}" + dataset_id = f"{client.project}.{dataset_name}" dataset = bigquery.Dataset(dataset_id) dataset.location = "US" dataset = client.create_dataset(dataset) -def write_to_bigquery(df, table_name): +def write_to_bigquery(df, table_name, dataset_name): """Write a dataframe to BigQuery""" client = bigquery.Client() - dataset_id = f"{client.project}.{DATASET_NAME}" + dataset_id = f"{client.project}.{dataset_name}" # Saving the data to BigQuery df.write.format("bigquery").option("table", f"{dataset_id}.{table_name}").save() @@ -126,12 +123,16 @@ def write_to_bigquery(df, table_name): def main(): - # Create a SparkSession under the name "setup". 
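# Illustrative sketch (not part of the original patches): the refactor above
# moves BUCKET_NAME and DATASET_NAME out of module scope and into main(), still
# read from raw sys.argv. An argparse variant — an alternative, not what the
# sample uses — would make the positional arguments and the --test flag explicit.
import argparse


def parse_args(argv=None):
    parser = argparse.ArgumentParser(description="Data Science Onramp setup job")
    parser.add_argument("bucket_name", help="GCS bucket used as the connector's temporary staging bucket")
    parser.add_argument("dataset_name", help="BigQuery dataset to create and write to")
    parser.add_argument("--test", action="store_true", help="upload only a small sample of the data")
    return parser.parse_args(argv)


args = parse_args(["my-bucket", "my_dataset", "--test"])  # placeholder values
print(args.bucket_name, args.dataset_name, args.test)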
Viewable via the Spark UI + # Get command line arguments + BUCKET_NAME = sys.argv[1] + DATASET_NAME = sys.argv[2] + + # Create a SparkSession under the name "setup" spark = SparkSession.builder.appName("setup").getOrCreate() spark.conf.set("temporaryGcsBucket", BUCKET_NAME) - create_bigquery_dataset() + create_bigquery_dataset(DATASET_NAME) # Whether we are running the job as a test test = False @@ -147,7 +148,7 @@ def main(): for table_name, data in EXTERNAL_TABLES.items(): df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"]) - write_to_bigquery(df, table_name) + write_to_bigquery(df, table_name, DATASET_NAME) # Check if table exists try: @@ -203,7 +204,7 @@ def main(): df = df.union(dup_df) print("Uploading citibike dataset...") - write_to_bigquery(df, CITIBIKE_TABLE_NAME) + write_to_bigquery(df, CITIBIKE_TABLE_NAME, DATASET_NAME) if __name__ == "__main__": diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index 8e7ab8a40d7..d72fdb92ec1 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -35,7 +35,7 @@ "project_id": PROJECT_ID, "cluster_name": DATAPROC_CLUSTER, "config": { - "gce_cluster_config": {"zone_uri": "",}, + "gce_cluster_config": {"zone_uri": ""}, "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-8"}, "worker_config": {"num_instances": 6, "machine_type_uri": "n1-standard-8"}, "software_config": { @@ -48,7 +48,7 @@ "placement": {"cluster_name": DATAPROC_CLUSTER}, "pyspark_job": { "main_python_file_uri": f"gs://{BUCKET_NAME}/{BUCKET_BLOB}", - "args": [BUCKET_NAME, BQ_DATASET, "--test",], + "args": [BUCKET_NAME, BQ_DATASET, "--test"], "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], }, } @@ -58,10 +58,11 @@ def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient( - client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} + #client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) + operation = cluster_client.create_cluster( - PROJECT_ID, CLUSTER_REGION, CLUSTER_CONFIG + project_id=PROJECT_ID, region=CLUSTER_REGION, cluster=CLUSTER_CONFIG ) # Wait for cluster to provision @@ -71,7 +72,7 @@ def setup_and_teardown_cluster(): # Delete cluster operation = cluster_client.delete_cluster( - PROJECT_ID, CLUSTER_REGION, DATAPROC_CLUSTER + project_id=PROJECT_ID, region=CLUSTER_REGION, name=DATAPROC_CLUSTER ) operation.result() From 3242654158e0c01a24f2f004933345d2ce043ab6 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 13 Aug 2020 17:40:10 -0400 Subject: [PATCH 58/59] apply temporary fix for ANACONDA optional component --- data-science-onramp/data-ingestion/setup_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data-science-onramp/data-ingestion/setup_test.py b/data-science-onramp/data-ingestion/setup_test.py index d72fdb92ec1..c325b5a0e98 100644 --- a/data-science-onramp/data-ingestion/setup_test.py +++ b/data-science-onramp/data-ingestion/setup_test.py @@ -40,7 +40,7 @@ "worker_config": {"num_instances": 6, "machine_type_uri": "n1-standard-8"}, "software_config": { "image_version": CLUSTER_IMAGE, - "optional_components": ["ANACONDA"], + "optional_components": [5], }, }, } @@ -58,7 +58,7 @@ def setup_and_teardown_cluster(): # Create cluster using cluster client cluster_client = dataproc.ClusterControllerClient( - 
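# Illustrative sketch (not part of the original patches): the google-cloud-dataproc
# 2.x calling convention the fixture above is migrating to — keyword arguments
# instead of the 1.x positional (project, region, cluster) form. Project, region
# and cluster values are placeholders.
from google.cloud import dataproc_v1 as dataproc

region = "us-central1"
cluster_client = dataproc.ClusterControllerClient(
    client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)

cluster_config = {
    "project_id": "my-project",
    "cluster_name": "my-cluster",
    "config": {"master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-4"}},
}

# Both calls return long-running operations; .result() blocks until they finish
cluster_client.create_cluster(
    project_id="my-project", region=region, cluster=cluster_config
).result()
cluster_client.delete_cluster(
    project_id="my-project", region=region, cluster_name="my-cluster"
).result()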
#client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} + client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} ) operation = cluster_client.create_cluster( @@ -72,7 +72,7 @@ def setup_and_teardown_cluster(): # Delete cluster operation = cluster_client.delete_cluster( - project_id=PROJECT_ID, region=CLUSTER_REGION, name=DATAPROC_CLUSTER + project_id=PROJECT_ID, region=CLUSTER_REGION, cluster_name=DATAPROC_CLUSTER ) operation.result() From b82059b1de61c3b1f3eae112629098ce378fa0e3 Mon Sep 17 00:00:00 2001 From: Diego Lopez Date: Thu, 13 Aug 2020 18:24:09 -0400 Subject: [PATCH 59/59] remove data cleaning files --- data-science-onramp/data-cleaning/clean.py | 226 ------------------ data-science-onramp/data-cleaning/clean.sh | 12 - .../data-cleaning/clean_test.py | 144 ----------- .../data-cleaning/requirements-test.txt | 1 - .../data-cleaning/requirements.txt | 6 - 5 files changed, 389 deletions(-) delete mode 100644 data-science-onramp/data-cleaning/clean.py delete mode 100755 data-science-onramp/data-cleaning/clean.sh delete mode 100644 data-science-onramp/data-cleaning/clean_test.py delete mode 100644 data-science-onramp/data-cleaning/requirements-test.txt delete mode 100644 data-science-onramp/data-cleaning/requirements.txt diff --git a/data-science-onramp/data-cleaning/clean.py b/data-science-onramp/data-cleaning/clean.py deleted file mode 100644 index 3c928dd8e9d..00000000000 --- a/data-science-onramp/data-cleaning/clean.py +++ /dev/null @@ -1,226 +0,0 @@ -import datetime -import re -import sys -import time - -from google.cloud import storage -from py4j.protocol import Py4JJavaError -from pyspark.sql import SparkSession -from pyspark.sql.functions import UserDefinedFunction -from pyspark.sql.types import FloatType, IntegerType, StringType - - -PROJECT_ID = sys.argv[1] -BUCKET_NAME = sys.argv[2] -TABLE = f"{PROJECT_ID}.new_york_citibike_trips.RAW_DATA" - - -def trip_duration_udf(duration): - """Convert trip duration to seconds. Return None if negative.""" - if not duration: - return None - - time = re.match(r"\d*.\d*", duration) - - if not time: - return None - - time = float(time[0]) - - if time < 0: - return None - - if "m" in duration: - time *= 60 - elif "h" in duration: - time *= 60 * 60 - - return int(time) - - -def station_name_udf(name): - """Replaces '/' with '&'.""" - return name.replace("/", "&") if name else None - - -def user_type_udf(user): - """Converts user type to 'Subscriber' or 'Customer'.""" - if not user: - return None - - if user.lower().startswith("sub"): - return "Subscriber" - elif user.lower().startswith("cust"): - return "Customer" - - -def gender_udf(gender): - """Converts gender to 'Male' or 'Female'.""" - if not gender: - return None - - if gender.lower().startswith("m"): - return "Male" - elif gender.lower().startswith("f"): - return "Female" - - -def angle_udf(angle): - """Converts DMS notation to degrees. Return None if not in DMS or degrees notation.""" - if not angle: - return None - - dms = re.match(r'(-?\d*).(-?\d*)\'(-?\d*)"', angle) - if dms: - return int(dms[1]) + int(dms[2]) / 60 + int(dms[3]) / (60 * 60) - - degrees = re.match(r"\d*.\d*", angle) - if degrees: - return float(degrees[0]) - - -def compute_time(duration, start, end): - """Calculates duration, start time, and end time from each other if one value is null.""" - time_format = "%Y-%m-%dT%H:%M:%S" - - # Transform to datetime objects - if start: - # Round to nearest second - if "." 
in start: - start = start[: start.index(".")] - # Convert to datetime - start = datetime.datetime.strptime(start, time_format) - if end: - # Round to nearest second - if "." in end: - end = end[: end.index(".")] - # Convert to datetime - end = datetime.datetime.strptime(end, time_format) - if duration: - # Convert to timedelta - duration = datetime.timedelta(seconds=duration) - - # Calculate missing value - if start and end and not duration: - duration = end - start - elif duration and end and not start: - start = end - duration - elif duration and start and not end: - end = start + duration - - # Transform to primitive types - if duration: - duration = int(duration.total_seconds()) - if start: - start = start.strftime(time_format) - if end: - end = end.strftime(time_format) - - return (duration, start, end) - - -def compute_duration_udf(duration, start, end): - """Calculates duration from start and end time if null.""" - return compute_time(duration, start, end)[0] - - -def compute_start_udf(duration, start, end): - """Calculates start time from duration and end time if null.""" - return compute_time(duration, start, end)[1] - - -def compute_end_udf(duration, start, end): - """Calculates end time from duration and start time if null.""" - return compute_time(duration, start, end)[2] - - -if __name__ == "__main__": - # Create a SparkSession, viewable via the Spark UI - spark = SparkSession.builder.appName("data_cleaning").getOrCreate() - - # Load data into dataframe if table exists - try: - df = spark.read.format("bigquery").option("table", TABLE).load() - except Py4JJavaError as e: - raise Exception(f"Error reading {TABLE}") from e - - # Single-parameter udfs - udfs = { - "start_station_name": UserDefinedFunction(station_name_udf, StringType()), - "end_station_name": UserDefinedFunction(station_name_udf, StringType()), - "tripduration": UserDefinedFunction(trip_duration_udf, IntegerType()), - "usertype": UserDefinedFunction(user_type_udf, StringType()), - "gender": UserDefinedFunction(gender_udf, StringType()), - "start_station_latitude": UserDefinedFunction(angle_udf, FloatType()), - "start_station_longitude": UserDefinedFunction(angle_udf, FloatType()), - "end_station_latitude": UserDefinedFunction(angle_udf, FloatType()), - "end_station_longitude": UserDefinedFunction(angle_udf, FloatType()), - } - - for name, udf in udfs.items(): - df = df.withColumn(name, udf(name)) - - # Multi-parameter udfs - multi_udfs = { - "tripduration": { - "udf": UserDefinedFunction(compute_duration_udf, IntegerType()), - "params": ("tripduration", "starttime", "stoptime"), - }, - "starttime": { - "udf": UserDefinedFunction(compute_start_udf, StringType()), - "params": ("tripduration", "starttime", "stoptime"), - }, - "stoptime": { - "udf": UserDefinedFunction(compute_end_udf, StringType()), - "params": ("tripduration", "starttime", "stoptime"), - }, - } - - for name, obj in multi_udfs.items(): - df = df.withColumn(name, obj["udf"](*obj["params"])) - - # Display sample of rows - df.sample(False, 0.001).show(n=100) - - # Write results to GCS - if "--dry-run" in sys.argv: - print("Data will not be uploaded to GCS") - else: - # Set GCS temp location - path = str(time.time()) - temp_path = "gs://" + BUCKET_NAME + "/" + path - - # Write dataframe to temp location to preserve the data in final location - # This takes time, so final location should not be overwritten with partial data - print("Uploading data to GCS...") - ( - df.write - # gzip the output file - 
.options(codec="org.apache.hadoop.io.compress.GzipCodec") - # write as csv - .csv(temp_path) - ) - - # Get GCS bucket - storage_client = storage.Client() - source_bucket = storage_client.get_bucket(BUCKET_NAME) - - # Get all files in temp location - blobs = list(source_bucket.list_blobs(prefix=path)) - - # Copy files from temp location to the final location - # This is much quicker than the original write to the temp location - final_path = "clean_data/" - for blob in blobs: - file_match = re.match(path + r"/(part-\d*)[0-9a-zA-Z\-]*.csv.gz", blob.name) - if file_match: - new_blob = final_path + file_match[1] + ".csv.gz" - source_bucket.copy_blob(blob, source_bucket, new_blob) - - # Delete the temp location - for blob in blobs: - blob.delete() - - print( - "Data successfully uploaded to " + "gs://" + BUCKET_NAME + "/" + final_path - ) diff --git a/data-science-onramp/data-cleaning/clean.sh b/data-science-onramp/data-cleaning/clean.sh deleted file mode 100755 index cff45237ce6..00000000000 --- a/data-science-onramp/data-cleaning/clean.sh +++ /dev/null @@ -1,12 +0,0 @@ -# Submit a PySpark job via the Cloud Dataproc Jobs API -# Requires having PROJECT_ID, CLUSTER_NAME and BUCKET_NAME set as -# environment variables - -export CLUSTER_NAME=data-cleaning -export PROJECT_ID=data-science-onramp -export BUCKET_NAME=citibikevd - -gcloud dataproc jobs submit pyspark --cluster ${CLUSTER_NAME} \ - --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ - --driver-log-levels root=FATAL \ - clean.py -- ${PROJECT_ID} ${BUCKET_NAME} --dry-run \ No newline at end of file diff --git a/data-science-onramp/data-cleaning/clean_test.py b/data-science-onramp/data-cleaning/clean_test.py deleted file mode 100644 index ee28a65da01..00000000000 --- a/data-science-onramp/data-cleaning/clean_test.py +++ /dev/null @@ -1,144 +0,0 @@ -import os -import re -import uuid - -from google.cloud import dataproc_v1 as dataproc -from google.cloud import storage -import pytest - -# Set global variables -PROJECT = os.environ["GCLOUD_PROJECT"] -DATAPROC_CLUSTER = f"clean-test-{uuid.uuid4()}" -BUCKET_NAME = f"clean-test-code-{uuid.uuid4()}" -CLUSTER_REGION = "us-east4" -DESTINATION_BLOB_NAME = "clean.py" -JOB_FILE_NAME = f"gs://{BUCKET_NAME}/clean.py" -JOB_DETAILS = { # Job configuration - "placement": {"cluster_name": DATAPROC_CLUSTER}, - "pyspark_job": { - "main_python_file_uri": JOB_FILE_NAME, - "args": [PROJECT, BUCKET_NAME, "--dry-run",], - "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"], - }, -} -CLUSTER_IMAGE = "1.5.4-debian10" -CLUSTER_DATA = { # Create cluster configuration - "project_id": PROJECT, - "cluster_name": DATAPROC_CLUSTER, - "config": { - "gce_cluster_config": { - "zone_uri": "", - "metadata": {"PIP_PACKAGES": "google-cloud-storage"}, - }, - "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-8"}, - "worker_config": {"num_instances": 6, "machine_type_uri": "n1-standard-8"}, - "software_config": { - "image_version": CLUSTER_IMAGE, - "optional_components": ["ANACONDA"], - }, - }, -} - - -@pytest.fixture(autouse=True) -def setup_and_teardown_cluster(): - # Create cluster using cluster client - cluster_client = dataproc.ClusterControllerClient( - client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} - ) - operation = cluster_client.create_cluster(PROJECT, CLUSTER_REGION, CLUSTER_DATA) - - # Wait for cluster to provision - operation.result() - - yield - - # Delete cluster - operation = cluster_client.delete_cluster(PROJECT, CLUSTER_REGION, 
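# Illustrative sketch (not part of the original patches): the GCS half of the
# pattern used by the removed clean.py — Spark writes gzipped part files to a
# throwaway prefix, then the storage client copies them to a stable prefix and
# deletes the originals. Bucket name and prefixes are placeholders.
import re

from google.cloud import storage

bucket = storage.Client().get_bucket("my-bucket")
temp_prefix = "1597351234.56"   # e.g. str(time.time()) used as the temp path
final_prefix = "clean_data/"

blobs = list(bucket.list_blobs(prefix=temp_prefix))
for blob in blobs:
    match = re.match(temp_prefix + r"/(part-\d+)[\w\-]*\.csv\.gz", blob.name)
    if match:
        bucket.copy_blob(blob, bucket, final_prefix + match[1] + ".csv.gz")
for blob in blobs:
    blob.delete()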
DATAPROC_CLUSTER) - operation.result() - - -@pytest.fixture(autouse=True) -def setup_and_teardown_bucket(): - # Create GCS Bucket - storage_client = storage.Client() - bucket = storage_client.create_bucket(BUCKET_NAME) - - # Upload file - blob = bucket.blob(DESTINATION_BLOB_NAME) - blob.upload_from_filename("clean.py") - - yield - - # Delete GCS bucket - bucket = storage_client.get_bucket(BUCKET_NAME) - bucket.delete(force=True) - - -def is_in_table(value, out): - return re.search(f"\\| *{value} *\\|", out) - - -def get_blob_from_path(path): - bucket_name = re.search("dataproc.+?/", path).group(0)[0:-1] - bucket = storage.Client().get_bucket(bucket_name) - output_location = re.search("google-cloud-dataproc.+", path).group(0) - return bucket.blob(output_location) - - -def test_clean(): - """Tests clean.py by submitting it to a Dataproc cluster""" - - # Submit job to Dataproc cluster - job_client = dataproc.JobControllerClient( - client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"} - ) - response = job_client.submit_job_as_operation( - project_id=PROJECT, region=CLUSTER_REGION, job=JOB_DETAILS - ) - - # Wait for job to complete - result = response.result() - - # Get job output - output_location = result.driver_output_resource_uri + ".000000000" - blob = get_blob_from_path(output_location) - out = blob.download_as_string().decode("utf-8") - - # trip duration - assert not is_in_table(r"\d*.\d* s", out) - assert not is_in_table(r"\d*.\d* min", out) - assert not is_in_table(r"\d*.\d* h", out) - - # station latitude & longitude - assert not is_in_table(r"\d+" + "\u00B0" + r"\d+\'\d+\"", out) - - assert is_in_table(r"\d*.\d*", out) - - # gender - assert not is_in_table("M", out) - assert not is_in_table("m", out) - assert not is_in_table("male", out) - assert not is_in_table("MALE", out) - assert not is_in_table("F", out) - assert not is_in_table("f", out) - assert not is_in_table("female", out) - assert not is_in_table("FEMALE", out) - assert not is_in_table("U", out) - assert not is_in_table("u", out) - assert not is_in_table("unknown", out) - assert not is_in_table("UNKNOWN", out) - - assert is_in_table("Male", out) - assert is_in_table("Female", out) - - # customer_plan - assert not is_in_table("subscriber", out) - assert not is_in_table("SUBSCRIBER", out) - assert not is_in_table("sub", out) - assert not is_in_table("customer", out) - assert not is_in_table("CUSTOMER", out) - assert not is_in_table("cust", out) - - assert is_in_table("Subscriber", out) - assert is_in_table("Customer", out) diff --git a/data-science-onramp/data-cleaning/requirements-test.txt b/data-science-onramp/data-cleaning/requirements-test.txt deleted file mode 100644 index 781d4326c94..00000000000 --- a/data-science-onramp/data-cleaning/requirements-test.txt +++ /dev/null @@ -1 +0,0 @@ -pytest==5.3.2 diff --git a/data-science-onramp/data-cleaning/requirements.txt b/data-science-onramp/data-cleaning/requirements.txt deleted file mode 100644 index f435423c623..00000000000 --- a/data-science-onramp/data-cleaning/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -grpcio==1.29.0 -google-auth==1.16.0 -google-auth-httplib2==0.0.3 -google-cloud==0.34.0 -google-cloud-storage==1.28.1 -google-cloud-dataproc==0.8.0 \ No newline at end of file