8000 Add python samples for BQML tutorials (#1691) · cevaris/python-docs-samples@7bbf70b · GitHub
[go: up one dir, main page]

Skip to content

Commit 7bbf70b

Browse files
authored
Add python samples for BQML tutorials (GoogleCloudPlatform#1691)
1 parent 68a4622 commit 7bbf70b

File tree

5 files changed

+1538
-0
lines changed

5 files changed

+1538
-0
lines changed
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# Copyright 2018 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# [START bqml_data_scientist_tutorial_import_and_client]
16+
from google.cloud import bigquery
17+
# [END bqml_data_scientist_tutorial_import_and_client]
18+
import pytest
19+
20+
# [START bqml_data_scientist_tutorial_import_and_client]
21+
client = bigquery.Client()
22+
# [END bqml_data_scientist_tutorial_import_and_client]
23+
24+
25+
@pytest.fixture
26+
def delete_dataset():
27+
yield
28+
client.delete_dataset(
29+
client.dataset('bqml_tutorial'), delete_contents=True)
30+
31+
32+
def test_data_scientist_tutorial(delete_dataset):
33+
# [START bqml_data_scientist_tutorial_create_dataset]
34+
dataset = bigquery.Dataset(client.dataset('bqml_tutorial'))
35+
dataset.location = 'US'
36+
client.create_dataset(dataset)
37+
# [END bqml_data_scientist_tutorial_create_dataset]
38+
39+
# [START bqml_data_scientist_tutorial_create_model]
40+
sql = """
41+
CREATE OR REPLACE MODEL `bqml_tutorial.sample_model`
42+
OPTIONS(model_type='logistic_reg') AS
43+
SELECT
44+
IF(totals.transactions IS NULL, 0, 1) AS label,
45+
IFNULL(device.operatingSystem, "") AS os,
46+
device.isMobile AS is_mobile,
47+
IFNULL(geoNetwork.country, "") AS country,
48+
IFNULL(totals.pageviews, 0) AS pageviews
49+
FROM
50+
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
51+
WHERE
52+
_TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
53+
"""
54+
df = client.query(sql).to_dataframe()
55+
print(df)
56+
# [END bqml_data_scientist_tutorial_create_model]
57+
58+
# [START bqml_data_scientist_tutorial_get_training_statistics]
59+
sql = """
60+
SELECT
61+
*
62+
FROM
63+
ML.TRAINING_INFO(MODEL `bqml_tutorial.sample_model`)
64+
"""
65+
df = client.query(sql).to_dataframe()
66+
print(df)
67+
# [END bqml_data_scientist_tutorial_get_training_statistics]
68+
69+
# [START bqml_data_scientist_tutorial_evaluate_model]
70+
sql = """
71+
SELECT
72+
*
73+
FROM ML.EVALUATE(MODEL `bqml_tutorial.sample_model`, (
74+
SELECT
75+
IF(totals.transactions IS NULL, 0, 1) AS label,
76+
IFNULL(device.operatingSystem, "") AS os,
77+
device.isMobile AS is_mobile,
78+
IFNULL(geoNetwork.country, "") AS country,
79+
IFNULL(totals.pageviews, 0) AS pageviews
80+
FROM
81+
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
82+
WHERE
83+
_TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
84+
"""
85+
df = client.query(sql).to_dataframe()
86+
print(df)
87+
# [END bqml_data_scientist_tutorial_evaluate_model]
88+
89+
# [START bqml_data_scientist_tutorial_predict_transactions]
90+
sql = """
91+
SELECT
92+
country,
93+
SUM(predicted_label) as total_predicted_purchases
94+
FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
95+
SELECT
96+
IFNULL(device.operatingSystem, "") AS os,
97+
device.isMobile AS is_mobile,
98+
IFNULL(totals.pageviews, 0) AS pageviews,
99+
IFNULL(geoNetwork.country, "") AS country
100+
FROM
101+
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
102+
WHERE
103+
_TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
104+
GROUP BY country
105+
ORDER BY total_predicted_purchases DESC
106+
LIMIT 10
107+
"""
108+
df = client.query(sql).to_dataframe()
109+
print(df)
110+
# [END bqml_data_scientist_tutorial_predict_transactions]
111+
112+
# [START bqml_data_scientist_tutorial_predict_purchases]
113+
sql = """
114+
SELECT
115+
fullVisitorId,
116+
SUM(predicted_label) as total_predicted_purchases
117+
FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
118+
SELECT
119+
IFNULL(device.operatingSystem, "") AS os,
120+
device.isMobile AS is_mobile,
121+
IFNULL(totals.pageviews, 0) AS pageviews,
122+
IFNULL(geoNetwork.country, "") AS country,
123+
fullVisitorId
124+
FROM
125+
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
126+
WHERE
127+
_TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
128+
GROUP BY fullVisitorId
129+
ORDER BY total_predicted_purchases DESC
130+
LIMIT 10
131+
"""
132+
df = client.query(sql).to_dataframe()
133+
print(df)
134+
# [END bqml_data_scientist_tutorial_predict_purchases]

bigquery/bqml/ncaa_tutorial_test.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
# Copyright 2018 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import io
16+
import os
17+
18+
# [START bqml_ncaa_tutorial_import_and_client]
19+
from google.cloud import bigquery
20+
# [END bqml_ncaa_tutorial_import_and_client]
21+
import pytest
22+
23+
# [START bqml_ncaa_tutorial_import_and_client]
24+
client = bigquery.Client()
25+
# [END bqml_ncaa_tutorial_import_and_client]
26+
27+
28+
@pytest.fixture
29+
def delete_dataset():
30+
yield
31+
client.delete_dataset(
32+
client.dataset('bqml_tutorial'), delete_contents=True)
33+
34+
35+
def test_ncaa_tutorial(delete_dataset):
36+
# [START bqml_ncaa_tutorial_create_dataset]
37+
dataset = bigquery.Dataset(client.dataset('bqml_tutorial'))
38+
dataset.location = 'US'
39+
client.create_dataset(dataset)
40+
# [END bqml_ncaa_tutorial_create_dataset]
41+
42+
# Create the tables used by the tutorial
43+
# Note: the queries are saved to a file. This should be updated to use the
44+
# saved queries once the library supports running saved queries.
45+
query_filepath_to_table_name = {
46+
'feature_input_query.sql': 'cume_games',
47+
'training_data_query.sql': 'wide_games'
48+
}
49+
resources_directory = os.path.join(os.path.dirname(__file__), 'resources')
50+
for query_filepath, table_name in query_filepath_to_table_name.items():
51+
table_ref = dataset.table(table_name)
52+
job_config = bigquery.QueryJobConfig()
53+
job_config.destination = table_ref
54+
query_filepath = os.path.join(
55+
resources_directory, query_filepath)
56+
sql = io.open(query_filepath, 'r', encoding='utf-8').read()
57+
client.query(sql, job_config=job_config).result()
58+
59+
# [START bqml_ncaa_tutorial_create_model]
60+
sql = """
61+
CREATE OR REPLACE MODEL `bqml_tutorial.ncaa_model`
62+
OPTIONS (
63+
model_type='linear_reg',
64+
data_split_eval_fraction=0.1,
65+
max_iteration=50 ) AS
66+
SELECT
67+
* EXCEPT (
68+
game_id, season, scheduled_date,
69+
total_three_points_made,
70+
total_three_points_att),
71+
total_three_points_att as label
72+
FROM
73+
`bqml_tutorial.wide_games`
74+
WHERE
75+
# remove the game to predict
76+
game_id != 'f1063e80-23c7-486b-9a5e-faa52beb2d83'
77+
"""
78+
df = client.query(sql).to_dataframe()
79+
print(df)
80+
# [END bqml_ncaa_tutorial_create_model]
81+
82+
# [START bqml_ncaa_tutorial_get_training_statistics]
83+
sql = """
84+
SELECT
85+
*
86+
FROM
87+
ML.TRAINING_INFO(MODEL `bqml_tutorial.ncaa_model`)
88+
"""
89+
df = client.query(sql).to_dataframe()
90+
print(df)
91+
# [END bqml_ncaa_tutorial_get_training_statistics]
92+
93+
# [START bqml_ncaa_tutorial_evaluate_model]
94+
sql = """
95+
WITH eval_table AS (
96+
SELECT
97+
*,
98+
total_three_points_att AS label
99+
FROM
100+
`bqml_tutorial.wide_games` )
101+
SELECT
102+
*
103+
FROM
104+
ML.EVALUATE(MODEL `bqml_tutorial.ncaa_model`,
105+
TABLE eval_table)
106+
"""
107+
df = client.query(sql).to_dataframe()
108+
print(df)
109+
# [END bqml_ncaa_tutorial_evaluate_model]
110+
111+
# [START bqml_ncaa_tutorial_predict_outcomes]
112+
sql = """
113+
WITH game_to_predict AS (
114+
SELECT
115+
*
116+
FROM
117+
`bqml_tutorial.wide_games`
118+
WHERE
119+
game_id='f1063e80-23c7-486b-9a5e-faa52beb2d83' )
120+
SELECT
121+
truth.game_id AS game_id,
122+
total_three_points_att,
123+
predicted_total_three_points_att
124+
FROM (
125+
SELECT
126+
game_id,
127+
predicted_label AS predicted_total_three_points_att
128+
FROM
129+
ML.PREDICT(MODEL `bqml_tutorial.ncaa_model`,
130+
table game_to_predict) ) AS predict
131+
JOIN (
132+
SELECT
133+
game_id,
134+
total_three_points_att AS total_three_points_att
135+
FROM
136+
game_to_predict) AS truth
137+
ON
138+
predict.game_id = truth.game_id
139+
"""
140+
df = client.query(sql).to_dataframe()
141+
print(df)
142+
# [END bqml_ncaa_tutorial_predict_outcomes]

bigquery/bqml/requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
google-cloud-bigquery[pandas]==1.5.0
2+
flaky
3+
mock
4+
pytest

0 commit comments

Comments
 (0)
0