From ddb99042cb2248793fea1115859d82b63905a986 Mon Sep 17 00:00:00 2001
From: Huan Chen
Date: Sat, 24 Aug 2024 00:05:53 +0000
Subject: [PATCH] chore: add tpch q14-18

---
 tests/benchmark/tpch/q14.py                   | 25 +++++++
 tests/benchmark/tpch/q15.py                   | 25 +++++++
 tests/benchmark/tpch/q16.py                   | 25 +++++++
 tests/benchmark/tpch/q17.py                   | 25 +++++++
 tests/benchmark/tpch/q18.py                   | 25 +++++++
 .../bigframes_vendored/tpch/queries/q14.py    | 46 +++++++++++++
 .../bigframes_vendored/tpch/queries/q15.py    | 60 +++++++++++++++++
 .../bigframes_vendored/tpch/queries/q16.py    | 57 ++++++++++++++++
 .../bigframes_vendored/tpch/queries/q17.py    | 55 ++++++++++++++++
 .../bigframes_vendored/tpch/queries/q18.py    | 65 +++++++++++++++++++
 10 files changed, 408 insertions(+)
 create mode 100644 tests/benchmark/tpch/q14.py
 create mode 100644 tests/benchmark/tpch/q15.py
 create mode 100644 tests/benchmark/tpch/q16.py
 create mode 100644 tests/benchmark/tpch/q17.py
 create mode 100644 tests/benchmark/tpch/q18.py
 create mode 100644 third_party/bigframes_vendored/tpch/queries/q14.py
 create mode 100644 third_party/bigframes_vendored/tpch/queries/q15.py
 create mode 100644 third_party/bigframes_vendored/tpch/queries/q16.py
 create mode 100644 third_party/bigframes_vendored/tpch/queries/q17.py
 create mode 100644 third_party/bigframes_vendored/tpch/queries/q18.py

diff --git a/tests/benchmark/tpch/q14.py b/tests/benchmark/tpch/q14.py
new file mode 100644
index 0000000000..8aa7ed4d2e
--- /dev/null
+++ b/tests/benchmark/tpch/q14.py
@@ -0,0 +1,25 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+import bigframes_vendored.tpch.queries.q14 as vendored_tpch_q14
+
+if __name__ == "__main__":
+    dataset_id, session, suffix = utils.get_tpch_configuration()
+    current_path = pathlib.Path(__file__).absolute()
+
+    utils.get_execution_time(
+        vendored_tpch_q14.q, current_path, suffix, dataset_id, session
+    )
diff --git a/tests/benchmark/tpch/q15.py b/tests/benchmark/tpch/q15.py
new file mode 100644
index 0000000000..511cfbc683
--- /dev/null
+++ b/tests/benchmark/tpch/q15.py
@@ -0,0 +1,25 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+import bigframes_vendored.tpch.queries.q15 as vendored_tpch_q15
+
+if __name__ == "__main__":
+    dataset_id, session, suffix = utils.get_tpch_configuration()
+    current_path = pathlib.Path(__file__).absolute()
+
+    utils.get_execution_time(
+        vendored_tpch_q15.q, current_path, suffix, dataset_id, session
+    )
diff --git a/tests/benchmark/tpch/q16.py b/tests/benchmark/tpch/q16.py
new file mode 100644
index 0000000000..1d1f4b5f30
--- /dev/null
+++ b/tests/benchmark/tpch/q16.py
@@ -0,0 +1,25 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+import bigframes_vendored.tpch.queries.q16 as vendored_tpch_q16
+
+if __name__ == "__main__":
+    dataset_id, session, suffix = utils.get_tpch_configuration()
+    current_path = pathlib.Path(__file__).absolute()
+
+    utils.get_execution_time(
+        vendored_tpch_q16.q, current_path, suffix, dataset_id, session
+    )
diff --git a/tests/benchmark/tpch/q17.py b/tests/benchmark/tpch/q17.py
new file mode 100644
index 0000000000..e285cc9fca
--- /dev/null
+++ b/tests/benchmark/tpch/q17.py
@@ -0,0 +1,25 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+import bigframes_vendored.tpch.queries.q17 as vendored_tpch_q17
+
+if __name__ == "__main__":
+    dataset_id, session, suffix = utils.get_tpch_configuration()
+    current_path = pathlib.Path(__file__).absolute()
+
+    utils.get_execution_time(
+        vendored_tpch_q17.q, current_path, suffix, dataset_id, session
+    )
diff --git a/tests/benchmark/tpch/q18.py b/tests/benchmark/tpch/q18.py
new file mode 100644
index 0000000000..de9e4f2f94
--- /dev/null
+++ b/tests/benchmark/tpch/q18.py
@@ -0,0 +1,25 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pathlib
+
+import benchmark.utils as utils
+import bigframes_vendored.tpch.queries.q18 as vendored_tpch_q18
+
+if __name__ == "__main__":
+    dataset_id, session, suffix = utils.get_tpch_configuration()
+    current_path = pathlib.Path(__file__).absolute()
+
+    utils.get_execution_time(
+        vendored_tpch_q18.q, current_path, suffix, dataset_id, session
+    )
diff --git a/third_party/bigframes_vendored/tpch/queries/q14.py b/third_party/bigframes_vendored/tpch/queries/q14.py
new file mode 100644
index 0000000000..8c25a5897d
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/queries/q14.py
@@ -0,0 +1,46 @@
+# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/polars/q14.py
+
+from datetime import date
+
+import bigframes
+
+
+def q(dataset_id: str, session: bigframes.Session):
+    """Run TPC-H query 14 with BigQuery DataFrames.
+
+    Computes the percentage of revenue, over line items shipped in
+    [1995-09-01, 1995-10-01), that comes from parts whose ``P_TYPE`` contains
+    "PROMO". The rounded value is computed and then discarded.
+
+    Args:
+        dataset_id: Dataset in the ``bigframes-dev-perf`` project that holds
+            the TPC-H tables.
+        session: Session used to read the tables.
+    """
+    lineitem = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.LINEITEM",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+    part = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.PART",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+
+    var1 = date(1995, 9, 1)
+    var2 = date(1995, 10, 1)
+
+    merged = lineitem.merge(part, left_on="L_PARTKEY", right_on="P_PARTKEY")
+
+    filtered = merged[(merged["L_SHIPDATE"] >= var1) & (merged["L_SHIPDATE"] < var2)]
+
+    # The boolean PROMO flag is cast to Int64 so non-promo rows contribute 0.
+    filtered["CONDI_REVENUE"] = (
+        filtered["L_EXTENDEDPRICE"] * (1 - filtered["L_DISCOUNT"])
+    ) * filtered["P_TYPE"].str.contains("PROMO").astype("Int64")
+
+    total_revenue = (filtered["L_EXTENDEDPRICE"] * (1 - filtered["L_DISCOUNT"])).sum()
+    promo_revenue = filtered["CONDI_REVENUE"].sum()
+
+    promo_revenue_percent = 100.00 * promo_revenue / total_revenue
+
+    _ = round(promo_revenue_percent, 2)
diff --git a/third_party/bigframes_vendored/tpch/queries/q15.py b/third_party/bigframes_vendored/tpch/queries/q15.py
new file mode 100644
index 0000000000..fae3010e36
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/queries/q15.py
@@ -0,0 +1,60 @@
+# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/polars/q15.py
+
+from datetime import date
+
+import bigframes
+import bigframes.pandas as bpd
+
+
+def q(dataset_id: str, session: bigframes.Session):
+    """Run TPC-H query 15 with BigQuery DataFrames.
+
+    Sums discounted revenue per supplier over line items shipped in
+    [1996-01-01, 1996-04-01) and keeps the supplier(s) whose total equals the
+    maximum. The result is materialized with ``to_gbq()`` (no destination is
+    passed).
+
+    Args:
+        dataset_id: Dataset in the ``bigframes-dev-perf`` project that holds
+            the TPC-H tables.
+        session: Session used to read the tables.
+    """
+    lineitem = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.LINEITEM",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+    supplier = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.SUPPLIER",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+
+    var1 = date(1996, 1, 1)
+    var2 = date(1996, 4, 1)
+
+    filtered_lineitem = lineitem[
+        (lineitem["L_SHIPDATE"] >= var1) & (lineitem["L_SHIPDATE"] < var2)
+    ]
+    filtered_lineitem["REVENUE"] = filtered_lineitem["L_EXTENDEDPRICE"] * (
+        1 - filtered_lineitem["L_DISCOUNT"]
+    )
+
+    grouped_revenue = (
+        filtered_lineitem.groupby("L_SUPPKEY", as_index=False)
+        .agg(TOTAL_REVENUE=bpd.NamedAgg(column="REVENUE", aggfunc="sum"))
+        .rename(columns={"L_SUPPKEY": "SUPPLIER_NO"})
+    )
+
+    joined_data = bpd.merge(
+        supplier, grouped_revenue, left_on="S_SUPPKEY", right_on="SUPPLIER_NO"
+    )
+
+    max_revenue = joined_data["TOTAL_REVENUE"].max()
+    max_revenue_suppliers = joined_data[joined_data["TOTAL_REVENUE"] == max_revenue]
+
+    max_revenue_suppliers["TOTAL_REVENUE"] = max_revenue_suppliers[
+        "TOTAL_REVENUE"
+    ].round(2)
+    q_final = max_revenue_suppliers[
+        ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE", "TOTAL_REVENUE"]
+    ].sort_values("S_SUPPKEY")
+    q_final.to_gbq()
diff --git a/third_party/bigframes_vendored/tpch/queries/q16.py b/third_party/bigframes_vendored/tpch/queries/q16.py
new file mode 100644
index 0000000000..b5eb62547f
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/queries/q16.py
@@ -0,0 +1,57 @@
+# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/polars/q16.py
+
+import bigframes
+import bigframes.pandas as bpd
+
+
+def q(dataset_id: str, session: bigframes.Session):
+    """Run TPC-H query 16 with BigQuery DataFrames.
+
+    Counts distinct suppliers per (``P_BRAND``, ``P_TYPE``, ``P_SIZE``) for
+    parts that are not "Brand#45", whose type does not contain
+    "MEDIUM POLISHED" and whose size is in a fixed list, excluding suppliers
+    whose comment matches "Customer.*Complaints". The result is materialized
+    with ``to_gbq()`` (no destination is passed).
+
+    Args:
+        dataset_id: Dataset in the ``bigframes-dev-perf`` project that holds
+            the TPC-H tables.
+        session: Session used to read the tables.
+    """
+    part = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.PART",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+    partsupp = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.PARTSUPP",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+    supplier = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.SUPPLIER",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+
+    var1 = "Brand#45"
+
+    supplier = supplier[
+        supplier["S_COMMENT"].str.contains("Customer.*Complaints", regex=True)
+    ]["S_SUPPKEY"]
+
+    q_filtered = part.merge(partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY")
+    q_filtered = q_filtered[q_filtered["P_BRAND"] != var1]
+    q_filtered = q_filtered[~q_filtered["P_TYPE"].str.contains("MEDIUM POLISHED")]
+    q_filtered = q_filtered[q_filtered["P_SIZE"].isin([49, 14, 23, 45, 19, 3, 36, 9])]
+
+    final_df = q_filtered[~q_filtered["PS_SUPPKEY"].isin(supplier)]
+
+    grouped = final_df.groupby(["P_BRAND", "P_TYPE", "P_SIZE"], as_index=False)
+    result = grouped.agg(
+        SUPPLIER_CNT=bpd.NamedAgg(column="PS_SUPPKEY", aggfunc="nunique")
+    )
+
+    q_final = result.sort_values(
+        by=["SUPPLIER_CNT", "P_BRAND", "P_TYPE", "P_SIZE"],
+        ascending=[False, True, True, True],
+    )
+
+    q_final.to_gbq()
diff --git a/third_party/bigframes_vendored/tpch/queries/q17.py b/third_party/bigframes_vendored/tpch/queries/q17.py
new file mode 100644
index 0000000000..a95d128b5c
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/queries/q17.py
@@ -0,0 +1,55 @@
+# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/polars/q17.py
+
+import bigframes
+import bigframes.pandas as bpd
+
+
+def q(dataset_id: str, session: bigframes.Session):
+    """Run TPC-H query 17 with BigQuery DataFrames.
+
+    For "Brand#23" parts in "MED BOX" containers, sums ``L_EXTENDEDPRICE``
+    over line items whose quantity is below 20% of that part's average
+    quantity, and divides the sum by 7.0. The single-value result is
+    materialized with ``to_gbq()`` (no destination is passed).
+
+    Args:
+        dataset_id: Dataset in the ``bigframes-dev-perf`` project that holds
+            the TPC-H tables.
+        session: Session used to read the tables and to build the result frame.
+    """
+    lineitem = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.LINEITEM",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+    part = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.PART",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+
+    VAR1 = "Brand#23"
+    VAR2 = "MED BOX"
+
+    filtered_part = part[(part["P_BRAND"] == VAR1) & (part["P_CONTAINER"] == VAR2)]
+    q1 = bpd.merge(
+        filtered_part, lineitem, how="left", left_on="P_PARTKEY", right_on="L_PARTKEY"
+    )
+
+    grouped = (
+        q1.groupby("P_PARTKEY", as_index=False)
+        .agg(AVG_QUANTITY=bpd.NamedAgg(column="L_QUANTITY", aggfunc="mean"))
+        .rename(columns={"P_PARTKEY": "KEY"})
+    )
+    grouped["AVG_QUANTITY"] = grouped["AVG_QUANTITY"] * 0.2
+
+    q_final = bpd.merge(grouped, q1, left_on="KEY", right_on="P_PARTKEY")
+
+    q_final = q_final[q_final["L_QUANTITY"] < q_final["AVG_QUANTITY"]]
+
+    # Pass the caller's session explicitly so the result frame is built on the
+    # same session as the inputs rather than on the bigframes.pandas default.
+    q_final = bpd.DataFrame(
+        {"AVG_YEARLY": [(q_final["L_EXTENDEDPRICE"].sum() / 7.0).round(2)]},
+        session=session,
+    )
+
+    q_final.to_gbq()
diff --git a/third_party/bigframes_vendored/tpch/queries/q18.py b/third_party/bigframes_vendored/tpch/queries/q18.py
new file mode 100644
index 0000000000..dac9f5c438
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/queries/q18.py
@@ -0,0 +1,65 @@
+# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/polars/q18.py
+
+import typing
+
+import bigframes
+import bigframes.pandas as bpd
+
+
+def q(dataset_id: str, session: bigframes.Session):
+    """Run TPC-H query 18 with BigQuery DataFrames.
+
+    Finds orders whose total line-item quantity exceeds 300, sums the quantity
+    per customer and order, and keeps the first 100 rows ordered by
+    ``O_TOTALPRICE`` descending, then order date ascending. The result is
+    materialized with ``to_gbq()`` (no destination is passed).
+
+    Args:
+        dataset_id: Dataset in the ``bigframes-dev-perf`` project that holds
+            the TPC-H tables.
+        session: Session used to read the tables.
+    """
+    customer = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.CUSTOMER",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+    lineitem = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.LINEITEM",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+    orders = session.read_gbq(
+        f"bigframes-dev-perf.{dataset_id}.ORDERS",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+
+    var1 = 300
+
+    q1 = lineitem.groupby("L_ORDERKEY", as_index=False).agg(
+        SUM_QUANTITY=bpd.NamedAgg(column="L_QUANTITY", aggfunc="sum")
+    )
+    q1 = q1[q1["SUM_QUANTITY"] > var1]
+
+    filtered_orders = orders.merge(
+        q1, left_on="O_ORDERKEY", right_on="L_ORDERKEY", how="inner"
+    )
+
+    result = filtered_orders.merge(
+        lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY"
+    )
+    result = result.merge(customer, left_on="O_CUSTKEY", right_on="C_CUSTKEY")
+
+    final_result = result.groupby(
+        ["C_NAME", "C_CUSTKEY", "O_ORDERKEY", "O_ORDERDATE", "O_TOTALPRICE"],
+        as_index=False,
+    ).agg(COL6=bpd.NamedAgg(column="L_QUANTITY", aggfunc="sum"))
+
+    # NOTE(review): "O_ORDERDAT" looks like it mirrors the column alias used by
+    # the upstream polars query; confirm before "fixing" the spelling.
+    final_result = final_result.rename(columns={"O_ORDERDATE": "O_ORDERDAT"})
+
+    final_result = typing.cast(bpd.DataFrame, final_result).sort_values(
+        ["O_TOTALPRICE", "O_ORDERDAT"], ascending=[False, True]
+    )
+
+    q_final = final_result.head(100)
+    q_final.to_gbq()