From 962e98c2af2d993552161130e2c3ed0115b67812 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 23 May 2024 07:07:23 -0400 Subject: [PATCH] Use int64 for TPC-H keys and set input schema to not nullable --- examples/tpch/convert_data_to_parquet.py | 30 +++++++++++++----------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/examples/tpch/convert_data_to_parquet.py b/examples/tpch/convert_data_to_parquet.py index 5da60bc28..d81ec290d 100644 --- a/examples/tpch/convert_data_to_parquet.py +++ b/examples/tpch/convert_data_to_parquet.py @@ -31,10 +31,10 @@ all_schemas = {} all_schemas["customer"] = [ - ("C_CUSTKEY", pyarrow.int32()), + ("C_CUSTKEY", pyarrow.int64()), ("C_NAME", pyarrow.string()), ("C_ADDRESS", pyarrow.string()), - ("C_NATIONKEY", pyarrow.int32()), + ("C_NATIONKEY", pyarrow.int64()), ("C_PHONE", pyarrow.string()), ("C_ACCTBAL", pyarrow.decimal128(15, 2)), ("C_MKTSEGMENT", pyarrow.string()), @@ -42,9 +42,9 @@ ] all_schemas["lineitem"] = [ - ("L_ORDERKEY", pyarrow.int32()), - ("L_PARTKEY", pyarrow.int32()), - ("L_SUPPKEY", pyarrow.int32()), + ("L_ORDERKEY", pyarrow.int64()), + ("L_PARTKEY", pyarrow.int64()), + ("L_SUPPKEY", pyarrow.int64()), ("L_LINENUMBER", pyarrow.int32()), ("L_QUANTITY", pyarrow.decimal128(15, 2)), ("L_EXTENDEDPRICE", pyarrow.decimal128(15, 2)), @@ -61,15 +61,15 @@ ] all_schemas["nation"] = [ - ("N_NATIONKEY", pyarrow.int32()), + ("N_NATIONKEY", pyarrow.int64()), ("N_NAME", pyarrow.string()), - ("N_REGIONKEY", pyarrow.int32()), + ("N_REGIONKEY", pyarrow.int64()), ("N_COMMENT", pyarrow.string()), ] all_schemas["orders"] = [ - ("O_ORDERKEY", pyarrow.int32()), - ("O_CUSTKEY", pyarrow.int32()), + ("O_ORDERKEY", pyarrow.int64()), + ("O_CUSTKEY", pyarrow.int64()), ("O_ORDERSTATUS", pyarrow.string()), ("O_TOTALPRICE", pyarrow.decimal128(15, 2)), ("O_ORDERDATE", pyarrow.date32()), @@ -80,7 +80,7 @@ ] all_schemas["part"] = [ - ("P_PARTKEY", pyarrow.int32()), + ("P_PARTKEY", pyarrow.int64()), ("P_NAME", pyarrow.string()), ("P_MFGR", pyarrow.string()), ("P_BRAND", pyarrow.string()), @@ -92,21 +92,21 @@ ] all_schemas["partsupp"] = [ - ("PS_PARTKEY", pyarrow.int32()), - ("PS_SUPPKEY", pyarrow.int32()), + ("PS_PARTKEY", pyarrow.int64()), + ("PS_SUPPKEY", pyarrow.int64()), ("PS_AVAILQTY", pyarrow.int32()), ("PS_SUPPLYCOST", pyarrow.decimal128(15, 2)), ("PS_COMMENT", pyarrow.string()), ] all_schemas["region"] = [ - ("r_REGIONKEY", pyarrow.int32()), + ("r_REGIONKEY", pyarrow.int64()), ("r_NAME", pyarrow.string()), ("r_COMMENT", pyarrow.string()), ] all_schemas["supplier"] = [ - ("S_SUPPKEY", pyarrow.int32()), + ("S_SUPPKEY", pyarrow.int64()), ("S_NAME", pyarrow.string()), ("S_ADDRESS", pyarrow.string()), ("S_NATIONKEY", pyarrow.int32()), @@ -125,6 +125,8 @@ # in to handle the trailing | in the file output_cols = [r[0] for r in curr_schema] + curr_schema = [ pyarrow.field(r[0], r[1], nullable=False) for r in curr_schema] + # Trailing | requires extra field for in processing curr_schema.append(("some_null", pyarrow.null()))