Use int64 for TPC-H keys and set input schema to not nullable (#714) · datapythonista/datafusion-python@0d42e77 · GitHub
[go: up one dir, main page]

Skip to content

Commit 0d42e77

Browse files
authored
Use int64 for TPC-H keys and set input schema to not nullable (apache#714)
1 parent e029559 commit 0d42e77

File tree

1 file changed

+16
-14
lines changed

1 file changed

+16
-14
lines changed

examples/tpch/convert_data_to_parquet.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -31,20 +31,20 @@
3131
all_schemas = {}
3232

3333
all_schemas["customer"] = [
34-
("C_CUSTKEY", pyarrow.int32()),
34+
("C_CUSTKEY", pyarrow.int64()),
3535
("C_NAME", pyarrow.string()),
3636
("C_ADDRESS", pyarrow.string()),
37-
("C_NATIONKEY", pyarrow.int32()),
37+
("C_NATIONKEY", pyarrow.int64()),
3838
("C_PHONE", pyarrow.string()),
3939
("C_ACCTBAL", pyarrow.decimal128(15, 2)),
4040
("C_MKTSEGMENT", pyarrow.string()),
4141
("C_COMMENT", pyarrow.string()),
4242
]
4343

4444
all_schemas["lineitem"] = [
45-
("L_ORDERKEY", pyarrow.int32()),
46-
("L_PARTKEY", pyarrow.int32()),
47-
("L_SUPPKEY", pyarrow.int32()),
45+
("L_ORDERKEY", pyarrow.int64()),
46+
("L_PARTKEY", pyarrow.int64()),
47+
("L_SUPPKEY", pyarrow.int64()),
4848
("L_LINENUMBER", pyarrow.int32()),
4949
("L_QUANTITY", pyarrow.decimal128(15, 2)),
5050
("L_EXTENDEDPRICE", pyarrow.decimal128(15, 2)),
@@ -61,15 +61,15 @@
6161
]
6262

6363
all_schemas["nation"] = [
64-
("N_NATIONKEY", pyarrow.int32()),
64+
("N_NATIONKEY", pyarrow.int64()),
6565
("N_NAME", pyarrow.string()),
66-
("N_REGIONKEY", pyarrow.int32()),
66+
("N_REGIONKEY", pyarrow.int64()),
6767
("N_COMMENT", pyarrow.string()),
6868
]
6969

7070
all_schemas["orders"] = [
71-
("O_ORDERKEY", pyarrow.int32()),
72-
("O_CUSTKEY", pyarrow.int32()),
71+
("O_ORDERKEY", pyarrow.int64()),
72+
("O_CUSTKEY", pyarrow.int64()),
7373
("O_ORDERSTATUS", pyarrow.string()),
7474
("O_TOTALPRICE", pyarrow.decimal128(15, 2)),
7575
("O_ORDERDATE", pyarrow.date32()),
@@ -80,7 +80,7 @@
8080
]
8181

8282
all_schemas["part"] = [
83-
("P_PARTKEY", pyarrow.int32()),
83+
("P_PARTKEY", pyarrow.int64()),
8484
("P_NAME", pyarrow.string()),
8585
("P_MFGR", pyarrow.string()),
8686
("P_BRAND", pyarrow.string()),
@@ -92,21 +92,21 @@
9292
]
9393

9494
all_schemas["partsupp"] = [
95-
("PS_PARTKEY", pyarrow.int32()),
96-
("PS_SUPPKEY", pyarrow.int32()),
95+
("PS_PARTKEY", pyarrow.int64()),
96+
("PS_SUPPKEY", pyarrow.int64()),
9797
("PS_AVAILQTY", pyarrow.int32()),
9898
("PS_SUPPLYCOST", pyarrow.decimal128(15, 2)),
9999
("PS_COMMENT", pyarrow.string()),
100100
]
101101

102102
all_schemas["region"] = [
103-
("r_REGIONKEY", pyarrow.int32()),
103+
("r_REGIONKEY", pyarrow.int64()),
104104
("r_NAME", pyarrow.string()),
105105
("r_COMMENT", pyarrow.string()),
106106
]
107107

108108
all_schemas["supplier"] = [
109-
("S_SUPPKEY", pyarrow.int32()),
109+
("S_SUPPKEY", pyarrow.int64()),
110110
("S_NAME", pyarrow.string()),
111111
("S_ADDRESS", pyarrow.string()),
112112
("S_NATIONKEY", pyarrow.int32()),
@@ -125,6 +125,8 @@
125125
# in to handle the trailing | in the file
126126
output_cols = [r[0] for r in curr_schema]
127127

128+
curr_schema = [ pyarrow.field(r[0], r[1], nullable=False) for r in curr_schema]
129+
128130
# Trailing | requires an extra field during processing
129131
curr_schema.append(("some_null", pyarrow.null()))
130132

0 commit comments

Comments
 (0)
0