-
Notifications
You must be signed in to change notification settings - Fork 141
Expand file tree
/
Copy pathrun-sql-bench.sh
More file actions
executable file
·138 lines (123 loc) · 4.35 KB
/
run-sql-bench.sh
File metadata and controls
executable file
·138 lines (123 loc) · 4.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors
#
# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench) for the given targets.
# This script is used by the sql-benchmarks.yml workflow.
#
# Usage:
# run-sql-bench.sh <subcommand> <targets> [options]
#
# Arguments:
# subcommand The benchmark subcommand (e.g., tpch, clickbench, tpcds)
# targets Comma-separated list of engine:format pairs
# (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet")
#
# Options:
# --scale-factor <sf> Scale factor for the benchmark (e.g., 1.0, 10.0)
# --iterations <n> Number of iterations to pass to each benchmark binary
# --remote-storage <url> Remote storage URL (e.g., s3://bucket/path/)
# If provided, runs in remote mode (no lance support).
# --benchmark-id <id> Benchmark ID for error messages (e.g., tpch-s3)
set -Eeu -o pipefail

# Prints the given message followed by the usage line to stderr, then aborts with status 1.
# Arguments:
#   $1 - description of what is wrong with the command line
usage_error() {
  echo "$1" >&2
  echo "Usage: run-sql-bench.sh <subcommand> <targets> [--scale-factor <sf>] [--iterations <n>] [--remote-storage <url>] [--benchmark-id <id>]" >&2
  exit 1
}

# Both positional arguments are mandatory. Check the count explicitly: under `set -u` a missing
# argument would otherwise abort with an unhelpful "$1: unbound variable".
if [[ $# -lt 2 ]]; then
  usage_error "Missing required arguments: <subcommand> and <targets>."
fi
subcommand="$1"
targets="$2"
shift 2

# Optional settings; an empty string means "not provided".
scale_factor=""
iterations=""
remote_storage=""
benchmark_id=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --scale-factor | --iterations | --remote-storage | --benchmark-id)
      # Every recognised option takes exactly one value.
      if [[ $# -lt 2 ]]; then
        usage_error "Option $1 requires a value."
      fi
      case "$1" in
        --scale-factor) scale_factor="$2" ;;
        --iterations) iterations="$2" ;;
        --remote-storage) remote_storage="$2" ;;
        --benchmark-id) benchmark_id="$2" ;;
      esac
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done
# Remote mode is selected purely by the presence of a remote storage URL.
is_remote=false
if [[ -n "$remote_storage" ]]; then
  is_remote=true
fi

# Lance on remote storage is not supported. The infrastructure to generate and upload lance files
# to S3 does not exist. If you need lance on S3, you must first implement:
# 1. Lance data generation in data-gen (or a separate step)
# 2. Lance data upload to S3 before this step runs
#
# Rejects a target list that mentions lance when running in remote mode.
# Arguments:
#   $1 - "true" when running against remote storage, anything else otherwise
#   $2 - comma-separated engine:format pairs
#   $3 - benchmark ID used in the error message (may be empty; reported as "unknown")
# Outputs:   an explanatory error on stderr when the combination is rejected
# Returns:   1 when rejected, 0 otherwise
check_remote_targets() {
  local remote=$1 target_list=$2 bench_id=$3
  # Substring match on purpose: any mention of lance is refused in remote mode.
  if [[ "$remote" == true && "$target_list" == *lance* ]]; then
    echo "ERROR: Lance format is not supported for remote storage benchmarks." >&2
    echo "Remove 'datafusion:lance' from targets for benchmark '${bench_id:-unknown}'." >&2
    return 1
  fi
  return 0
}

check_remote_targets "$is_remote" "$targets" "$benchmark_id" || exit 1
# Extract formats for each engine from the targets string.
# Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
#
# Lance is filtered out of df_formats because it uses a separate binary (lance-bench).
#
# Some benchmarks don't use all engines (e.g., statpopgen only has duckdb targets); an engine with
# no matching targets simply yields an empty string, so no exit-status workaround is needed.

# Prints the comma-separated formats requested for one engine.
# Arguments:
#   $1 - engine name (the part before the colon, e.g. "datafusion")
#   $2 - comma-separated (or newline-separated) engine:format pairs
#   $3 - optional format to leave out; entries ending in ":<format>" are skipped
# Outputs:   the matching formats joined by commas, without a trailing newline
formats_for_engine() {
  local engine=$1 pairs=$2 skip=${3:-}
  local entry joined=""
  # Turn commas into newlines so each pair is read as one line.
  while IFS= read -r entry; do
    [[ "$entry" == "$engine":* ]] || continue
    if [[ -n "$skip" && "$entry" == *":$skip" ]]; then
      continue
    fi
    joined+="${joined:+,}${entry#"$engine":}"
  done <<< "${pairs//,/$'\n'}"
  printf '%s' "$joined"
}

# Succeeds when <pair> appears as a whole entry of the target list.
# Arguments:
#   $1 - engine:format pair to look for
#   $2 - comma-separated (or newline-separated) engine:format pairs
targets_include() {
  # Wrapping both sides in commas makes this an exact-entry match, so "datafusion:lancedb" does
  # not count as "datafusion:lance".
  [[ ",${2//$'\n'/,}," == *",$1,"* ]]
}

df_formats=$(formats_for_engine datafusion "$targets" lance)
ddb_formats=$(formats_for_engine duckdb "$targets")
if targets_include 'datafusion:lance' "$targets"; then
  has_lance=true
else
  has_lance=false
fi
# Build options string.
opts=""
if $is_remote; then
opts="--opt remote-data-dir=$remote_storage"
fi
if [[ -n "$scale_factor" ]]; then
if [[ -n "$opts" ]]; then
opts="--opt scale-factor=$scale_factor $opts"
else
opts="--opt scale-factor=$scale_factor"
fi
fi
if [[ -n "$iterations" ]]; then
opts="-i $iterations $opts"
fi
touch results.json
if [[ -n "$df_formats" ]]; then
# shellcheck disable=SC2086
target/release_debug/datafusion-bench "$subcommand" \
-d gh-json \
--formats "$df_formats" \
$opts \
-o df-results.json
cat df-results.json >> results.json
fi
if [[ -n "$ddb_formats" ]]; then
# shellcheck disable=SC2086
target/release_debug/duckdb-bench "$subcommand" \
-d gh-json \
--formats "$ddb_formats" \
$opts \
--delete-duckdb-database \
-o ddb-results.json
cat ddb-results.json >> results.json
fi
# Lance-bench only runs for local benchmarks.
if ! $is_remote && [[ "$has_lance" == "true" ]] && [[ -f "target/release_debug/lance-bench" ]]; then
# shellcheck disable=SC2086
target/release_debug/lance-bench "$subcommand" \
-d gh-json \
$opts \
-o lance-results.json
cat lance-results.json >> results.json
fi