8000 gh-125985: Add free threading scaling micro benchmarks · colesbury/cpython@2805c12 · GitHub 8000
[go: up one dir, main page]

Skip to content

Commit 2805c12

Browse files
committed
pythongh-125985: Add free threading scaling micro benchmarks
These consist of a number of short snippets that help identify scaling bottlenecks in the free-threaded interpreter. The current bottlenecks are in benchmarks that call functions (due to `LOAD_ATTR` not yet using deferred reference counting) and in accessing thread-local data.
< 10000 div class="d-flex flex-row flex-items-center">
1 parent fed501d commit 2805c12

File tree

1 file changed

+321
-0
lines changed

1 file changed

+321
-0
lines changed
Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
# This script runs a set of small benchmarks to help identify scaling
# bottlenecks in the free-threaded interpreter. The benchmarks consist
# of patterns that ought to scale well, but haven't in the past. This is
# typically due to reference count contention or lock contention.
#
# This is not intended to be a general multithreading benchmark suite, nor
# are the benchmarks intended to be representative of real-world workloads.
#
# On Linux, to avoid confounding hardware effects, the script attempts to:
# * Use a single CPU socket (to avoid NUMA effects)
# * Use distinct physical cores (to avoid hyperthreading/SMT effects)
# * Use "performance" cores (Intel, ARM) on CPUs that have performance and
#   efficiency cores
#
# It also helps to disable dynamic frequency scaling (i.e., "Turbo Boost")
#
# Intel:
# > echo "1" | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo
#
# AMD:
# > echo "0" | sudo tee /sys/devices/system/cpu/cpufreq/boost
#
import math
25+
import threading
26+
import time
27+
import sys
28+
import os
29+
import queue
30+
import gc
31+
32+
# The iterations in individual benchmarks are scaled by this factor.
WORK_SCALE = 100

# Registry mapping benchmark name -> function, filled in by the
# @register_benchmark decorator below.
ALL_BENCHMARKS = {}

# Worker threads and their per-worker request/response queues; populated
# by initialize_threads().
threads = []
in_queues = []
out_queues = []
40+
41+
42+
def register_benchmark(func):
    """Decorator that records *func* in the ALL_BENCHMARKS registry.

    The function is stored under its __name__ and returned unchanged,
    so it remains directly callable.
    """
    name = func.__name__
    ALL_BENCHMARKS[name] = func
    return func
45+
46+
@register_benchmark
def object_cfunction():
    # Repeatedly call C-implemented methods (list.pop/list.append) on a
    # single object; returns a checksum so the loop has an observable result.
    accu = 0
    tab = [1] * 100
    for i in range(1000 * WORK_SCALE):
        tab.pop(0)
        tab.append(i)
        accu += tab[50]
    return accu
55+
56+
@register_benchmark
def cmodule_function():
    # Call a function from a C extension module (math.floor) in a tight loop.
    for i in range(1000 * WORK_SCALE):
        math.floor(i * i)
60+
61+
@register_benchmark
def mult_constant():
    # Repeated in-place float multiplication by a constant.
    x = 1.0
    for i in range(3000 * WORK_SCALE):
        x *= 1.01
66+
67+
def simple_gen():
    # Helper for the `generator` benchmark: yields the integers 0..9.
    for i in range(10):
        yield i
70+
71+
@register_benchmark
def generator():
    # Create and exhaust a small generator on every outer iteration.
    accu = 0
    for i in range(100 * WORK_SCALE):
        for v in simple_gen():
            accu += v
    return accu
78+
79+
class Counter:
    # Helper class for the `pymethod` benchmark.

    def __init__(self):
        # current count
        self.i = 0

    def next_number(self):
        # Increment the count and return the new value.
        self.i += 1
        return self.i
86+
87+
@register_benchmark
def pymethod():
    # Call a bound Python method on the same object in a tight loop.
    c = Counter()
    for i in range(1000 * WORK_SCALE):
        c.next_number()
    return c.i
93+
94+
def next_number(i):
    # Helper for the `pyfunction` benchmark.
    return i + 1
96+
97+
@register_benchmark
def pyfunction():
    # Call a plain module-level Python function in a tight loop.
    accu = 0
    for i in range(1000 * WORK_SCALE):
        accu = next_number(i)
    return accu
103+
104+
def double(x):
    # Helper for the `module_function` benchmark.
    return x + x
106+
107+
# Reference to this module itself, used by `module_function` so the call
# goes through attribute access on a module object.
module = sys.modules[__name__]
108+
109+
@register_benchmark
def module_function():
    # Call a function looked up as an attribute of a module object each
    # iteration.
    total = 0
    for i in range(1000 * WORK_SCALE):
        total += module.double(i)
    return total
115+
116+
class MyObject:
    # Minimal class used by `create_pyobject` to measure object allocation.
    pass
118+
119+
@register_benchmark
def load_string_const():
    # Load a string constant every iteration; the comparison is always
    # false for an int `i`, so the `else` branch always runs.
    accu = 0
    for i in range(1000 * WORK_SCALE):
        if i == 'a string':
            accu += 7
        else:
            accu += 1
    return accu
128+
129+
@register_benchmark
def load_tuple_const():
    # Load a tuple constant every iteration; the comparison is always
    # false for an int `i`, so the `else` branch always runs.
    accu = 0
    for i in range(1000 * WORK_SCALE):
        if i == (1, 2):
            accu += 7
        else:
            accu += 1
    return accu
138+
139+
@register_benchmark
def create_pyobject():
    # Allocate a minimal Python object every iteration.
    for i in range(1000 * WORK_SCALE):
        o = MyObject()
143+
144+
@register_benchmark
def create_closure():
    # Create a new function object every iteration and call it once.
    for i in range(1000 * WORK_SCALE):
        def foo(x):
            return x
        foo(i)
150+
151+
@register_benchmark
def create_dict():
    # Allocate a small dict from a literal every iteration.
    for i in range(1000 * WORK_SCALE):
        d = {
            "key": "value",
        }
157+
158+
# Shared threading.local instance; each thread gets its own attribute
# namespace on it (used by `thread_local_read`).
thread_local = threading.local()
159+
160+
@register_benchmark
def thread_local_read():
    # Read an attribute of a threading.local repeatedly. The five reads
    # per iteration amortize the loop overhead; caching `thread_local` in
    # a local keeps the global lookup out of the measured pattern.
    tmp = thread_local
    tmp.x = 10
    for i in range(500 * WORK_SCALE):
        _ = tmp.x
        _ = tmp.x
        _ = tmp.x
        _ = tmp.x
        _ = tmp.x
170+
171+
172+
def bench_one_thread(func):
    """Run *func* once on the calling thread.

    Returns the elapsed wall time in nanoseconds.
    """
    start = time.perf_counter_ns()
    func()
    return time.perf_counter_ns() - start
177+
178+
179+
def bench_parallel(func):
    """Run *func* once on every worker thread simultaneously.

    Fans the function out to all workers via their input queues, then
    blocks until each worker acknowledges completion on its output queue.
    Returns the elapsed wall time in nanoseconds.
    """
    start = time.perf_counter_ns()
    for q in in_queues:
        q.put(func)
    for q in out_queues:
        q.get()
    return time.perf_counter_ns() - start
187+
188+
189+
def benchmark(func):
    """Time *func* single-threaded and on all workers, then print the
    scaling factor (colorized on a TTY: red for no scaling, yellow for
    less than half of perfect scaling)."""
    one_thread_ns = bench_one_thread(func)
    many_threads_ns = bench_parallel(func)

    # Perfect scaling gives speedup == len(threads).
    speedup = one_thread_ns * len(threads) / many_threads_ns
    if speedup >= 1:
        factor, direction = speedup, "faster"
    else:
        factor, direction = 1 / speedup, "slower"

    use_color = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
    color = ""
    if use_color:
        if speedup <= 1.1:
            color = "\x1b[31m"  # red
        elif speedup < len(threads) / 2:
            color = "\x1b[33m"  # yellow

    reset = "\x1b[0m" if color else ""
    print(f"{color}{func.__name__:<18} {round(factor, 1):>4}x {direction}{reset}")
210+
211+
def determine_num_threads_and_affinity():
    """Choose the CPUs to pin worker threads to.

    Returns a list with one entry per worker: the CPU id to pin to, or
    ``[None] * os.cpu_count()`` (no pinning) when the topology cannot be
    determined. On Linux, parses ``lscpu -p`` to select CPUs on NUMA
    node 0, one per physical core, preferring the fastest ("performance")
    cores.
    """
    if sys.platform != "linux":
        return [None] * os.cpu_count()

    # Try to use `lscpu -p` on Linux
    import subprocess
    try:
        output = subprocess.check_output(["lscpu", "-p=cpu,node,core,MAXMHZ"], text=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        return [None] * os.cpu_count()

    table = []
    for line in output.splitlines():
        # Skip the comment header and any blank lines.
        if not line or line.startswith("#"):
            continue
        try:
            cpu, node, core, maxhz = line.split(",")
            # MAXMHZ may be empty (e.g. in VMs/containers); treat it as 0
            # so such rows compare equal rather than crashing on float("").
            table.append((int(cpu), int(node), int(core), float(maxhz or 0)))
        except ValueError:
            # Unexpected lscpu output format: fall back to "no pinning".
            return [None] * os.cpu_count()

    if not table:
        return [None] * os.cpu_count()

    cpus = []
    cores = set()
    max_mhz_all = max(row[3] for row in table)
    for cpu, node, core, maxmhz in table:
        # Choose only CPUs on the same node, unique cores, and try to avoid
        # "efficiency" cores.
        if node == 0 and core not in cores and maxmhz == max_mhz_all:
            cpus.append(cpu)
            cores.add(core)
    return cpus
239+
240+
241+
def thread_run(cpu, in_queue, out_queue):
    """Worker loop: execute functions arriving on *in_queue*.

    Pins the current thread to *cpu* first (when given and supported).
    Each completed function is acknowledged by putting None on
    *out_queue*; a None received on *in_queue* terminates the loop.
    """
    if cpu is not None and hasattr(os, "sched_setaffinity"):
        # Set the affinity for the current thread
        os.sched_setaffinity(0, (cpu,))

    while (func := in_queue.get()) is not None:
        func()
        out_queue.put(None)
252+
253+
254+
def initialize_threads(opts):
    """Start the pool of daemon worker threads.

    With ``opts.threads == -1`` the thread count and CPU pinning are
    derived from the machine topology; otherwise exactly ``opts.threads``
    workers are started without affinity.
    """
    if opts.threads == -1:
        cpus = determine_num_threads_and_affinity()
    else:
        cpus = [None] * opts.threads  # don't set affinity

    print(f"Running benchmarks with {len(cpus)} threads")
    for cpu in cpus:
        inq, outq = queue.Queue(), queue.Queue()
        in_queues.append(inq)
        out_queues.append(outq)
        worker = threading.Thread(target=thread_run, args=(cpu, inq, outq), daemon=True)
        threads.append(worker)
        worker.start()
269+
270+
271+
def main(opts):
    """Run the selected benchmarks according to the parsed options.

    Default mode measures both single-thread and parallel runs and prints
    scaling factors; --baseline-only / --parallel-only print raw times.
    """
    global WORK_SCALE
    # Warn (but still run) if this is not a free-threaded interpreter.
    if not hasattr(sys, "_is_gil_enabled") or sys._is_gil_enabled():
        sys.stderr.write("expected to be run with the GIL disabled\n")

    benchmark_names = opts.benchmarks
    if not benchmark_names:
        benchmark_names = ALL_BENCHMARKS.keys()
    else:
        for name in benchmark_names:
            if name not in ALL_BENCHMARKS:
                sys.stderr.write(f"Unknown benchmark: {name}\n")
                sys.exit(1)

    WORK_SCALE = opts.scale

    if not opts.baseline_only:
        initialize_threads(opts)

    do_bench = not opts.baseline_only and not opts.parallel_only
    for name in benchmark_names:
        func = ALL_BENCHMARKS[name]
        if do_bench:
            # Scaling mode: compare one thread vs. all threads.
            benchmark(func)
        else:
            # Timing-only mode: print the elapsed wall time.
            if opts.parallel_only:
                delta_ns = bench_parallel(func)
            else:
                delta_ns = bench_one_thread(func)
            time_ms = delta_ns / 1_000_000
            print(f"{func.__name__:<18} {time_ms:.1f} ms")
304+
305+
306+
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    # -1 means "auto": derive thread count (and CPU affinity, on Linux)
    # from the machine topology in determine_num_threads_and_affinity().
    parser.add_argument("-t", "--threads", type=int, default=-1,
                        help="number of threads to use")
    parser.add_argument("--scale", type=int, default=100,
                        help="work scale factor for the benchmark (default=100)")
    parser.add_argument("--baseline-only", default=False, action="store_true",
                        help="only run the baseline benchmarks (single thread)")
    parser.add_argument("--parallel-only", default=False, action="store_true",
                        help="only run the parallel benchmark (many threads)")
    # Positional names; empty means "run every registered benchmark".
    parser.add_argument("benchmarks", nargs="*",
                        help="benchmarks to run")
    options = parser.parse_args()
    main(options)

0 commit comments

Comments
 (0)
0