GH-133136: Revise QSBR to reduce excess memory held (gh-135473) · python/cpython@d7d6412 · GitHub
[go: up one dir, main page]

Skip to content

Commit d7d6412

Browse files
nascheme and colesbury
authored and committed
GH-133136: Revise QSBR to reduce excess memory held (gh-135473)
The free threading build uses QSBR to delay the freeing of dictionary keys and list arrays when the objects are accessed by multiple threads in order to allow concurrent reads to proceed without holding the object lock. The requests are processed in batches to reduce execution overhead, but for large memory blocks this can lead to excess memory usage. Take into account the size of the memory block when deciding when to process QSBR requests. Also track the amount of memory being held by QSBR for mimalloc pages. Advance the write sequence if this memory exceeds a limit. Advancing the sequence will allow it to be freed more quickly. Process the held QSBR items from the "eval breaker", rather than from `_PyMem_FreeDelayed()`. This gives a higher chance that the global read sequence has advanced enough so that items can be freed. (cherry picked from commit 113de85) Co-authored-by: Neil Schemenauer <nas-github@arctrix.com> Co-authored-by: Sam Gross <colesbury@gmail.com>
1 parent 090a42b
8000
 commit d7d6412

File tree

9 files changed

+129
-27
lines changed

9 files changed

+129
-27
lines changed

Include/internal/pycore_pymem.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ extern wchar_t *_PyMem_DefaultRawWcsdup(const wchar_t *str);
8888
extern int _PyMem_DebugEnabled(void);
8989

9090
// Enqueue a pointer to be freed possibly after some delay.
91-
extern void _PyMem_FreeDelayed(void *ptr);
91+
extern void _PyMem_FreeDelayed(void *ptr, size_t size);
9292

9393
// Enqueue an object to be freed possibly after some delay
9494
#ifdef Py_GIL_DISABLED

Include/internal/pycore_qsbr.h

8000
Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,21 @@ struct _qsbr_thread_state {
4848
// Thread state (or NULL)
4949
PyThreadState *tstate;
5050

51-
// Used to defer advancing write sequence a fixed number of times
52-
int deferrals;
51+
// Number of held items added by this thread since the last write sequence
52+
// advance
53+
int deferred_count;
54+
55+
// Estimate for the amount of memory that is held by this thread since
56+
// the last write sequence advance
57+
size_t deferred_memory;
58+
59+
// Amount of memory in mimalloc pages deferred from collection. When
60+
// deferred, they are prevented from being used for a different size class
61+
// and in a different thread.
62+
size_t deferred_page_memory;
63+
64+
// True if the deferred memory frees should be processed.
65+
bool should_process;
5366

5467
// Is this thread state allocated?
5568
bool allocated;
@@ -109,11 +122,17 @@ _Py_qbsr_goal_reached(struct _qsbr_thread_state *qsbr, uint64_t goal)
109122
extern uint64_t
110123
_Py_qsbr_advance(struct _qsbr_shared *shared);
111124

112-
// Batches requests to advance the write sequence. This advances the write
113-
// sequence every N calls, which reduces overhead but increases time to
114-
// reclamation. Returns the new goal.
125+
// Return the next value for the write sequence (current plus the increment).
115126
extern uint64_t
116-
_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr);
127+
_Py_qsbr_shared_next(struct _qsbr_shared *shared);
128+
129+
// Return true if deferred memory frees held by QSBR should be processed to
130+
// determine if they can be safely freed.
131+
static inline bool
132+
_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr)
133+
{
134+
return qsbr->should_process;
135+
}
117136

118137
// Have the read sequences advanced to the given goal? If this returns true,
119138
// it is safe to reclaim any memory tagged with the goal (or earlier goal).
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Limit excess memory usage in the :term:`free threading` build when a
2+
large dictionary or list is resized and accessed by multiple threads.

Objects/codeobject.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3369,7 +3369,7 @@ create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx)
33693369
}
33703370
memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *));
33713371
_Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc);
3372-
_PyMem_FreeDelayed(tlbc);
3372+
_PyMem_FreeDelayed(tlbc, tlbc->size * sizeof(void *));
33733373
tlbc = new_tlbc;
33743374
}
33753375
char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co));

Objects/dictobject.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -813,7 +813,7 @@ free_keys_object(PyDictKeysObject *keys, bool use_qsbr)
813813
{
814814
#ifdef Py_GIL_DISABLED
815815
if (use_qsbr) {
816-
_PyMem_FreeDelayed(keys);
816+
_PyMem_FreeDelayed(keys, _PyDict_KeysSize(keys));
817817
return;
818818
}
819819
#endif
@@ -858,7 +858,7 @@ free_values(PyDictValues *values, bool use_qsbr)
858858
assert(values->embedded == 0);
859859
#ifdef Py_GIL_DISABLED
860860
if (use_qsbr) {
861-
_PyMem_FreeDelayed(values);
861+
_PyMem_FreeDelayed(values, values_size_from_count(values->capacity));
862862
return;
863863
}
864864
#endif

Objects/listobject.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ free_list_items(PyObject** items, bool use_qsbr)
6161
#ifdef Py_GIL_DISABLED
6262
_PyListArray *array = _Py_CONTAINER_OF(items, _PyListArray, ob_item);
6363
if (use_qsbr) {
64-
_PyMem_FreeDelayed(array);
64+
size_t size = sizeof(_PyListArray) + array->allocated * sizeof(PyObject *);
65+
_PyMem_FreeDelayed(array, size);
6566
}
6667
else {
6768
PyMem_Free(array);

Objects/obmalloc.c

Lines changed: 90 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,33 @@ _PyMem_mi_page_is_safe_to_free(mi_page_t *page)
124124

125125
}
126126

127+
#ifdef Py_GIL_DISABLED
128+
129+
// If we are deferring collection of more than this amount of memory for
130+
// mimalloc pages, advance the write sequence. Advancing allows these
131+
// pages to be re-used in a different thread or for a different size class.
132+
#define QSBR_PAGE_MEM_LIMIT (4096*20)  // bytes; parenthesized so the macro expands as one value in any expression
133+
134+
// Return true if the global write sequence should be advanced for a mimalloc
135+
// page that is deferred from collection.
136+
static bool
137+
should_advance_qsbr_for_page(struct _qsbr_thread_state *qsbr, mi_page_t *page)
138+
{
139+
size_t bsize = mi_page_block_size(page);
140+
size_t page_size = page->capacity*bsize;
141+
if (page_size > QSBR_PAGE_MEM_LIMIT) {
142+
qsbr->deferred_page_memory = 0;
143+
return true;
144+
}
145+
qsbr->deferred_page_memory += page_size;
146+
if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
147+
qsbr->deferred_page_memory = 0;
148+
return true;
149+
}
150+
return false;
151+
}
152+
#endif
153+
127154
static bool
128155
_PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
129156
{
@@ -139,7 +166,14 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
139166

140167
_PyMem_mi_page_clear_qsbr(page);
141168
page->retire_expire = 0;
142-
page->qsbr_goal = _Py_qsbr_deferred_advance(tstate->qsbr);
169+
170+
if (should_advance_qsbr_for_page(tstate->qsbr, page)) {
171+
page->qsbr_goal = _Py_qsbr_advance(tstate->qsbr->shared);
172+
}
173+
else {
174+
page->qsbr_goal = _Py_qsbr_shared_next(tstate->qsbr->shared);
175+
}
176+
143177
llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node);
144178
return false;
145179
}
@@ -1141,8 +1175,44 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state)
11411175
}
11421176
}
11431177

1178+
1179+
#ifdef Py_GIL_DISABLED
1180+
1181+
// For deferred advance on free: the number of deferred items before advancing
1182+
// the write sequence. This is based on WORK_ITEMS_PER_CHUNK. We ideally
1183+
// want to process a chunk before it overflows.
1184+
#define QSBR_DEFERRED_LIMIT 127
1185+
1186+
// If the deferred memory exceeds 1 MiB, advance the write sequence. This
1187+
// helps limit memory usage due to QSBR delaying frees too long.
1188+
#define QSBR_FREE_MEM_LIMIT (1024*1024)  // 1 MiB in bytes; parenthesized so the macro expands as one value in any expression
1189+
1190+
// Return true if the global write sequence should be advanced for a deferred
1191+
// memory free.
1192+
static bool
1193+
should_advance_qsbr_for_free(struct _qsbr_thread_state *qsbr, size_t size)
1194+
{
1195+
if (size > QSBR_FREE_MEM_LIMIT) {
1196+
qsbr->deferred_count = 0;
1197+
qsbr->deferred_memory = 0;
1198+
qsbr->should_process = true;
1199+
return true;
1200+
}
1201+
qsbr->deferred_count++;
1202+
qsbr->deferred_memory += size;
1203+
if (qsbr->deferred_count > QSBR_DEFERRED_LIMIT ||
1204+
qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT) {
1205+
qsbr->deferred_count = 0;
1206+
qsbr->deferred_memory = 0;
1207+
qsbr->should_process = true;
1208+
return true;
1209+
}
1210+
return false;
1211+
}
1212+
#endif
1213+
11441214
static void
1145-
free_delayed(uintptr_t ptr)
1215+
free_delayed(uintptr_t ptr, size_t size)
11461216
{
11471217
#ifndef Py_GIL_DISABLED
11481218
free_work_item(ptr, NULL, NULL);
@@ -1200,23 +1270,32 @@ free_delayed(uintptr_t ptr)
12001270
}
12011271

12021272
assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
1203-
uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr);
1273+
uint64_t seq;
1274+
if (should_advance_qsbr_for_free(tstate->qsbr, size)) {
1275+
seq = _Py_qsbr_advance(tstate->qsbr->shared);
1276+
}
1277+
else {
1278+
seq = _Py_qsbr_shared_next(tstate->qsbr->shared);
1279+
}
12041280
buf->array[buf->wr_idx].ptr = ptr;
12051281
buf->array[buf->wr_idx].qsbr_goal = seq;
12061282
buf->wr_idx++;
12071283

12081284
if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
1285+
// Normally the processing of delayed items is done from the eval
1286+
// breaker. Processing here is a safety measure to ensure too much
1287+
// work does not accumulate.
12091288
_PyMem_ProcessDelayed((PyThreadState *)tstate);
12101289
}
12111290
#endif
12121291
}
12131292

12141293
void
1215-
_PyMem_FreeDelayed(void *ptr)
1294+
_PyMem_FreeDelayed(void *ptr, size_t size)
12161295
{
12171296
assert(!((uintptr_t)ptr & 0x01));
12181297
if (ptr != NULL) {
1219-
free_delayed((uintptr_t)ptr);
1298+
free_delayed((uintptr_t)ptr, size);
12201299
}
12211300
}
12221301

@@ -1226,7 +1305,10 @@ _PyObject_XDecRefDelayed(PyObject *ptr)
12261305
{
12271306
assert(!((uintptr_t)ptr & 0x01));
12281307
if (ptr != NULL) {
1229-
free_delayed(((uintptr_t)ptr)|0x01);
1308+
// We use 0 as the size since we don't have an easy way to know the
1309+
// actual size. If we are freeing many objects, the write sequence
1310+
// will be advanced due to QSBR_DEFERRED_LIMIT.
1311+
free_delayed(((uintptr_t)ptr)|0x01, 0);
12301312
}
12311313
}
12321314
#endif
@@ -1302,6 +1384,8 @@ _PyMem_ProcessDelayed(PyThreadState *tstate)
13021384
PyInterpreterState *interp = tstate->interp;
13031385
_PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
13041386

1387+
tstate_impl->qsbr->should_process = false;
1388+
13051389
// Process thread-local work
13061390
process_queue(&tstate_impl->mem_free_queue, tstate_impl, true, NULL, NULL);
13071391

Python/ceval_gil.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1387,6 +1387,10 @@ _Py_HandlePending(PyThreadState *tstate)
13871387
_Py_unset_eval_breaker_bit(tstate, _PY_EVAL_EXPLICIT_MERGE_BIT);
13881388
_Py_brc_merge_refcounts(tstate);
13891389
}
1390+
/* Process deferred memory frees held by QSBR */
1391+
if (_Py_qsbr_should_process(((_PyThreadStateImpl *)tstate)->qsbr)) {
1392+
_PyMem_ProcessDelayed(tstate);
1393+
}
13901394
#endif
13911395

13921396
/* GC scheduled to run */

Python/qsbr.c

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,6 @@
4141
// Starting size of the array of qsbr thread states
4242
#define MIN_ARRAY_SIZE 8
4343

44-
// For _Py_qsbr_deferred_advance(): the number of deferrals before advancing
45-
// the write sequence.
46-
#define QSBR_DEFERRED_LIMIT 10
47-
4844
// Allocate a QSBR thread state from the freelist
4945
static struct _qsbr_thread_state *
5046
qsbr_allocate(struct _qsbr_shared *shared)
@@ -117,13 +113,9 @@ _Py_qsbr_advance(struct _qsbr_shared *shared)
117113
}
118114

119115
uint64_t
120-
_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr)
116+
_Py_qsbr_shared_next(struct _qsbr_shared *shared)
121117
{
122-
if (++qsbr->deferrals < QSBR_DEFERRED_LIMIT) {
123-
return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
124-
}
125-
qsbr->deferrals = 0;
126-
return _Py_qsbr_advance(qsbr->shared);
118+
return _Py_qsbr_shared_current(shared) + QSBR_INCR;
127119
}
128120

129121
static uint64_t

0 commit comments

Comments
 (0)
0