8000 GH-133136: Revise QSBR to reduce excess memory held by nascheme · Pull Request #135473 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

GH-133136: Revise QSBR to reduce excess memory held #135473

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jun 25, 2025
Prev Previous commit
Next Next commit
Revise based on review feedback.
* Keep separate count of mimalloc page memory that is deferred from
  collection.  This memory doesn't get freed by _PyMem_ProcessDelayed().
  We want to advance the write sequence if there is too much of it
  but calling _PyMem_ProcessDelayed() is not helpful.

* Use `process_seq` variable to schedule the next call to
  `_PyMem_ProcessDelayed()`.

* Rename advance functions to have "deferred" in name.

* Move `_Py_qsbr_should_process()` call up one level.
  • Loading branch information
nascheme committed Jun 16, 2025
commit 7ea28fffe1ca379e3f29440b5e0e70d1a5a01435
31 changes: 21 additions & 10 deletions Include/internal/pycore_qsbr.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,14 @@ struct _qsbr_thread_state {
// the last write sequence advance
size_t deferred_memory;

// Sequence number at time of last "should process" check.
uint64_t seq_last_check;
// Amount of memory in mimalloc pages deferred from collection. When
// deferred, they are prevented from being used for a different size class
// or by a different thread.
size_t deferred_page_memory;

// If non-zero, processing of deferred memory should be performed if the
// read sequence has reached this value.
uint64_t process_seq;

// Is this thread state allocated?
bool allocated;
Expand All @@ -66,7 +72,7 @@ struct _qsbr_thread_state {
// Padding to avoid false sharing
struct _qsbr_pad {
struct _qsbr_thread_state qsbr;
char __padding[64 - sizeof(struct _qsbr_thread_state)];
char __padding[128 - sizeof(struct _qsbr_thread_state)];
};

// Per-interpreter state
Expand Down Expand Up @@ -117,14 +123,19 @@ extern uint64_t
_Py_qsbr_advance(struct _qsbr_shared *shared);

// Advance the write sequence as required and return the sequence goal to use
// for memory to be freed. If the sequence is advanced, this goal is the new
// sequence value, otherwise it is the next sequence value. In either case,
// the goal is higher than any write sequence value already observed by readers.
//
// The 'size' argument is the size in bytes of the memory scheduled to be
// freed. If that size is not available, pass zero as the value.
// for memory to be freed. The 'free_size' argument is the size in bytes of
// the memory scheduled to be freed. If that size is not available, pass zero
// as the value.
extern uint64_t
_Py_qsbr_deferred_advance_for_free(struct _qsbr_thread_state *qsbr,
size_t free_size);

// Advance the write sequence as required and return the sequence goal to use
// for a mimalloc page to be collected. The 'page_size' argument is the size
// of the mimalloc page being deferred from collection.
extern uint64_t
_Py_qsbr_advance_with_size(struct _qsbr_thread_state *qsbr, size_t size);
_Py_qsbr_deferred_advance_for_page(struct _qsbr_thread_state *qsbr,
size_t page_size);

// Return true if memory held by QSBR should be processed to determine if it
// can be safely freed.
Expand Down
18 changes: 10 additions & 8 deletions Objects/obmalloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)
page->retire_expire = 0;

size_t bsize = mi_page_block_size(page);
page->qsbr_goal = _Py_qsbr_advance_with_size(tstate->qsbr, page->capacity*bsize);
page->qsbr_goal = _Py_qsbr_deferred_advance_for_page(tstate->qsbr, page->capacity*bsize);

llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node);
return false;
Expand Down Expand Up @@ -1203,12 +1203,14 @@ free_delayed(uintptr_t ptr, size_t size)
}

assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
uint64_t seq = _Py_qsbr_advance_with_size(tstate->qsbr, size);
uint64_t seq = _Py_qsbr_deferred_advance_for_free(tstate->qsbr, size);
buf->array[buf->wr_idx].ptr = ptr;
buf->array[buf->wr_idx].qsbr_goal = seq;
buf->wr_idx++;

_PyMem_ProcessDelayed((PyThreadState *)tstate);
if (_Py_qsbr_should_process(tstate->qsbr)) {
_PyMem_ProcessDelayed((PyThreadState *)tstate);
}
#endif
}

Expand All @@ -1227,7 +1229,10 @@ _PyObject_XDecRefDelayed(PyObject *ptr)
{
assert(!((uintptr_t)ptr & 0x01));
if (ptr != NULL) {
free_delayed(((uintptr_t)ptr)|0x01, 64);
// We use 0 as the size since we don't have an easy way to know the
// actual size. If we are freeing many objects, the write sequence
// will be advanced due to QSBR_DEFERRED_LIMIT.
free_delayed(((uintptr_t)ptr)|0x01, 0);
}
}
#endif
Expand Down Expand Up @@ -1300,11 +1305,8 @@ maybe_process_interp_queue(struct _Py_mem_interp_free_queue *queue,
void
_PyMem_ProcessDelayed(PyThreadState *tstate)
{
_PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
if (!_Py_qsbr_should_process(tstate_impl->qsbr)) {
return;
}
PyInterpreterState *interp = tstate->interp;
_PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;

// Process thread-local work
process_queue(&tstate_impl->mem_free_queue, tstate_impl, true, NULL, NULL);
Expand Down
62 changes: 37 additions & 25 deletions Python/qsbr.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,19 @@
// Starting size of the array of qsbr thread states
#define MIN_ARRAY_SIZE 8

// For should_advance_qsbr(): the number of deferred items before advancing
// For deferred advance on free: the number of deferred items before advancing
// the write sequence. This is based on WORK_ITEMS_PER_CHUNK. We ideally
// want to process a chunk before it overflows.
#define QSBR_DEFERRED_LIMIT 127

// If the deferred memory exceeds 1 MiB, we force an advance in the
// shared QSBR sequence number to limit excess memory usage.
#define QSBR_MEM_LIMIT 1024*1024
#define QSBR_FREE_MEM_LIMIT 1024*1024

// If we are deferring collection of more than this amount of memory for
// mimalloc pages, advance the write sequence. Advancing allows these
// pages to be re-used in a different thread or for a different size class.
#define QSBR_PAGE_MEM_LIMIT 4096*10

// Allocate a QSBR thread state from the freelist
static struct _qsbr_thread_state *
Expand Down Expand Up @@ -121,43 +126,50 @@ _Py_qsbr_advance(struct _qsbr_shared *shared)
return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR);
}

static int
should_advance_qsbr(struct _qsbr_thread_state *qsbr, size_t size)
uint64_t
_Py_qsbr_deferred_advance_for_page(struct _qsbr_thread_state *qsbr, size_t page_size)
{
qsbr->deferred_count++;
qsbr->deferred_memory += size;
if (qsbr->deferred_count >= QSBR_DEFERRED_LIMIT ||
qsbr->deferred_memory > QSBR_MEM_LIMIT) {
qsbr->deferred_count = 0;
qsbr->deferred_memory = 0;
return 1;
qsbr->deferred_page_memory += page_size;
if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
qsbr->deferred_page_memory = 0;
// Advance the write sequence and return the updated value as the goal.
return _Py_qsbr_advance(qsbr->shared);
}
return 0;
// Don't advance, return the next sequence value as the goal.
return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
}

uint64_t
_Py_qsbr_advance_with_size(struct _qsbr_thread_state *qsbr, size_t size)
_Py_qsbr_deferred_advance_for_free(struct _qsbr_thread_state *qsbr, size_t free_size)
{
if (should_advance_qsbr(qsbr, size)) {
// Advance the write sequence and return the updated value as the goal.
return _Py_qsbr_advance(qsbr->shared);
}
else {
// Don't advance, return the next sequence value as the goal.
return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
qsbr->deferred_count++;
qsbr->deferred_memory += free_size;
if (qsbr->deferred_count >= QSBR_DEFERRED_LIMIT ||
qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT) {
qsbr->deferred_count = 0;
qsbr->deferred_memory = 0;
// Advance the write sequence
uint64_t seq = _Py_qsbr_advance(qsbr->shared);
if (qsbr->process_seq == 0) {
// Process the queue of deferred frees when the read sequence
// reaches this value. We don't process immediately because
// we want to give readers a chance to advance their sequence.
qsbr->process_seq = seq;
}
// Return current (just advanced) sequence as the goal.
return seq;
}
// Don't advance, return the next sequence value as the goal.
return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
}

bool
_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr)
{
if (qsbr->seq_last_check == qsbr->seq) {
// If 'seq' for this thread hasn't advanced, it is unlikely that any
// deferred memory is ready to be freed. Wait longer before trying
// to process.
if (qsbr->process_seq == 0 || qsbr->seq < qsbr->process_seq) {
return false;
}
qsbr->seq_last_check = qsbr->seq;
qsbr->process_seq = 0;
return true;
}

Expand Down
Loading
0