gh-115103: Implement delayed free mechanism for free-threaded builds · python/cpython@a661366 · GitHub
[go: up one dir, main page]

Skip to content

Commit a661366

Browse files
committed
gh-115103: Implement delayed free mechanism for free-threaded builds
This adds `_PyMem_FreeDelayed()` and supporting functions. The `_PyMem_FreeDelayed()` function frees memory with the same allocator as `PyMem_Free()`, but after some delay to ensure that concurrent lock-free readers have finished.
1 parent 35cbc0e commit a661366

File tree

8 files changed

+206
-0
lines changed

8 files changed

+206
-0
lines changed

Include/internal/pycore_interp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ struct _is {
235235

236236
struct _Py_dict_state dict_state;
237237
struct _Py_exc_state exc_state;
238+
struct _Py_mem_interp_free_queue mem_free_queue;
238239

239240
struct ast_state ast;
240241
struct types_state types;

Include/internal/pycore_pymem.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#ifndef Py_INTERNAL_PYMEM_H
22
#define Py_INTERNAL_PYMEM_H
33

4+
#include "pycore_llist.h" // struct llist_node
45
#include "pycore_lock.h" // PyMutex
56

67
#ifdef __cplusplus
@@ -48,6 +49,11 @@ struct _pymem_allocators {
4849
PyObjectArenaAllocator obj_arena;
4950
};
5051

52+
// Per-interpreter queue of memory blocks whose freeing has been delayed
// until concurrent lock-free readers are done with them (see
// _PyMem_FreeDelayed in Objects/obmalloc.c).  Exiting threads merge their
// thread-local queues into this one via _PyMem_AbandonDelayed.
struct _Py_mem_interp_free_queue {
53+
// Fast-path hint: also read/written with relaxed atomics outside the
// mutex (see process_interp_work_queue in Objects/obmalloc.c).
int has_work; // true if the queue is not empty
54+
PyMutex mutex; // protects the queue
55+
struct llist_node head; // queue of _mem_work_chunk items
56+
};
5157

5258
/* Set the memory allocator of the specified domain to the default.
5359
Save the old allocator into *old_alloc if it's non-NULL.
@@ -110,6 +116,19 @@ extern int _PyMem_SetupAllocators(PyMemAllocatorName allocator);
110116
/* Is the debug allocator enabled? */
111117
extern int _PyMem_DebugEnabled(void);
112118

119+
// Enqueue a pointer to be freed possibly after some delay.
120+
extern void _PyMem_FreeDelayed(void *ptr);
121+
122+
// Periodically process delayed free requests.
123+
extern void _PyMem_ProcessDelayed(PyThreadState *tstate);
124+
125+
// Abandon all thread-local delayed free requests and push them to the
126+
// interpreter's queue.
127+
extern void _PyMem_AbandonDelayed(PyThreadState *tstate);
128+
129+
// On interpreter shutdown, frees all delayed free requests.
130+
extern void _PyMem_FiniDelayed(PyInterpreterState *interp);
131+
113132
#ifdef __cplusplus
114133
}
115134
#endif

Include/internal/pycore_pymem_init.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,11 @@ extern void _PyMem_ArenaFree(void *, void *, size_t);
9292
{ NULL, _PyMem_ArenaAlloc, _PyMem_ArenaFree }
9393

9494

95+
#define _Py_mem_free_queue_INIT(queue) \
96+
{ \
97+
.head = LLIST_INIT(queue.head), \
98+
}
99+
95100
#ifdef __cplusplus
96101
}
97102
#endif

Include/internal/pycore_runtime_init.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ extern PyTypeObject _PyExc_MemoryError;
176176
.object_state = _py_object_state_INIT(INTERP), \
177177
.dtoa = _dtoa_state_INIT(&(INTERP)), \
178178
.dict_state = _dict_state_INIT, \
179+
.mem_free_queue = _Py_mem_free_queue_INIT(INTERP.mem_free_queue), \
179180
.func_state = { \
180181
.next_version = 1, \
181182
}, \

Include/internal/pycore_tstate.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ typedef struct _PyThreadStateImpl {
2222
PyThreadState base;
2323

2424
struct _qsbr_thread_state *qsbr; // only used by free-threaded build
25+
struct llist_node mem_free_queue; // delayed free queue
2526

2627
#ifdef Py_GIL_DISABLED
2728
struct _mimalloc_thread_state mimalloc;

Objects/obmalloc.c

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -948,6 +948,176 @@ _PyMem_Strdup(const char *str)
948948
return copy;
949949
}
950950

951+
/***********************************************/
/* Delayed freeing support for Py_GIL_DISABLED */
952+
/***********************************************/
953+
954+
955+
// So that sizeof(struct _mem_work_chunk) is 4096 bytes on 64-bit platforms.
956+
#define WORK_ITEMS_PER_CHUNK 254
957+
958+
// A pointer to be freed once the QSBR read sequence reaches qsbr_goal.
959+
struct _mem_work_item {
960+
void *ptr;
961+
uint64_t qsbr_goal;
962+
};
963+
964+
// A fixed-size buffer of pointers to be freed
// Items are appended at wr_idx and drained from rd_idx, i.e. FIFO order.
// The indices are deliberately non-atomic: a chunk is either in a single
// thread's local queue or accessed under the interpreter queue's mutex.
965+
struct _mem_work_chunk {
966+
// Linked list node of chunks in queue
967+
struct llist_node node;
968+
969+
Py_ssize_t rd_idx; // index of next item to read
970+
Py_ssize_t wr_idx; // index of next item to write
971+
struct _mem_work_item array[WORK_ITEMS_PER_CHUNK];
972+
};
973+
974+
// Free `ptr` with the same allocator as PyMem_Free(), but (on free-threaded
// builds) only after all concurrent lock-free readers have passed a QSBR
// quiescent state.  Falls back to an immediate free whenever delaying is
// unnecessary (world stopped, non-free-threaded build) or impossible (chunk
// allocation failure).
void
975+
_PyMem_FreeDelayed(void *ptr)
976+
{
977+
#ifndef Py_GIL_DISABLED
978+
// With the GIL there are no concurrent lock-free readers; no delay needed.
PyMem_Free(ptr);
979+
#else
980+
if (_PyRuntime.stoptheworld.world_stopped) {
981+
// Free immediately if the world is stopped, including during
982+
// interpreter shutdown.
983+
PyMem_Free(ptr);
984+
return;
985+
}
986+
987+
_PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
988+
struct llist_node *head = &tstate->mem_free_queue;
989+
990+
struct _mem_work_chunk *buf = NULL;
991+
if (!llist_empty(head)) {
992+
// Try to re-use the last buffer
993+
buf = llist_data(head->prev, struct _mem_work_chunk, node);
994+
if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
995+
// already full
996+
buf = NULL;
997+
}
998+
}
999+
1000+
if (buf == NULL) {
1001+
// Start a fresh chunk at the tail of the thread-local queue.
buf = PyMem_Calloc(1, sizeof(*buf));
1002+
if (buf != NULL) {
1003+
llist_insert_tail(head, &buf->node);
1004+
}
1005+
}
1006+
1007+
if (buf == NULL) {
1008+
// failed to allocate a buffer, free immediately
1009+
// Stopping the world makes the immediate free safe: no other thread
// can be reading the memory concurrently while stopped.
_PyEval_StopTheWorld(tstate->base.interp);
1010+
PyMem_Free(ptr);
1011+
_PyEval_StartTheWorld(tstate->base.interp);
1012+
return;
1013+
}
1014+
1015+
assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
1016+
// Record the global QSBR sequence that must be reached before `ptr` may
// actually be freed (stored as the item's qsbr_goal).
uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr);
1017+
buf->array[buf->wr_idx].ptr = ptr;
1018+
buf->array[buf->wr_idx].qsbr_goal = seq;
1019+
buf->wr_idx++;
1020+
1021+
if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
1022+
// Chunk is now full: opportunistically drain anything already safe.
_PyMem_ProcessDelayed((PyThreadState *)tstate);
1023+
}
1024+
#endif
1025+
}
1026+
1027+
static struct _mem_work_chunk *
1028+
work_queue_first(struct llist_node *head)
1029+
{
1030+
return llist_data(head->next, struct _mem_work_chunk, node);
1031+
}
1032+
1033+
// Drain a delayed-free queue: free every pointer whose qsbr_goal has been
// reached and release chunks once fully consumed.  Stops at the first item
// whose grace period has not yet elapsed.
// NOTE(review): the early return assumes qsbr_goal values are non-decreasing
// in queue order (items are appended in enqueue order) — confirm against
// _Py_qsbr_deferred_advance.
static void
1034+
process_work_queue(struct llist_node *head,
1035+
struct _qsbr_thread_state *qsbr)
1036+
{
1037+
while (!llist_empty(head)) {
1038+
struct _mem_work_chunk *buf = work_queue_first(head);
1039+
1040+
if (buf->rd_idx == buf->wr_idx) {
1041+
// Chunk fully consumed: unlink it and free the chunk itself.
llist_remove(&buf->node);
1042+
PyMem_Free(buf);
1043+
continue;
1044+
}
1045+
1046+
struct _mem_work_item *item = &buf->array[buf->rd_idx];
1047+
if (!_Py_qsbr_poll(qsbr, item->qsbr_goal)) {
1048+
// Not yet safe to free this item; leave the rest for later.
return;
1049+
}
1050+
1051+
PyMem_Free(item->ptr);
1052+
buf->rd_idx++;
1053+
}
1054+
}
1055+
1056+
static void
1057+
process_interp_work_queue(struct _Py_mem_interp_free_queue *queue,
1058+
struct _qsbr_thread_state *qsbr)
1059+
{
1060+
if (!_Py_atomic_load_int_relaxed(&queue->has_work)) {
1061+
return;
1062+
}
1063+
1064+
PyMutex_Lock(&queue->mutex);
1065+
process_work_queue(&queue->head, qsbr);
1066+
_Py_atomic_store_int_relaxed(&queue->has_work, !llist_empty(&queue->head));
1067+
PyMutex_Unlock(&queue->mutex);
1068+
}
1069+
1070+
void
1071+
_PyMem_ProcessDelayed(PyThreadState *tstate)
1072+
{
1073+
PyInterpreterState *interp = tstate->interp;
1074+
_PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
1075+
1076+
// Process thread-local work
1077+
process_work_queue(&tstate_impl->mem_free_queue, tstate_impl->qsbr);
1078+
1079+
// Process shared interpreter work
1080+
process_interp_work_queue(&interp->mem_free_queue, tstate_impl->qsbr);
1081+
}
1082+
1083+
void
1084+
_PyMem_AbandonDelayed(PyThreadState *tstate)
1085+
{
1086+
PyInterpreterState *interp = tstate->interp;
1087+
struct llist_node *queue = &((_PyThreadStateImpl *)tstate)->mem_free_queue;
1088+
1089+
if (llist_empty(queue)) {
1090+
return;
1091+
}
1092+
1093+
// Merge the thread's work queue into the interpreter's work queue.
1094+
PyMutex_Lock(&interp->mem_free_queue.mutex);
1095+
llist_concat(&interp->mem_free_queue.head, queue);
1096+
_Py_atomic_store_int_relaxed(&interp->mem_free_queue.has_work, 1);
1097+
PyMutex_Unlock(&interp->mem_free_queue.mutex);
1098+
1099+
assert(llist_empty(queue)); // the thread's queue is now empty
1100+
}
1101+
1102+
void
1103+
_PyMem_FiniDelayed(PyInterpreterState *interp)
1104+
{
1105+
struct llist_node *head = &interp->mem_free_queue.head;
1106+
while (!llist_empty(head)) {
1107+
struct _mem_work_chunk *buf = work_queue_first(head);
1108+
1109+
while (buf->rd_idx < buf->wr_idx) {
1110+
// Free the remaining items immediately. There should be no other
1111+
// threads accessing the memory at this point during shutdown.
1112+
struct _mem_work_item *item = &buf->array[buf->rd_idx];
1113+
PyMem_Free(item->ptr);
1114+
buf->rd_idx++;
1115+
}
1116+
1117+
llist_remove(&buf->node);
1118+
PyMem_Free(buf);
1119+
}
1120+
}
9511121

9521122
/**************************/
9531123
/* the "object" allocator */

Python/pylifecycle.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1835,6 +1835,9 @@ finalize_interp_clear(PyThreadState *tstate)
18351835

18361836
finalize_interp_types(tstate->interp);
18371837

1838+
/* Free any delayed free requests immediately */
1839+
_PyMem_FiniDelayed(tstate->interp);
1840+
18381841
/* finalize_interp_types may allocate Python objects so we may need to
18391842
abandon mimalloc segments again */
18401843
_PyThreadState_ClearMimallocHeaps(tstate);

Python/pystate.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,7 @@ init_interpreter(PyInterpreterState *interp,
614614
#ifdef Py_GIL_DISABLED
615615
_Py_brc_init_state(interp);
616616
#endif
617+
llist_init(&interp->mem_free_queue.head);
617618
for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) {
618619
interp->monitors.tools[i] = 0;
619620
}
@@ -1345,6 +1346,7 @@ init_threadstate(_PyThreadStateImpl *_tstate,
13451346
// Initialize biased reference counting inter-thread queue
13461347
_Py_brc_init_thread(tstate);
13471348
#endif
1349+
llist_init(&_tstate->mem_free_queue);
13481350

13491351
if (interp->stoptheworld.requested || _PyRuntime.stoptheworld.requested) {
13501352
// Start in the suspended state if there is an ongoing stop-the-world.
@@ -1563,6 +1565,7 @@ PyThreadState_Clear(PyThreadState *tstate)
15631565
// don't call _PyInterpreterState_SetNotRunningMain() yet.
15641566
tstate->on_delete(tstate->on_delete_data);
15651567
}
1568+
15661569
#ifdef Py_GIL_DISABLED
15671570
// Each thread should clear own freelists in free-threading builds.
15681571
_PyFreeListState *freelist_state = _PyFreeListState_GET();
@@ -1572,6 +1575,9 @@ PyThreadState_Clear(PyThreadState *tstate)
15721575
_Py_brc_remove_thread(tstate);
15731576
#endif
15741577

1578+
// Merge our queue of pointers to be freed into the interpreter queue.
1579+
_PyMem_AbandonDelayed(tstate);
1580+
15751581
_PyThreadState_ClearMimallocHeaps(tstate);
15761582

15771583
tstate->_status.cleared = 1;

0 commit comments

Comments (0)