From 36feeb1bcaa225c82cb11e22526028e7afdf887d Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 24 Nov 2023 15:02:08 -0800 Subject: [PATCH 01/37] Skip ENTER_EXECUTOR as deopt target (use vm_data) --- Python/ceval.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Python/ceval.c b/Python/ceval.c index d92ab926f84963..04fe345b0493ed 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1074,6 +1074,18 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int UOP_STAT_INC(uopcode, miss); frame->return_offset = 0; // Dispatch to frame->instr_ptr _PyFrame_SetStackPointer(frame, stack_pointer); + if (frame->instr_ptr->op.code == ENTER_EXECUTOR) { + // Avoid recursing into the same executor over and over + Py_DECREF(current_executor); + next_instr = frame->instr_ptr; + PyCodeObject *code = _PyFrame_GetCode(frame); + oparg = next_instr->op.arg; + _PyExecutorObject *executor = (_PyExecutorObject *)code->co_executors->executors[oparg&255]; + opcode = executor->vm_data.opcode; + oparg = executor->vm_data.oparg; + DPRINTF(2, "Avoiding ENTER_EXECUTOR in favor of underlying %s\n", _PyOpcode_OpName[opcode]); + DISPATCH_GOTO(); + } Py_DECREF(current_executor); // Fall through // Jump here from ENTER_EXECUTOR From d12533b2aceb85304cea40408270b597e209e96d Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 22 Nov 2023 21:19:39 -0800 Subject: [PATCH 02/37] Add an array of 'extras' to UOpExecutor --- Include/internal/pycore_uops.h | 1 + Python/optimizer.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Include/internal/pycore_uops.h b/Include/internal/pycore_uops.h index 153884f4bd2902..d587114bc556af 100644 --- a/Include/internal/pycore_uops.h +++ b/Include/internal/pycore_uops.h @@ -21,6 +21,7 @@ typedef struct { typedef struct { _PyExecutorObject base; + uintptr_t *extra; // An array of uintptr_t of size base.ob_base.ob_size _PyUOpInstruction trace[1]; } _PyUOpExecutorObject; diff --git 
a/Python/optimizer.c b/Python/optimizer.c index d44e733bc346fa..7b35ee6a5d03c2 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -379,7 +379,7 @@ PyTypeObject _PyUOpExecutor_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) .tp_name = "uop_executor", .tp_basicsize = sizeof(_PyUOpExecutorObject) - sizeof(_PyUOpInstruction), - .tp_itemsize = sizeof(_PyUOpInstruction), + .tp_itemsize = sizeof(_PyUOpInstruction) + sizeof(uintptr_t), .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION, .tp_dealloc = (destructor)uop_dealloc, .tp_as_sequence = &uop_as_sequence, @@ -825,6 +825,8 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies) if (executor == NULL) { return NULL; } + executor->extra = (uintptr_t *)(executor->trace + length); + memset(executor->extra, 0, sizeof(uintptr_t) * length); int dest = length - 1; /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */ for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) { From f21f2d810b695d17112c9d682ce1569f5dbc79c8 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 24 Nov 2023 11:16:53 -0800 Subject: [PATCH 03/37] Count side exits per uop loc and print if >= 10 --- Python/ceval.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Python/ceval.c b/Python/ceval.c index 04fe345b0493ed..c278a8e9e2e270 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1086,6 +1086,14 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int DPRINTF(2, "Avoiding ENTER_EXECUTOR in favor of underlying %s\n", _PyOpcode_OpName[opcode]); DISPATCH_GOTO(); } + // Increment side exit counter for this uop + int pc = next_uop - 1 - current_executor->trace; + uintptr_t *pcounter = current_executor->extra + pc; + *pcounter += 1; + if (*pcounter >= 10) { + DPRINTF(2, "--> %s @ %d in %p has %d side exits\n", + _PyUopName(uopcode), pc, current_executor, (int)(*pcounter)); + } Py_DECREF(current_executor); // Fall through // Jump 
here from ENTER_EXECUTOR From 846396589924635967ad4c61bb46832c1f3d2f81 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 24 Nov 2023 16:59:40 -0800 Subject: [PATCH 04/37] Add _PyOptimizer_Anywhere (not yet used) --- Include/cpython/optimizer.h | 4 ++-- Python/optimizer.c | 31 ++++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index d521eac79d1b97..80cde17c9be617 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -60,8 +60,8 @@ PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void); PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int offset); -int -_PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer); +int _PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer); +int _PyOptimizer_Anywhere(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, PyObject **stack_pointer); extern _PyOptimizerObject _PyOptimizer_Default; diff --git a/Python/optimizer.c b/Python/optimizer.c index 7b35ee6a5d03c2..5817e4bf5cc221 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -155,11 +155,12 @@ PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer) Py_DECREF(old); } -int -_PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer) +// src is where to insert ENTER_EXECUTOR +// dest is where to start tracing +static int +optimizer_wherever(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer) { - assert(src->op.code == JUMP_BACKWARD); - PyCodeObject *code = (PyCodeObject *)frame->f_executable; + PyCodeObject *code = _PyFrame_GetCode(frame); assert(PyCode_Check(code)); PyInterpreterState *interp = _PyInterpreterState_GET(); if (!has_space_for_executor(code, src)) { @@ -189,6 +190,26 @@ 
_PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNI return 1; } +int +_PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer) +{ + assert(src->op.code == JUMP_BACKWARD); + return optimizer_wherever(frame, src, dest, stack_pointer); +} + +// Start tracing at src and insert ENTER_EXECUTOR at the same place +int +_PyOptimizer_Anywhere(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, PyObject **stack_pointer) +{ + if (src->op.code == JUMP_BACKWARD) { + return 0; + } + if (src->op.code == ENTER_EXECUTOR) { + return 0; + } + return optimizer_wherever(frame, src, src, stack_pointer); +} + _PyExecutorObject * PyUnstable_GetExecutor(PyCodeObject *code, int offset) { @@ -499,7 +520,7 @@ translate_bytecode_to_trace( code = trace_stack[trace_stack_depth].code; \ instr = trace_stack[trace_stack_depth].instr; - DPRINTF(4, + DPRINTF(2, "Optimizing %s (%s:%d) at byte offset %d\n", PyUnicode_AsUTF8(code->co_qualname), PyUnicode_AsUTF8(code->co_filename), From 640375282c43cee57c5ffa26e74ec886baa32dc6 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 25 Nov 2023 10:35:27 +0900 Subject: [PATCH 05/37] Only jump in ENTER_EXECUTOR if overwriting JUMP_BACKWARD --- Python/bytecodes.c | 3 ++- Python/generated_cases.c.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 1ae83422730f8f..790f81797e2784 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2311,7 +2311,8 @@ dummy_func( uint16_t ucounter = this_instr[1].cache + (1 << 15); uint16_t threshold = tstate->interp->optimizer_backedge_threshold + (1 << 15); // Double-check that the opcode isn't instrumented or something: - if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) { + // Also bail if extended oparg (>= 256) + if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD && oparg < 256) { OPT_STAT_INC(attempts); int optimized = 
_PyOptimizer_BackEdge(frame, this_instr, next_instr, stack_pointer); ERROR_IF(optimized < 0, error); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 65e6f11f68b38c..6d51f21d83a64e 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -3223,7 +3223,8 @@ uint16_t ucounter = this_instr[1].cache + (1 << 15); uint16_t threshold = tstate->interp->optimizer_backedge_threshold + (1 << 15); // Double-check that the opcode isn't instrumented or something: - if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) { + // Also bail if extended oparg (>= 256) + if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD && oparg < 256) { OPT_STAT_INC(attempts); int optimized = _PyOptimizer_BackEdge(frame, this_instr, next_instr, stack_pointer); if (optimized < 0) goto error; From 329deadde1e13e39f94fed3528bb060e01fb10c9 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 25 Nov 2023 11:55:43 +0900 Subject: [PATCH 06/37] Assert base opcode in _Py_Specialize_ForIter --- Python/specialize.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/specialize.c b/Python/specialize.c index 7c2a4a42b1dcc3..f8ec59d2ebc273 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -2353,6 +2353,7 @@ int void _Py_Specialize_ForIter(PyObject *iter, _Py_CODEUNIT *instr, int oparg) { + assert(_PyOpcode_Deopt[instr->op.code] == FOR_ITER); assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[FOR_ITER] == INLINE_CACHE_ENTRIES_FOR_ITER); _PyForIterCache *cache = (_PyForIterCache *)(instr + 1); From f1998c0428d3b34f54a93454575eaba8f2b6b375 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 25 Nov 2023 14:58:52 +0900 Subject: [PATCH 07/37] Disable curses tests in --fast-ci mode (make test) --- Lib/test/libregrtest/cmdline.py | 2 +- Lib/test/test_regrtest.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/test/libregrtest/cmdline.py b/Lib/test/libregrtest/cmdline.py index 
0053bce4292f64..35971cdb88ee37 100644 --- a/Lib/test/libregrtest/cmdline.py +++ b/Lib/test/libregrtest/cmdline.py @@ -442,7 +442,7 @@ def _parse_args(args, **kwargs): # Similar to: -u "all,-cpu" --timeout=600 if ns.use is None: ns.use = [] - ns.use.insert(0, ['all', '-cpu']) + ns.use.insert(0, ['all', '-cpu', '-curses']) if ns.timeout is None: ns.timeout = 600 # 10 minutes diff --git a/Lib/test/test_regrtest.py b/Lib/test/test_regrtest.py index e828941f6c779d..3c73ba14bc507e 100644 --- a/Lib/test/test_regrtest.py +++ b/Lib/test/test_regrtest.py @@ -425,6 +425,7 @@ def test_fast_ci(self): args = ['--fast-ci'] use_resources = sorted(cmdline.ALL_RESOURCES) use_resources.remove('cpu') + use_resources.remove('curses') regrtest = self.check_ci_mode(args, use_resources) self.assertEqual(regrtest.timeout, 10 * 60) @@ -432,6 +433,7 @@ def test_fast_ci_python_cmd(self): args = ['--fast-ci', '--python', 'python -X dev'] use_resources = sorted(cmdline.ALL_RESOURCES) use_resources.remove('cpu') + use_resources.remove('curses') regrtest = self.check_ci_mode(args, use_resources, rerun=False) self.assertEqual(regrtest.timeout, 10 * 60) self.assertEqual(regrtest.python_cmd, ('python', '-X', 'dev')) @@ -441,6 +443,7 @@ def test_fast_ci_resource(self): args = ['--fast-ci', '-u-network'] use_resources = sorted(cmdline.ALL_RESOURCES) use_resources.remove('cpu') + use_resources.remove('curses') use_resources.remove('network') self.check_ci_mode(args, use_resources) From b0944e6bdb67a668cba5efd9ea4b10eeba2fa3c1 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 25 Nov 2023 14:59:53 +0900 Subject: [PATCH 08/37] Improve (?) 
check for executor recursion --- Python/ceval.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index c278a8e9e2e270..f5f5644d6b844f 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1072,20 +1072,30 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int _PyOpcode_OpName[frame->instr_ptr->op.code]); OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); UOP_STAT_INC(uopcode, miss); - frame->return_offset = 0; // Dispatch to frame->instr_ptr - _PyFrame_SetStackPointer(frame, stack_pointer); if (frame->instr_ptr->op.code == ENTER_EXECUTOR) { // Avoid recursing into the same executor over and over - Py_DECREF(current_executor); next_instr = frame->instr_ptr; PyCodeObject *code = _PyFrame_GetCode(frame); oparg = next_instr->op.arg; - _PyExecutorObject *executor = (_PyExecutorObject *)code->co_executors->executors[oparg&255]; - opcode = executor->vm_data.opcode; - oparg = executor->vm_data.oparg; - DPRINTF(2, "Avoiding ENTER_EXECUTOR in favor of underlying %s\n", _PyOpcode_OpName[opcode]); - DISPATCH_GOTO(); + _PyExecutorObject *executor = (_PyExecutorObject *)code->co_executors->executors[oparg]; + if (executor == (_PyExecutorObject *)current_executor) { + opcode = _PyOpcode_Deopt[executor->vm_data.opcode]; + DPRINTF(1, "Avoiding ENTER_EXECUTOR %d in favor of underlying base opcode %s %d\n", + oparg, _PyOpcode_OpName[opcode], executor->vm_data.oparg); + DPRINTF(1, + " for %s (%s:%d) at byte offset %d\n", + PyUnicode_AsUTF8(code->co_qualname), + PyUnicode_AsUTF8(code->co_filename), + code->co_firstlineno, + 2 * (int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame)))); + Py_DECREF(current_executor); + oparg = executor->vm_data.oparg; + PRE_DISPATCH_GOTO(); + DISPATCH_GOTO(); + } } + frame->return_offset = 0; // Don't leave this random + _PyFrame_SetStackPointer(frame, stack_pointer); // Increment side exit counter for this uop int pc = next_uop - 1 - 
current_executor->trace; uintptr_t *pcounter = current_executor->extra + pc; From 26b5f89c6491a513cbb17ef8b853869447b48e77 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 25 Nov 2023 15:01:54 +0900 Subject: [PATCH 09/37] Only generate extra executors for branches --- Python/ceval.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/Python/ceval.c b/Python/ceval.c index f5f5644d6b844f..a808fa7b0cd32c 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1100,9 +1100,29 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int int pc = next_uop - 1 - current_executor->trace; uintptr_t *pcounter = current_executor->extra + pc; *pcounter += 1; - if (*pcounter >= 10) { + if (*pcounter == 16 && + (frame->instr_ptr->op.code == POP_JUMP_IF_FALSE || + frame->instr_ptr->op.code == POP_JUMP_IF_TRUE || + frame->instr_ptr->op.code == POP_JUMP_IF_NONE || + frame->instr_ptr->op.code == POP_JUMP_IF_NOT_NONE)) + { DPRINTF(2, "--> %s @ %d in %p has %d side exits\n", _PyUopName(uopcode), pc, current_executor, (int)(*pcounter)); + DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[frame->instr_ptr->op.code]); + // The counter will cycle around in 2**64 executions :-) + int optimized = _PyOptimizer_Anywhere(frame, frame->instr_ptr, stack_pointer); + if (optimized < 0) { + goto error_tier_two; + } + if (optimized) { + DPRINTF(2, "--> Optimized %s @ %d in %p\n", + _PyUopName(uopcode), pc, current_executor); + DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[frame->instr_ptr->op.code]); + } + else { + DPRINTF(2, "--> Failed to optimize %s @ %d in %p\n", + _PyUopName(uopcode), pc, current_executor); + } } Py_DECREF(current_executor); // Fall through From 649581cfb50fd544062f641d0bdb9bab2f3cfb04 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 1 Dec 2023 17:50:01 +0900 Subject: [PATCH 10/37] Fix Uop -> UOp --- Python/ceval.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c 
index a808fa7b0cd32c..ea236c3b641fe6 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1107,7 +1107,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int frame->instr_ptr->op.code == POP_JUMP_IF_NOT_NONE)) { DPRINTF(2, "--> %s @ %d in %p has %d side exits\n", - _PyUopName(uopcode), pc, current_executor, (int)(*pcounter)); + _PyUOpName(uopcode), pc, current_executor, (int)(*pcounter)); DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[frame->instr_ptr->op.code]); // The counter will cycle around in 2**64 executions :-) int optimized = _PyOptimizer_Anywhere(frame, frame->instr_ptr, stack_pointer); @@ -1116,12 +1116,12 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int } if (optimized) { DPRINTF(2, "--> Optimized %s @ %d in %p\n", - _PyUopName(uopcode), pc, current_executor); + _PyUOpName(uopcode), pc, current_executor); DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[frame->instr_ptr->op.code]); } else { DPRINTF(2, "--> Failed to optimize %s @ %d in %p\n", - _PyUopName(uopcode), pc, current_executor); + _PyUOpName(uopcode), pc, current_executor); } } Py_DECREF(current_executor); From 835bf13739c02740b29b6b5281c6486e88437633 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 1 Dec 2023 18:38:16 +0900 Subject: [PATCH 11/37] WIP --- Include/cpython/optimizer.h | 2 +- Python/ceval.c | 41 +++++++++++++++++++++++-------------- Python/optimizer.c | 9 +++++--- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index 80cde17c9be617..5d102a50646485 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -61,7 +61,7 @@ PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void); PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int offset); int _PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer); -int _PyOptimizer_Anywhere(struct 
_PyInterpreterFrame *frame, _Py_CODEUNIT *src, PyObject **stack_pointer); +int _PyOptimizer_Anywhere(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer); extern _PyOptimizerObject _PyOptimizer_Default; diff --git a/Python/ceval.c b/Python/ceval.c index ea236c3b641fe6..012322c5abaa6b 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1072,16 +1072,26 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int _PyOpcode_OpName[frame->instr_ptr->op.code]); OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); UOP_STAT_INC(uopcode, miss); - if (frame->instr_ptr->op.code == ENTER_EXECUTOR) { + _Py_CODEUNIT *src, *dest; + src = dest = frame->instr_ptr; + opcode = src->op.code; + oparg = src->op.arg; + while (opcode == EXTENDED_ARG) { + src++; + opcode = src->op.code; + oparg = (oparg << 8) | src->op.arg; + } + if (opcode == ENTER_EXECUTOR) { + // frame->instr_ptr --> dest --> EXTENDED_ARG + // src --> ENTER_EXECUTOR // Avoid recursing into the same executor over and over - next_instr = frame->instr_ptr; + next_instr = dest; PyCodeObject *code = _PyFrame_GetCode(frame); - oparg = next_instr->op.arg; - _PyExecutorObject *executor = (_PyExecutorObject *)code->co_executors->executors[oparg]; + _PyExecutorObject *executor = (_PyExecutorObject *)code->co_executors->executors[oparg & 0xff]; if (executor == (_PyExecutorObject *)current_executor) { opcode = _PyOpcode_Deopt[executor->vm_data.opcode]; - DPRINTF(1, "Avoiding ENTER_EXECUTOR %d in favor of underlying base opcode %s %d\n", - oparg, _PyOpcode_OpName[opcode], executor->vm_data.oparg); + DPRINTF(1, "Avoiding ENTER_EXECUTOR %d/%d in favor of underlying base opcode %s %d\n", + oparg, oparg & 0xff, _PyOpcode_OpName[opcode], (oparg & 0xffffff00) | executor->vm_data.oparg); DPRINTF(1, " for %s (%s:%d) at byte offset %d\n", PyUnicode_AsUTF8(code->co_qualname), @@ -1089,7 +1099,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame 
*frame, int code->co_firstlineno, 2 * (int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame)))); Py_DECREF(current_executor); - oparg = executor->vm_data.oparg; + oparg = (oparg & 0xffffff00) | executor->vm_data.oparg; PRE_DISPATCH_GOTO(); DISPATCH_GOTO(); } @@ -1101,23 +1111,23 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int uintptr_t *pcounter = current_executor->extra + pc; *pcounter += 1; if (*pcounter == 16 && - (frame->instr_ptr->op.code == POP_JUMP_IF_FALSE || - frame->instr_ptr->op.code == POP_JUMP_IF_TRUE || - frame->instr_ptr->op.code == POP_JUMP_IF_NONE || - frame->instr_ptr->op.code == POP_JUMP_IF_NOT_NONE)) + (opcode == POP_JUMP_IF_FALSE || + opcode == POP_JUMP_IF_TRUE || + opcode == POP_JUMP_IF_NONE || + opcode == POP_JUMP_IF_NOT_NONE)) { DPRINTF(2, "--> %s @ %d in %p has %d side exits\n", _PyUOpName(uopcode), pc, current_executor, (int)(*pcounter)); - DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[frame->instr_ptr->op.code]); + DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[opcode]); // The counter will cycle around in 2**64 executions :-) - int optimized = _PyOptimizer_Anywhere(frame, frame->instr_ptr, stack_pointer); + int optimized = _PyOptimizer_Anywhere(frame, src, dest, stack_pointer); if (optimized < 0) { goto error_tier_two; } if (optimized) { - DPRINTF(2, "--> Optimized %s @ %d in %p\n", + DPRINTF(1, "--> Optimized %s @ %d in %p\n", _PyUOpName(uopcode), pc, current_executor); - DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[frame->instr_ptr->op.code]); + DPRINTF(1, " T1: %s\n", _PyOpcode_OpName[src->op.code]); } else { DPRINTF(2, "--> Failed to optimize %s @ %d in %p\n", @@ -1139,6 +1149,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); goto enter_tier_one; } + #if defined(__GNUC__) # pragma GCC diagnostic pop #elif defined(_MSC_VER) /* MS_WINDOWS */ diff --git a/Python/optimizer.c b/Python/optimizer.c index 
5817e4bf5cc221..c087255f8babfb 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -197,9 +197,12 @@ _PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNI return optimizer_wherever(frame, src, dest, stack_pointer); } -// Start tracing at src and insert ENTER_EXECUTOR at the same place +// Start tracing and insert ENTER_EXECUTOR at the same place. +// Normally src == dest, but when there's an EXTENDED_ARG involved, +// dest points at the preceding EXTENDED_ARG. +// Do not use at JUMP_BACKWARD. Won't replace ENTER_EXECUTOR. int -_PyOptimizer_Anywhere(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, PyObject **stack_pointer) +_PyOptimizer_Anywhere(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer) { if (src->op.code == JUMP_BACKWARD) { return 0; @@ -207,7 +210,7 @@ _PyOptimizer_Anywhere(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, PyObject ** if (src->op.code == ENTER_EXECUTOR) { return 0; } - return optimizer_wherever(frame, src, src, stack_pointer); + return optimizer_wherever(frame, src, dest, stack_pointer); } _PyExecutorObject * From 256b156d411e089de0f21a0ab8aafdd247883371 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Dec 2023 19:00:39 +0900 Subject: [PATCH 12/37] Fix where next_instr points upon E_E avoidance --- Python/ceval.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/ceval.c b/Python/ceval.c index 012322c5abaa6b..9a6199fa4899d3 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1085,7 +1085,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int // frame->instr_ptr --> dest --> EXTENDED_ARG // src --> ENTER_EXECUTOR // Avoid recursing into the same executor over and over - next_instr = dest; + next_instr = src; PyCodeObject *code = _PyFrame_GetCode(frame); _PyExecutorObject *executor = (_PyExecutorObject *)code->co_executors->executors[oparg & 0xff]; if (executor == (_PyExecutorObject *)current_executor) { 
From 75c7c3265a99b64fa5f044be6bbfaaaeda082319 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Dec 2023 19:22:56 +0900 Subject: [PATCH 13/37] Allow executors with oparg >= 256 --- Python/bytecodes.c | 3 +-- Python/generated_cases.c.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 790f81797e2784..1ae83422730f8f 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2311,8 +2311,7 @@ dummy_func( uint16_t ucounter = this_instr[1].cache + (1 << 15); uint16_t threshold = tstate->interp->optimizer_backedge_threshold + (1 << 15); // Double-check that the opcode isn't instrumented or something: - // Also bail if extended oparg (>= 256) - if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD && oparg < 256) { + if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) { OPT_STAT_INC(attempts); int optimized = _PyOptimizer_BackEdge(frame, this_instr, next_instr, stack_pointer); ERROR_IF(optimized < 0, error); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 6d51f21d83a64e..65e6f11f68b38c 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -3223,8 +3223,7 @@ uint16_t ucounter = this_instr[1].cache + (1 << 15); uint16_t threshold = tstate->interp->optimizer_backedge_threshold + (1 << 15); // Double-check that the opcode isn't instrumented or something: - // Also bail if extended oparg (>= 256) - if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD && oparg < 256) { + if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) { OPT_STAT_INC(attempts); int optimized = _PyOptimizer_BackEdge(frame, this_instr, next_instr, stack_pointer); if (optimized < 0) goto error; From a94c7f1f7a7b396d378bd93729dd422c6c83dba3 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 2 Dec 2023 19:48:26 +0900 Subject: [PATCH 14/37] Don't try to optimize with default optimizer --- Python/ceval.c | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/Python/ceval.c b/Python/ceval.c index 9a6199fa4899d3..1c90011462e6ee 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1110,7 +1110,8 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int int pc = next_uop - 1 - current_executor->trace; uintptr_t *pcounter = current_executor->extra + pc; *pcounter += 1; - if (*pcounter == 16 && + if (*pcounter == 16 && // TODO: use resume_threshold + tstate->interp->optimizer != &_PyOptimizer_Default && (opcode == POP_JUMP_IF_FALSE || opcode == POP_JUMP_IF_TRUE || opcode == POP_JUMP_IF_NONE || From 747a3f07ef1624161ce7b56fea98a9a644e8bae4 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Mon, 11 Dec 2023 14:22:49 -0800 Subject: [PATCH 15/37] Use separate 'counters' and 'executors' arrays (The latter as yet unused.) --- Include/internal/pycore_uops.h | 4 +++- Python/ceval.c | 4 ++-- Python/optimizer.c | 8 +++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/Include/internal/pycore_uops.h b/Include/internal/pycore_uops.h index d587114bc556af..b92b2678bfa913 100644 --- a/Include/internal/pycore_uops.h +++ b/Include/internal/pycore_uops.h @@ -21,7 +21,9 @@ typedef struct { typedef struct { _PyExecutorObject base; - uintptr_t *extra; // An array of uintptr_t of size base.ob_base.ob_size + // Auxiliary arrays, allocated after trace[base.ob_size] + uint16_t *counters; // An array of counters + _PyExecutorObject **executors; // An array of executors _PyUOpInstruction trace[1]; } _PyUOpExecutorObject; diff --git a/Python/ceval.c b/Python/ceval.c index 1c90011462e6ee..9d2d1ea57fc053 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1108,7 +1108,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int _PyFrame_SetStackPointer(frame, stack_pointer); // Increment side exit counter for this uop int pc = next_uop - 1 - current_executor->trace; - uintptr_t *pcounter = current_executor->extra + pc; + uint16_t *pcounter = 
current_executor->counters + pc; *pcounter += 1; if (*pcounter == 16 && // TODO: use resume_threshold tstate->interp->optimizer != &_PyOptimizer_Default && @@ -1120,7 +1120,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int DPRINTF(2, "--> %s @ %d in %p has %d side exits\n", _PyUOpName(uopcode), pc, current_executor, (int)(*pcounter)); DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[opcode]); - // The counter will cycle around in 2**64 executions :-) + // The counter will cycle around once the 16 bits overflow int optimized = _PyOptimizer_Anywhere(frame, src, dest, stack_pointer); if (optimized < 0) { goto error_tier_two; diff --git a/Python/optimizer.c b/Python/optimizer.c index c087255f8babfb..cda231ea4099bf 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -403,7 +403,7 @@ PyTypeObject _PyUOpExecutor_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) .tp_name = "uop_executor", .tp_basicsize = sizeof(_PyUOpExecutorObject) - sizeof(_PyUOpInstruction), - .tp_itemsize = sizeof(_PyUOpInstruction) + sizeof(uintptr_t), + .tp_itemsize = sizeof(_PyUOpInstruction) + sizeof(uint16_t) + sizeof(_PyExecutorObject *), .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION, .tp_dealloc = (destructor)uop_dealloc, .tp_as_sequence = &uop_as_sequence, @@ -849,8 +849,10 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies) if (executor == NULL) { return NULL; } - executor->extra = (uintptr_t *)(executor->trace + length); - memset(executor->extra, 0, sizeof(uintptr_t) * length); + executor->counters = (uint16_t *)(&executor->trace[length]); + memset(executor->counters, 0, sizeof(uint16_t) * length); + executor->executors = (_PyExecutorObject **)(&executor->counters[length]); + memset(executor->executors, 0, sizeof(_PyExecutorObject *) * length); int dest = length - 1; /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. 
*/ for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) { From 682cf5a46b90184bae3c6a9853a35097d01dd37e Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Mon, 11 Dec 2023 21:48:19 -0800 Subject: [PATCH 16/37] Jump directly to side-exit executors --- Include/cpython/optimizer.h | 2 +- Python/ceval.c | 35 +++++++++++++++++++++++++++++++++-- Python/optimizer.c | 37 ++++++++++++++++++------------------- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index 5d102a50646485..ee0823d1533f2d 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -61,7 +61,7 @@ PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void); PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int offset); int _PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer); -int _PyOptimizer_Anywhere(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer); +int _PyOptimizer_Unanchored(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _PyExecutorObject **pexecutor, PyObject **stack_pointer); extern _PyOptimizerObject _PyOptimizer_Default; diff --git a/Python/ceval.c b/Python/ceval.c index 9d2d1ea57fc053..4a690e3af06535 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1108,9 +1108,22 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int _PyFrame_SetStackPointer(frame, stack_pointer); // Increment side exit counter for this uop int pc = next_uop - 1 - current_executor->trace; + _PyExecutorObject **pexecutor = current_executor->executors + pc; + if (*pexecutor != NULL) { + PyCodeObject *code = _PyFrame_GetCode(frame); + DPRINTF(2, "Jumping to new executor for %s (%s:%d) at byte offset %d\n", + PyUnicode_AsUTF8(code->co_qualname), + PyUnicode_AsUTF8(code->co_filename), + code->co_firstlineno, + 2 * (int)(frame->instr_ptr - 
_PyCode_CODE(_PyFrame_GetCode(frame)))); + Py_DECREF(current_executor); + current_executor = (_PyUOpExecutorObject *)*pexecutor; + Py_INCREF(current_executor); + goto enter_tier_two; + } uint16_t *pcounter = current_executor->counters + pc; *pcounter += 1; - if (*pcounter == 16 && // TODO: use resume_threshold + if (*pcounter == 32 && // TODO: use resume_threshold tstate->interp->optimizer != &_PyOptimizer_Default && (opcode == POP_JUMP_IF_FALSE || opcode == POP_JUMP_IF_TRUE || @@ -1121,7 +1134,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int _PyUOpName(uopcode), pc, current_executor, (int)(*pcounter)); DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[opcode]); // The counter will cycle around once the 16 bits overflow - int optimized = _PyOptimizer_Anywhere(frame, src, dest, stack_pointer); + int optimized = _PyOptimizer_Unanchored(frame, dest, pexecutor, stack_pointer); if (optimized < 0) { goto error_tier_two; } @@ -1129,6 +1142,24 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int DPRINTF(1, "--> Optimized %s @ %d in %p\n", _PyUOpName(uopcode), pc, current_executor); DPRINTF(1, " T1: %s\n", _PyOpcode_OpName[src->op.code]); + PyCodeObject *code = _PyFrame_GetCode(frame); + DPRINTF(2, "Jumping to fresh executor for %s (%s:%d) at byte offset %d\n", + PyUnicode_AsUTF8(code->co_qualname), + PyUnicode_AsUTF8(code->co_filename), + code->co_firstlineno, + 2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame)))); + Py_DECREF(current_executor); + current_executor = (_PyUOpExecutorObject *)*pexecutor; + if (current_executor->trace[0].opcode != uopcode) { + Py_INCREF(current_executor); + goto enter_tier_two; + } + // This is guaranteed to deopt again; forget about it + DPRINTF(2, "It's not an improvement -- discarding trace\n"); + *pexecutor = NULL; + Py_DECREF(current_executor); + next_instr = frame->instr_ptr; + goto resume_frame; } else { DPRINTF(2, "--> Failed to optimize %s @ %d in %p\n", diff 
--git a/Python/optimizer.c b/Python/optimizer.c index cda231ea4099bf..fb11465788043f 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -155,11 +155,10 @@ PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer) Py_DECREF(old); } -// src is where to insert ENTER_EXECUTOR -// dest is where to start tracing -static int -optimizer_wherever(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer) +int +_PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer) { + assert(src->op.code == JUMP_BACKWARD); PyCodeObject *code = _PyFrame_GetCode(frame); assert(PyCode_Check(code)); PyInterpreterState *interp = _PyInterpreterState_GET(); @@ -190,27 +189,27 @@ optimizer_wherever(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT * return 1; } +// Return an unanchored executor. The caller owns the executor when returning 1. +// No ENTER_EXECUTOR is inserted, nor is the executor added to the code object. int -_PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer) -{ - assert(src->op.code == JUMP_BACKWARD); - return optimizer_wherever(frame, src, dest, stack_pointer); -} - -// Start tracing and insert ENTER_EXECUTOR at the same place. -// Normally src == dest, but when there's an EXTENDED_ARG involved, -// dest points at the preceding EXTENDED_ARG. -// Do not use at JUMP_BACKWARD. Won't replace ENTER_EXECUTOR. 
-int -_PyOptimizer_Anywhere(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer) +_PyOptimizer_Unanchored( + _PyInterpreterFrame *frame, + _Py_CODEUNIT *instr, + _PyExecutorObject **pexecutor, + PyObject **stack_pointer) { - if (src->op.code == JUMP_BACKWARD) { + if (instr->op.code == JUMP_BACKWARD || instr->op.code == ENTER_EXECUTOR) { return 0; } - if (src->op.code == ENTER_EXECUTOR) { + PyCodeObject *code = _PyFrame_GetCode(frame); + assert(PyCode_Check(code)); + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyOptimizerObject *opt = interp->optimizer; + if (strcmp(opt->ob_base.ob_type->tp_name, "uop_optimizer") != 0) { return 0; } - return optimizer_wherever(frame, src, dest, stack_pointer); + *pexecutor = NULL; + return opt->optimize(opt, code, instr, pexecutor, (int)(stack_pointer - _PyFrame_Stackbase(frame))); } _PyExecutorObject * From 359c6fc379be15e2a6ddb975cec92733a843b8cf Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 12 Dec 2023 17:18:49 -0800 Subject: [PATCH 17/37] Remove progress check; clean up the rest a bit --- Python/ceval.c | 93 +++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 55 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 4a690e3af06535..ea1090b31fafa5 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -752,7 +752,10 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int goto exit_unwind; } +// Jump here from ENTER_EXECUTOR, and code under the deoptimize label +enter_tier_one: next_instr = frame->instr_ptr; + resume_frame: stack_pointer = _PyFrame_GetStackPointer(frame); @@ -1063,8 +1066,9 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int // Jump here from DEOPT_IF() deoptimize: - // On DEOPT_IF we just repeat the last instruction. + // On DEOPT_IF we must repeat the last instruction. // This presumes nothing was popped from the stack (nor pushed). 
+ // There are some other things to take care of first, though. frame->instr_ptr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); DPRINTF(2, "DEOPT: [UOp %d (%s), oparg %d, operand %" PRIu64 ", target %d @ %d -> %s]\n", uopcode, _PyUOpName(uopcode), next_uop[-1].oparg, next_uop[-1].operand, next_uop[-1].target, @@ -1072,41 +1076,10 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int _PyOpcode_OpName[frame->instr_ptr->op.code]); OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); UOP_STAT_INC(uopcode, miss); - _Py_CODEUNIT *src, *dest; - src = dest = frame->instr_ptr; - opcode = src->op.code; - oparg = src->op.arg; - while (opcode == EXTENDED_ARG) { - src++; - opcode = src->op.code; - oparg = (oparg << 8) | src->op.arg; - } - if (opcode == ENTER_EXECUTOR) { - // frame->instr_ptr --> dest --> EXTENDED_ARG - // src --> ENTER_EXECUTOR - // Avoid recursing into the same executor over and over - next_instr = src; - PyCodeObject *code = _PyFrame_GetCode(frame); - _PyExecutorObject *executor = (_PyExecutorObject *)code->co_executors->executors[oparg & 0xff]; - if (executor == (_PyExecutorObject *)current_executor) { - opcode = _PyOpcode_Deopt[executor->vm_data.opcode]; - DPRINTF(1, "Avoiding ENTER_EXECUTOR %d/%d in favor of underlying base opcode %s %d\n", - oparg, oparg & 0xff, _PyOpcode_OpName[opcode], (oparg & 0xffffff00) | executor->vm_data.oparg); - DPRINTF(1, - " for %s (%s:%d) at byte offset %d\n", - PyUnicode_AsUTF8(code->co_qualname), - PyUnicode_AsUTF8(code->co_filename), - code->co_firstlineno, - 2 * (int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame)))); - Py_DECREF(current_executor); - oparg = (oparg & 0xffffff00) | executor->vm_data.oparg; - PRE_DISPATCH_GOTO(); - DISPATCH_GOTO(); - } - } frame->return_offset = 0; // Don't leave this random _PyFrame_SetStackPointer(frame, stack_pointer); - // Increment side exit counter for this uop + + // Check if there is a side-exit executor here already. 
int pc = next_uop - 1 - current_executor->trace; _PyExecutorObject **pexecutor = current_executor->executors + pc; if (*pexecutor != NULL) { @@ -1121,14 +1094,30 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int Py_INCREF(current_executor); goto enter_tier_two; } + + // Increment and check side exit counter. uint16_t *pcounter = current_executor->counters + pc; *pcounter += 1; - if (*pcounter == 32 && // TODO: use resume_threshold - tstate->interp->optimizer != &_PyOptimizer_Default && - (opcode == POP_JUMP_IF_FALSE || - opcode == POP_JUMP_IF_TRUE || - opcode == POP_JUMP_IF_NONE || - opcode == POP_JUMP_IF_NOT_NONE)) + if (*pcounter != 32 || // TODO: use resume_threshold + tstate->interp->optimizer == &_PyOptimizer_Default) + { + goto enter_tier_one; + } + + // Decode instruction to look past EXTENDED_ARG. + _Py_CODEUNIT *src, *dest; + src = dest = frame->instr_ptr; + opcode = src->op.code; + if (opcode == EXTENDED_ARG) { + src++; + opcode = src->op.code; + } + + // For selected opcodes build a new executor and enter it now. 
+ if (opcode == POP_JUMP_IF_FALSE || + opcode == POP_JUMP_IF_TRUE || + opcode == POP_JUMP_IF_NONE || + opcode == POP_JUMP_IF_NOT_NONE) { DPRINTF(2, "--> %s @ %d in %p has %d side exits\n", _PyUOpName(uopcode), pc, current_executor, (int)(*pcounter)); @@ -1138,7 +1127,11 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int if (optimized < 0) { goto error_tier_two; } - if (optimized) { + if (!optimized) { + DPRINTF(2, "--> Failed to optimize %s @ %d in %p\n", + _PyUOpName(uopcode), pc, current_executor); + } + else { DPRINTF(1, "--> Optimized %s @ %d in %p\n", _PyUOpName(uopcode), pc, current_executor); DPRINTF(1, " T1: %s\n", _PyOpcode_OpName[src->op.code]); @@ -1150,28 +1143,18 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int 2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame)))); Py_DECREF(current_executor); current_executor = (_PyUOpExecutorObject *)*pexecutor; + // TODO: Check at least two uops: _IS_NONE, _POP_JUMP_IF_TRUE/FALSE. if (current_executor->trace[0].opcode != uopcode) { Py_INCREF(current_executor); - goto enter_tier_two; + goto enter_tier_two; // Yes! 
} // This is guaranteed to deopt again; forget about it - DPRINTF(2, "It's not an improvement -- discarding trace\n"); + DPRINTF(2, "Alas, it's the same uop again -- discarding trace\n"); *pexecutor = NULL; - Py_DECREF(current_executor); - next_instr = frame->instr_ptr; - goto resume_frame; - } - else { - DPRINTF(2, "--> Failed to optimize %s @ %d in %p\n", - _PyUOpName(uopcode), pc, current_executor); } } Py_DECREF(current_executor); - // Fall through -// Jump here from ENTER_EXECUTOR -enter_tier_one: - next_instr = frame->instr_ptr; - goto resume_frame; + goto enter_tier_one; // Jump here from _EXIT_TRACE exit_trace: From ca6ed3a82f47e9d9c12ee157a26103b7a3177922 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 12 Dec 2023 17:22:55 -0800 Subject: [PATCH 18/37] Ensure array of executor pointers is 64-bit aligned --- Python/optimizer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index fb11465788043f..83b3e03300159b 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -848,10 +848,10 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies) if (executor == NULL) { return NULL; } - executor->counters = (uint16_t *)(&executor->trace[length]); - memset(executor->counters, 0, sizeof(uint16_t) * length); - executor->executors = (_PyExecutorObject **)(&executor->counters[length]); + executor->executors = (_PyExecutorObject **)(&executor->trace[length]); + executor->counters = (uint16_t *)(&executor->executors[length]); memset(executor->executors, 0, sizeof(_PyExecutorObject *) * length); + memset(executor->counters, 0, sizeof(uint16_t) * length); int dest = length - 1; /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. 
*/ for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) { From e2a26b59c7060d48e73036771b41f4684be9e924 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 12 Dec 2023 18:36:04 -0800 Subject: [PATCH 19/37] Check at least two uops; further cleanup --- Python/ceval.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index ea1090b31fafa5..9b8bfe847276a8 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -752,7 +752,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int goto exit_unwind; } -// Jump here from ENTER_EXECUTOR, and code under the deoptimize label +// Jump here from ENTER_EXECUTOR and exit_trace. enter_tier_one: next_instr = frame->instr_ptr; @@ -1083,12 +1083,14 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int int pc = next_uop - 1 - current_executor->trace; _PyExecutorObject **pexecutor = current_executor->executors + pc; if (*pexecutor != NULL) { +#ifdef Py_DEBUG PyCodeObject *code = _PyFrame_GetCode(frame); DPRINTF(2, "Jumping to new executor for %s (%s:%d) at byte offset %d\n", PyUnicode_AsUTF8(code->co_qualname), PyUnicode_AsUTF8(code->co_filename), code->co_firstlineno, 2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame)))); +#endif Py_DECREF(current_executor); current_executor = (_PyUOpExecutorObject *)*pexecutor; Py_INCREF(current_executor); @@ -1096,21 +1098,19 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int } // Increment and check side exit counter. + next_instr = frame->instr_ptr; uint16_t *pcounter = current_executor->counters + pc; *pcounter += 1; if (*pcounter != 32 || // TODO: use resume_threshold tstate->interp->optimizer == &_PyOptimizer_Default) { - goto enter_tier_one; + goto resume_frame; } // Decode instruction to look past EXTENDED_ARG. 
- _Py_CODEUNIT *src, *dest; - src = dest = frame->instr_ptr; - opcode = src->op.code; + opcode = next_instr[0].op.code; if (opcode == EXTENDED_ARG) { - src++; - opcode = src->op.code; + opcode = next_instr[1].op.code; } // For selected opcodes build a new executor and enter it now. @@ -1122,39 +1122,49 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int DPRINTF(2, "--> %s @ %d in %p has %d side exits\n", _PyUOpName(uopcode), pc, current_executor, (int)(*pcounter)); DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[opcode]); - // The counter will cycle around once the 16 bits overflow - int optimized = _PyOptimizer_Unanchored(frame, dest, pexecutor, stack_pointer); + + int optimized = _PyOptimizer_Unanchored(frame, next_instr, pexecutor, stack_pointer); if (optimized < 0) { goto error_tier_two; } + if (!optimized) { DPRINTF(2, "--> Failed to optimize %s @ %d in %p\n", _PyUOpName(uopcode), pc, current_executor); } else { +#ifdef Py_DEBUG DPRINTF(1, "--> Optimized %s @ %d in %p\n", _PyUOpName(uopcode), pc, current_executor); - DPRINTF(1, " T1: %s\n", _PyOpcode_OpName[src->op.code]); PyCodeObject *code = _PyFrame_GetCode(frame); DPRINTF(2, "Jumping to fresh executor for %s (%s:%d) at byte offset %d\n", PyUnicode_AsUTF8(code->co_qualname), PyUnicode_AsUTF8(code->co_filename), code->co_firstlineno, 2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame)))); +#endif Py_DECREF(current_executor); current_executor = (_PyUOpExecutorObject *)*pexecutor; - // TODO: Check at least two uops: _IS_NONE, _POP_JUMP_IF_TRUE/FALSE. - if (current_executor->trace[0].opcode != uopcode) { + + // Reject trace if it repeats the uop that just deoptimized. + int jump_opcode = current_executor->trace[0].opcode; + if (jump_opcode == _IS_NONE) { + jump_opcode = current_executor->trace[1].opcode; + } + if (jump_opcode != uopcode) { Py_INCREF(current_executor); - goto enter_tier_two; // Yes! + goto enter_tier_two; // All systems go! 
} - // This is guaranteed to deopt again; forget about it - DPRINTF(2, "Alas, it's the same uop again -- discarding trace\n"); + + // The trace is guaranteed to deopt again; forget about it. + DPRINTF(2, "Alas, it's the same uop again (%s) -- discarding trace\n", + _PyUOpName(jump_opcode)); *pexecutor = NULL; + // It will be decref'ed below. } } Py_DECREF(current_executor); - goto enter_tier_one; + goto resume_frame; // Jump here from _EXIT_TRACE exit_trace: From 38c7aab2a5e801cc695eb48fee09fe7cfb32bc48 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 12 Dec 2023 18:37:04 -0800 Subject: [PATCH 20/37] Move exit_trace up, since it is smaller --- Python/ceval.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 9b8bfe847276a8..bff14acdfc9f75 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1064,6 +1064,14 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int Py_DECREF(current_executor); goto resume_with_error; +// Jump here from _EXIT_TRACE +exit_trace: + _PyFrame_SetStackPointer(frame, stack_pointer); + frame->instr_ptr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); + Py_DECREF(current_executor); + OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); + goto enter_tier_one; + // Jump here from DEOPT_IF() deoptimize: // On DEOPT_IF we must repeat the last instruction. 
@@ -1165,14 +1173,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int } Py_DECREF(current_executor); goto resume_frame; - -// Jump here from _EXIT_TRACE -exit_trace: - _PyFrame_SetStackPointer(frame, stack_pointer); - frame->instr_ptr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); - Py_DECREF(current_executor); - OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); - goto enter_tier_one; } #if defined(__GNUC__) From 0f64231b80615aa81ac81967baecae364e29cbc0 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Tue, 12 Dec 2023 18:55:46 -0800 Subject: [PATCH 21/37] Use configured threshold and exp. backoff for counter --- Python/ceval.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index bff14acdfc9f75..75ab7452ffa5a3 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1106,11 +1106,15 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int } // Increment and check side exit counter. + // (Even though we only need it for certain opcodes.) next_instr = frame->instr_ptr; uint16_t *pcounter = current_executor->counters + pc; - *pcounter += 1; - if (*pcounter != 32 || // TODO: use resume_threshold - tstate->interp->optimizer == &_PyOptimizer_Default) + *pcounter += 1 << OPTIMIZER_BITS_IN_COUNTER; + /* We are using unsigned values, but we really want signed values, so + * do the 2s complement comparison manually */ + uint16_t ucounter = *pcounter + (1 << 15); + uint16_t threshold = tstate->interp->optimizer_resume_threshold + (1 << 15); + if (ucounter <= threshold) { goto resume_frame; } @@ -1160,6 +1164,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int jump_opcode = current_executor->trace[1].opcode; } if (jump_opcode != uopcode) { + *pcounter &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); Py_INCREF(current_executor); goto enter_tier_two; // All systems go! 
} @@ -1171,6 +1176,18 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int // It will be decref'ed below. } } + + // Exponential backoff if we didn't optimize. + int backoff = *pcounter & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); + if (backoff < MINIMUM_TIER2_BACKOFF) { + backoff = MINIMUM_TIER2_BACKOFF; + } + else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) { + backoff++; + } + assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER); + *pcounter = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff; + Py_DECREF(current_executor); goto resume_frame; } From 83297dfd4d63fa68fe57ca74d5a6b26fdd54d29c Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 2023 13:18:13 -0800 Subject: [PATCH 22/37] Add API to access sub-executors ...and set resume_threshold so they are actually produced. --- Python/optimizer.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 83b3e03300159b..14a9ee5c173d63 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -398,6 +398,32 @@ PySequenceMethods uop_as_sequence = { .sq_item = (ssizeargfunc)uop_item, }; +static PyObject * +sub_executors(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + _PyUOpExecutorObject *executor = (_PyUOpExecutorObject *)self; + Py_ssize_t len = uop_len(executor); + PyObject *list = PyList_New(len); + if (list == NULL) { + return NULL; + } + for (Py_ssize_t i = 0; i < len; i++) { + PyObject *sub = (PyObject *)executor->executors[i]; + if (sub == NULL) { + sub = Py_None; + } + Py_INCREF(sub); + PyList_SET_ITEM(list, i, (PyObject *)sub); + } + return list; +} + +static PyMethodDef uop_executor_methods[] = { + { "is_valid", is_valid, METH_NOARGS, NULL }, + { "sub_executors", sub_executors, METH_NOARGS, NULL }, + { NULL, NULL }, +}; + PyTypeObject _PyUOpExecutor_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) .tp_name = "uop_executor", @@ -406,7 +432,7 @@ 
PyTypeObject _PyUOpExecutor_Type = { .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION, .tp_dealloc = (destructor)uop_dealloc, .tp_as_sequence = &uop_as_sequence, - .tp_methods = executor_methods, + .tp_methods = uop_executor_methods, }; /* TO DO -- Generate these tables */ @@ -960,9 +986,8 @@ PyUnstable_Optimizer_NewUOpOptimizer(void) return NULL; } opt->optimize = uop_optimize; - opt->resume_threshold = INT16_MAX; - // Need at least 3 iterations to settle specializations. - // A few lower bits of the counter are reserved for other flags. + // The lower bits are reserved for exponential backoff. + opt->resume_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER; opt->backedge_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER; return (PyObject *)opt; } From d065a94abf1327063831b66057397299c9343b27 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 2023 13:28:04 -0800 Subject: [PATCH 23/37] Move optimizer/executor tests to new file test_capi/test_opt.py --- Lib/test/test_capi/test_misc.py | 535 ------------------------------- Lib/test/test_capi/test_opt.py | 544 ++++++++++++++++++++++++++++++++ 2 files changed, 544 insertions(+), 535 deletions(-) create mode 100644 Lib/test/test_capi/test_opt.py diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 776ee913a02216..123813b949fb7d 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -7,7 +7,6 @@ import importlib.machinery import importlib.util import json -import opcode import os import pickle import queue @@ -2483,540 +2482,6 @@ def func(): self.do_test(func, names) -@contextlib.contextmanager -def temporary_optimizer(opt): - old_opt = _testinternalcapi.get_optimizer() - _testinternalcapi.set_optimizer(opt) - try: - yield - finally: - _testinternalcapi.set_optimizer(old_opt) - - -@contextlib.contextmanager -def clear_executors(func): - # Clear executors in func before and after running a block - func.__code__ = func.__code__.replace() - 
try: - yield - finally: - func.__code__ = func.__code__.replace() - - -class TestOptimizerAPI(unittest.TestCase): - - def test_get_counter_optimizer_dealloc(self): - # See gh-108727 - def f(): - _testinternalcapi.get_counter_optimizer() - - f() - - def test_get_set_optimizer(self): - old = _testinternalcapi.get_optimizer() - opt = _testinternalcapi.get_counter_optimizer() - try: - _testinternalcapi.set_optimizer(opt) - self.assertEqual(_testinternalcapi.get_optimizer(), opt) - _testinternalcapi.set_optimizer(None) - self.assertEqual(_testinternalcapi.get_optimizer(), None) - finally: - _testinternalcapi.set_optimizer(old) - - - def test_counter_optimizer(self): - # Generate a new function at each call - ns = {} - exec(textwrap.dedent(""" - def loop(): - for _ in range(1000): - pass - """), ns, ns) - loop = ns['loop'] - - for repeat in range(5): - opt = _testinternalcapi.get_counter_optimizer() - with temporary_optimizer(opt): - self.assertEqual(opt.get_count(), 0) - with clear_executors(loop): - loop() - self.assertEqual(opt.get_count(), 1000) - - def test_long_loop(self): - "Check that we aren't confused by EXTENDED_ARG" - - # Generate a new function at each call - ns = {} - exec(textwrap.dedent(""" - def nop(): - pass - - def long_loop(): - for _ in range(10): - nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); - nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); - nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); - nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); - nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); - nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); - nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); - """), ns, ns) - long_loop = ns['long_loop'] - - opt = _testinternalcapi.get_counter_optimizer() - with temporary_optimizer(opt): - self.assertEqual(opt.get_count(), 0) - long_loop() - self.assertEqual(opt.get_count(), 10) - - def test_code_restore_for_ENTER_EXECUTOR(self): - def testfunc(x): - i = 0 - while 
i < x: - i += 1 - - opt = _testinternalcapi.get_counter_optimizer() - with temporary_optimizer(opt): - testfunc(1000) - code, replace_code = testfunc.__code__, testfunc.__code__.replace() - self.assertEqual(code, replace_code) - self.assertEqual(hash(code), hash(replace_code)) - - -def get_first_executor(func): - code = func.__code__ - co_code = code.co_code - JUMP_BACKWARD = opcode.opmap["JUMP_BACKWARD"] - for i in range(0, len(co_code), 2): - if co_code[i] == JUMP_BACKWARD: - try: - return _testinternalcapi.get_executor(code, i) - except ValueError: - pass - return None - - -class TestExecutorInvalidation(unittest.TestCase): - - def setUp(self): - self.old = _testinternalcapi.get_optimizer() - self.opt = _testinternalcapi.get_counter_optimizer() - _testinternalcapi.set_optimizer(self.opt) - - def tearDown(self): - _testinternalcapi.set_optimizer(self.old) - - def test_invalidate_object(self): - # Generate a new set of functions at each call - ns = {} - func_src = "\n".join( - f""" - def f{n}(): - for _ in range(1000): - pass - """ for n in range(5) - ) - exec(textwrap.dedent(func_src), ns, ns) - funcs = [ ns[f'f{n}'] for n in range(5)] - objects = [object() for _ in range(5)] - - for f in funcs: - f() - executors = [get_first_executor(f) for f in funcs] - # Set things up so each executor depends on the objects - # with an equal or lower index. - for i, exe in enumerate(executors): - self.assertTrue(exe.is_valid()) - for obj in objects[:i+1]: - _testinternalcapi.add_executor_dependency(exe, obj) - self.assertTrue(exe.is_valid()) - # Assert that the correct executors are invalidated - # and check that nothing crashes when we invalidate - # an executor mutliple times. 
- for i in (4,3,2,1,0): - _testinternalcapi.invalidate_executors(objects[i]) - for exe in executors[i:]: - self.assertFalse(exe.is_valid()) - for exe in executors[:i]: - self.assertTrue(exe.is_valid()) - - def test_uop_optimizer_invalidation(self): - # Generate a new function at each call - ns = {} - exec(textwrap.dedent(""" - def f(): - for i in range(1000): - pass - """), ns, ns) - f = ns['f'] - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - f() - exe = get_first_executor(f) - self.assertTrue(exe.is_valid()) - _testinternalcapi.invalidate_executors(f.__code__) - self.assertFalse(exe.is_valid()) - -class TestUops(unittest.TestCase): - - def test_basic_loop(self): - def testfunc(x): - i = 0 - while i < x: - i += 1 - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - testfunc(1000) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - uops = {opname for opname, _, _ in ex} - self.assertIn("_SET_IP", uops) - self.assertIn("LOAD_FAST", uops) - - def test_extended_arg(self): - "Check EXTENDED_ARG handling in superblock creation" - ns = {} - exec(textwrap.dedent(""" - def many_vars(): - # 260 vars, so z9 should have index 259 - a0 = a1 = a2 = a3 = a4 = a5 = a6 = a7 = a8 = a9 = 42 - b0 = b1 = b2 = b3 = b4 = b5 = b6 = b7 = b8 = b9 = 42 - c0 = c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = c9 = 42 - d0 = d1 = d2 = d3 = d4 = d5 = d6 = d7 = d8 = d9 = 42 - e0 = e1 = e2 = e3 = e4 = e5 = e6 = e7 = e8 = e9 = 42 - f0 = f1 = f2 = f3 = f4 = f5 = f6 = f7 = f8 = f9 = 42 - g0 = g1 = g2 = g3 = g4 = g5 = g6 = g7 = g8 = g9 = 42 - h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = h8 = h9 = 42 - i0 = i1 = i2 = i3 = i4 = i5 = i6 = i7 = i8 = i9 = 42 - j0 = j1 = j2 = j3 = j4 = j5 = j6 = j7 = j8 = j9 = 42 - k0 = k1 = k2 = k3 = k4 = k5 = k6 = k7 = k8 = k9 = 42 - l0 = l1 = l2 = l3 = l4 = l5 = l6 = l7 = l8 = l9 = 42 - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m8 = m9 = 42 - n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = n8 = n9 = 42 - o0 = o1 = o2 
= o3 = o4 = o5 = o6 = o7 = o8 = o9 = 42 - p0 = p1 = p2 = p3 = p4 = p5 = p6 = p7 = p8 = p9 = 42 - q0 = q1 = q2 = q3 = q4 = q5 = q6 = q7 = q8 = q9 = 42 - r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = r8 = r9 = 42 - s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7 = s8 = s9 = 42 - t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 42 - u0 = u1 = u2 = u3 = u4 = u5 = u6 = u7 = u8 = u9 = 42 - v0 = v1 = v2 = v3 = v4 = v5 = v6 = v7 = v8 = v9 = 42 - w0 = w1 = w2 = w3 = w4 = w5 = w6 = w7 = w8 = w9 = 42 - x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = x8 = x9 = 42 - y0 = y1 = y2 = y3 = y4 = y5 = y6 = y7 = y8 = y9 = 42 - z0 = z1 = z2 = z3 = z4 = z5 = z6 = z7 = z8 = z9 = 42 - while z9 > 0: - z9 = z9 - 1 - """), ns, ns) - many_vars = ns["many_vars"] - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - ex = get_first_executor(many_vars) - self.assertIsNone(ex) - many_vars() - - ex = get_first_executor(many_vars) - self.assertIsNotNone(ex) - self.assertIn(("LOAD_FAST", 259, 0), list(ex)) - - def test_unspecialized_unpack(self): - # An example of an unspecialized opcode - def testfunc(x): - i = 0 - while i < x: - i += 1 - a, b = {1: 2, 3: 3} - assert a == 1 and b == 3 - i = 0 - while i < x: - i += 1 - - opt = _testinternalcapi.get_uop_optimizer() - - with temporary_optimizer(opt): - testfunc(20) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - uops = {opname for opname, _, _ in ex} - self.assertIn("_UNPACK_SEQUENCE", uops) - - def test_pop_jump_if_false(self): - def testfunc(n): - i = 0 - while i < n: - i += 1 - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - testfunc(20) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - uops = {opname for opname, _, _ in ex} - self.assertIn("_GUARD_IS_TRUE_POP", uops) - - def test_pop_jump_if_none(self): - def testfunc(a): - for x in a: - if x is None: - x = 0 - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - testfunc(range(20)) - - ex = 
get_first_executor(testfunc) - self.assertIsNotNone(ex) - uops = {opname for opname, _, _ in ex} - self.assertIn("_GUARD_IS_NOT_NONE_POP", uops) - - def test_pop_jump_if_not_none(self): - def testfunc(a): - for x in a: - x = None - if x is not None: - x = 0 - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - testfunc(range(20)) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - uops = {opname for opname, _, _ in ex} - self.assertIn("_GUARD_IS_NONE_POP", uops) - - def test_pop_jump_if_true(self): - def testfunc(n): - i = 0 - while not i >= n: - i += 1 - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - testfunc(20) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - uops = {opname for opname, _, _ in ex} - self.assertIn("_GUARD_IS_FALSE_POP", uops) - - def test_jump_backward(self): - def testfunc(n): - i = 0 - while i < n: - i += 1 - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - testfunc(20) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - uops = {opname for opname, _, _ in ex} - self.assertIn("_JUMP_TO_TOP", uops) - - def test_jump_forward(self): - def testfunc(n): - a = 0 - while a < n: - if a < 0: - a = -a - else: - a = +a - a += 1 - return a - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - testfunc(20) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - uops = {opname for opname, _, _ in ex} - # Since there is no JUMP_FORWARD instruction, - # look for indirect evidence: the += operator - self.assertIn("_BINARY_OP_ADD_INT", uops) - - def test_for_iter_range(self): - def testfunc(n): - total = 0 - for i in range(n): - total += i - return total - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - total = testfunc(20) - self.assertEqual(total, 190) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - # for i, (opname, oparg) in 
enumerate(ex): - # print(f"{i:4d}: {opname:<20s} {oparg:3d}") - uops = {opname for opname, _, _ in ex} - self.assertIn("_GUARD_NOT_EXHAUSTED_RANGE", uops) - # Verification that the jump goes past END_FOR - # is done by manual inspection of the output - - def test_for_iter_list(self): - def testfunc(a): - total = 0 - for i in a: - total += i - return total - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - a = list(range(20)) - total = testfunc(a) - self.assertEqual(total, 190) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - # for i, (opname, oparg) in enumerate(ex): - # print(f"{i:4d}: {opname:<20s} {oparg:3d}") - uops = {opname for opname, _, _ in ex} - self.assertIn("_GUARD_NOT_EXHAUSTED_LIST", uops) - # Verification that the jump goes past END_FOR - # is done by manual inspection of the output - - def test_for_iter_tuple(self): - def testfunc(a): - total = 0 - for i in a: - total += i - return total - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - a = tuple(range(20)) - total = testfunc(a) - self.assertEqual(total, 190) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - # for i, (opname, oparg) in enumerate(ex): - # print(f"{i:4d}: {opname:<20s} {oparg:3d}") - uops = {opname for opname, _, _ in ex} - self.assertIn("_GUARD_NOT_EXHAUSTED_TUPLE", uops) - # Verification that the jump goes past END_FOR - # is done by manual inspection of the output - - def test_list_edge_case(self): - def testfunc(it): - for x in it: - pass - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - a = [1, 2, 3] - it = iter(a) - testfunc(it) - a.append(4) - with self.assertRaises(StopIteration): - next(it) - - def test_call_py_exact_args(self): - def testfunc(n): - def dummy(x): - return x+1 - for i in range(n): - dummy(i) - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - testfunc(20) - - ex = get_first_executor(testfunc) - 
self.assertIsNotNone(ex) - uops = {opname for opname, _, _ in ex} - self.assertIn("_PUSH_FRAME", uops) - self.assertIn("_BINARY_OP_ADD_INT", uops) - - def test_branch_taken(self): - def testfunc(n): - for i in range(n): - if i < 0: - i = 0 - else: - i = 1 - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - testfunc(20) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - uops = {opname for opname, _, _ in ex} - self.assertIn("_GUARD_IS_FALSE_POP", uops) - - def test_for_iter_tier_two(self): - class MyIter: - def __init__(self, n): - self.n = n - def __iter__(self): - return self - def __next__(self): - self.n -= 1 - if self.n < 0: - raise StopIteration - return self.n - - def testfunc(n, m): - x = 0 - for i in range(m): - for j in MyIter(n): - x += 1000*i + j - return x - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - x = testfunc(10, 10) - - self.assertEqual(x, sum(range(10)) * 10010) - - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - uops = {opname for opname, _, _ in ex} - self.assertIn("_FOR_ITER_TIER_TWO", uops) - - def test_confidence_score(self): - def testfunc(n): - bits = 0 - for i in range(n): - if i & 0x01: - bits += 1 - if i & 0x02: - bits += 1 - if i&0x04: - bits += 1 - if i&0x08: - bits += 1 - if i&0x10: - bits += 1 - if i&0x20: - bits += 1 - return bits - - opt = _testinternalcapi.get_uop_optimizer() - with temporary_optimizer(opt): - x = testfunc(20) - - self.assertEqual(x, 40) - ex = get_first_executor(testfunc) - self.assertIsNotNone(ex) - ops = [opname for opname, _, _ in ex] - count = ops.count("_GUARD_IS_TRUE_POP") - # Because Each 'if' halves the score, the second branch is - # too much already. 
- self.assertEqual(count, 1) - - @unittest.skipUnless(support.Py_GIL_DISABLED, 'need Py_GIL_DISABLED') class TestPyThreadId(unittest.TestCase): def test_py_thread_id(self): diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py new file mode 100644 index 00000000000000..9f4731103c9413 --- /dev/null +++ b/Lib/test/test_capi/test_opt.py @@ -0,0 +1,544 @@ +import contextlib +import opcode +import textwrap +import unittest + +import _testinternalcapi + + +@contextlib.contextmanager +def temporary_optimizer(opt): + old_opt = _testinternalcapi.get_optimizer() + _testinternalcapi.set_optimizer(opt) + try: + yield + finally: + _testinternalcapi.set_optimizer(old_opt) + + +@contextlib.contextmanager +def clear_executors(func): + # Clear executors in func before and after running a block + func.__code__ = func.__code__.replace() + try: + yield + finally: + func.__code__ = func.__code__.replace() + + +class TestOptimizerAPI(unittest.TestCase): + + def test_get_counter_optimizer_dealloc(self): + # See gh-108727 + def f(): + _testinternalcapi.get_counter_optimizer() + + f() + + def test_get_set_optimizer(self): + old = _testinternalcapi.get_optimizer() + opt = _testinternalcapi.get_counter_optimizer() + try: + _testinternalcapi.set_optimizer(opt) + self.assertEqual(_testinternalcapi.get_optimizer(), opt) + _testinternalcapi.set_optimizer(None) + self.assertEqual(_testinternalcapi.get_optimizer(), None) + finally: + _testinternalcapi.set_optimizer(old) + + + def test_counter_optimizer(self): + # Generate a new function at each call + ns = {} + exec(textwrap.dedent(""" + def loop(): + for _ in range(1000): + pass + """), ns, ns) + loop = ns['loop'] + + for repeat in range(5): + opt = _testinternalcapi.get_counter_optimizer() + with temporary_optimizer(opt): + self.assertEqual(opt.get_count(), 0) + with clear_executors(loop): + loop() + self.assertEqual(opt.get_count(), 1000) + + def test_long_loop(self): + "Check that we aren't confused by EXTENDED_ARG" + + 
# Generate a new function at each call + ns = {} + exec(textwrap.dedent(""" + def nop(): + pass + + def long_loop(): + for _ in range(10): + nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); + nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); + nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); + nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); + nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); + nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); + nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop(); + """), ns, ns) + long_loop = ns['long_loop'] + + opt = _testinternalcapi.get_counter_optimizer() + with temporary_optimizer(opt): + self.assertEqual(opt.get_count(), 0) + long_loop() + self.assertEqual(opt.get_count(), 10) + + def test_code_restore_for_ENTER_EXECUTOR(self): + def testfunc(x): + i = 0 + while i < x: + i += 1 + + opt = _testinternalcapi.get_counter_optimizer() + with temporary_optimizer(opt): + testfunc(1000) + code, replace_code = testfunc.__code__, testfunc.__code__.replace() + self.assertEqual(code, replace_code) + self.assertEqual(hash(code), hash(replace_code)) + + +def get_first_executor(func): + code = func.__code__ + co_code = code.co_code + JUMP_BACKWARD = opcode.opmap["JUMP_BACKWARD"] + for i in range(0, len(co_code), 2): + if co_code[i] == JUMP_BACKWARD: + try: + return _testinternalcapi.get_executor(code, i) + except ValueError: + pass + return None + + +class TestExecutorInvalidation(unittest.TestCase): + + def setUp(self): + self.old = _testinternalcapi.get_optimizer() + self.opt = _testinternalcapi.get_counter_optimizer() + _testinternalcapi.set_optimizer(self.opt) + + def tearDown(self): + _testinternalcapi.set_optimizer(self.old) + + def test_invalidate_object(self): + # Generate a new set of functions at each call + ns = {} + func_src = "\n".join( + f""" + def f{n}(): + for _ in range(1000): + pass + """ for n in range(5) + ) + exec(textwrap.dedent(func_src), ns, ns) + funcs = [ ns[f'f{n}'] for n 
in range(5)] + objects = [object() for _ in range(5)] + + for f in funcs: + f() + executors = [get_first_executor(f) for f in funcs] + # Set things up so each executor depends on the objects + # with an equal or lower index. + for i, exe in enumerate(executors): + self.assertTrue(exe.is_valid()) + for obj in objects[:i+1]: + _testinternalcapi.add_executor_dependency(exe, obj) + self.assertTrue(exe.is_valid()) + # Assert that the correct executors are invalidated + # and check that nothing crashes when we invalidate + # an executor multiple times. + for i in (4,3,2,1,0): + _testinternalcapi.invalidate_executors(objects[i]) + for exe in executors[i:]: + self.assertFalse(exe.is_valid()) + for exe in executors[:i]: + self.assertTrue(exe.is_valid()) + + def test_uop_optimizer_invalidation(self): + # Generate a new function at each call + ns = {} + exec(textwrap.dedent(""" + def f(): + for i in range(1000): + pass + """), ns, ns) + f = ns['f'] + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + f() + exe = get_first_executor(f) + self.assertTrue(exe.is_valid()) + _testinternalcapi.invalidate_executors(f.__code__) + self.assertFalse(exe.is_valid()) + +class TestUops(unittest.TestCase): + + def test_basic_loop(self): + def testfunc(x): + i = 0 + while i < x: + i += 1 + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(1000) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_SET_IP", uops) + self.assertIn("LOAD_FAST", uops) + + def test_extended_arg(self): + "Check EXTENDED_ARG handling in superblock creation" + ns = {} + exec(textwrap.dedent(""" + def many_vars(): + # 260 vars, so z9 should have index 259 + a0 = a1 = a2 = a3 = a4 = a5 = a6 = a7 = a8 = a9 = 42 + b0 = b1 = b2 = b3 = b4 = b5 = b6 = b7 = b8 = b9 = 42 + c0 = c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = c9 = 42 + d0 = d1 = d2 = d3 = d4 = d5 = d6 = d7 = d8 = d9 = 42 + e0 = e1 =
e2 = e3 = e4 = e5 = e6 = e7 = e8 = e9 = 42 + f0 = f1 = f2 = f3 = f4 = f5 = f6 = f7 = f8 = f9 = 42 + g0 = g1 = g2 = g3 = g4 = g5 = g6 = g7 = g8 = g9 = 42 + h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = h8 = h9 = 42 + i0 = i1 = i2 = i3 = i4 = i5 = i6 = i7 = i8 = i9 = 42 + j0 = j1 = j2 = j3 = j4 = j5 = j6 = j7 = j8 = j9 = 42 + k0 = k1 = k2 = k3 = k4 = k5 = k6 = k7 = k8 = k9 = 42 + l0 = l1 = l2 = l3 = l4 = l5 = l6 = l7 = l8 = l9 = 42 + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m8 = m9 = 42 + n0 = n1 = n2 = n3 = n4 = n5 = n6 = n7 = n8 = n9 = 42 + o0 = o1 = o2 = o3 = o4 = o5 = o6 = o7 = o8 = o9 = 42 + p0 = p1 = p2 = p3 = p4 = p5 = p6 = p7 = p8 = p9 = 42 + q0 = q1 = q2 = q3 = q4 = q5 = q6 = q7 = q8 = q9 = 42 + r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = r8 = r9 = 42 + s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7 = s8 = s9 = 42 + t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 42 + u0 = u1 = u2 = u3 = u4 = u5 = u6 = u7 = u8 = u9 = 42 + v0 = v1 = v2 = v3 = v4 = v5 = v6 = v7 = v8 = v9 = 42 + w0 = w1 = w2 = w3 = w4 = w5 = w6 = w7 = w8 = w9 = 42 + x0 = x1 = x2 = x3 = x4 = x5 = x6 = x7 = x8 = x9 = 42 + y0 = y1 = y2 = y3 = y4 = y5 = y6 = y7 = y8 = y9 = 42 + z0 = z1 = z2 = z3 = z4 = z5 = z6 = z7 = z8 = z9 = 42 + while z9 > 0: + z9 = z9 - 1 + """), ns, ns) + many_vars = ns["many_vars"] + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + ex = get_first_executor(many_vars) + self.assertIsNone(ex) + many_vars() + + ex = get_first_executor(many_vars) + self.assertIsNotNone(ex) + self.assertIn(("LOAD_FAST", 259, 0), list(ex)) + + def test_unspecialized_unpack(self): + # An example of an unspecialized opcode + def testfunc(x): + i = 0 + while i < x: + i += 1 + a, b = {1: 2, 3: 3} + assert a == 1 and b == 3 + i = 0 + while i < x: + i += 1 + + opt = _testinternalcapi.get_uop_optimizer() + + with temporary_optimizer(opt): + testfunc(20) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_UNPACK_SEQUENCE", 
uops) + + def test_pop_jump_if_false(self): + def testfunc(n): + i = 0 + while i < n: + i += 1 + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(20) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_GUARD_IS_TRUE_POP", uops) + + def test_pop_jump_if_none(self): + def testfunc(a): + for x in a: + if x is None: + x = 0 + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(range(20)) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_GUARD_IS_NOT_NONE_POP", uops) + + def test_pop_jump_if_not_none(self): + def testfunc(a): + for x in a: + x = None + if x is not None: + x = 0 + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(range(20)) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_GUARD_IS_NONE_POP", uops) + + def test_pop_jump_if_true(self): + def testfunc(n): + i = 0 + while not i >= n: + i += 1 + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(20) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_GUARD_IS_FALSE_POP", uops) + + def test_jump_backward(self): + def testfunc(n): + i = 0 + while i < n: + i += 1 + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(20) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_JUMP_TO_TOP", uops) + + def test_jump_forward(self): + def testfunc(n): + a = 0 + while a < n: + if a < 0: + a = -a + else: + a = +a + a += 1 + return a + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(20) + + ex = get_first_executor(testfunc) + 
self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + # Since there is no JUMP_FORWARD instruction, + # look for indirect evidence: the += operator + self.assertIn("_BINARY_OP_ADD_INT", uops) + + def test_for_iter_range(self): + def testfunc(n): + total = 0 + for i in range(n): + total += i + return total + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + total = testfunc(20) + self.assertEqual(total, 190) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + # for i, (opname, oparg) in enumerate(ex): + # print(f"{i:4d}: {opname:<20s} {oparg:3d}") + uops = {opname for opname, _, _ in ex} + self.assertIn("_GUARD_NOT_EXHAUSTED_RANGE", uops) + # Verification that the jump goes past END_FOR + # is done by manual inspection of the output + + def test_for_iter_list(self): + def testfunc(a): + total = 0 + for i in a: + total += i + return total + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + a = list(range(20)) + total = testfunc(a) + self.assertEqual(total, 190) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + # for i, (opname, oparg) in enumerate(ex): + # print(f"{i:4d}: {opname:<20s} {oparg:3d}") + uops = {opname for opname, _, _ in ex} + self.assertIn("_GUARD_NOT_EXHAUSTED_LIST", uops) + # Verification that the jump goes past END_FOR + # is done by manual inspection of the output + + def test_for_iter_tuple(self): + def testfunc(a): + total = 0 + for i in a: + total += i + return total + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + a = tuple(range(20)) + total = testfunc(a) + self.assertEqual(total, 190) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + # for i, (opname, oparg) in enumerate(ex): + # print(f"{i:4d}: {opname:<20s} {oparg:3d}") + uops = {opname for opname, _, _ in ex} + self.assertIn("_GUARD_NOT_EXHAUSTED_TUPLE", uops) + # Verification that the jump goes past END_FOR + # is done by manual 
inspection of the output + + def test_list_edge_case(self): + def testfunc(it): + for x in it: + pass + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + a = [1, 2, 3] + it = iter(a) + testfunc(it) + a.append(4) + with self.assertRaises(StopIteration): + next(it) + + def test_call_py_exact_args(self): + def testfunc(n): + def dummy(x): + return x+1 + for i in range(n): + dummy(i) + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(20) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_PUSH_FRAME", uops) + self.assertIn("_BINARY_OP_ADD_INT", uops) + + def test_branch_taken(self): + def testfunc(n): + for i in range(n): + if i < 0: + i = 0 + else: + i = 1 + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(20) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_GUARD_IS_FALSE_POP", uops) + + def test_for_iter_tier_two(self): + class MyIter: + def __init__(self, n): + self.n = n + def __iter__(self): + return self + def __next__(self): + self.n -= 1 + if self.n < 0: + raise StopIteration + return self.n + + def testfunc(n, m): + x = 0 + for i in range(m): + for j in MyIter(n): + x += 1000*i + j + return x + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + x = testfunc(10, 10) + + self.assertEqual(x, sum(range(10)) * 10010) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_FOR_ITER_TIER_TWO", uops) + + def test_confidence_score(self): + def testfunc(n): + bits = 0 + for i in range(n): + if i & 0x01: + bits += 1 + if i & 0x02: + bits += 1 + if i&0x04: + bits += 1 + if i&0x08: + bits += 1 + if i&0x10: + bits += 1 + if i&0x20: + bits += 1 + return bits + + opt = _testinternalcapi.get_uop_optimizer() + with 
temporary_optimizer(opt): + x = testfunc(20) + + self.assertEqual(x, 40) + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + ops = [opname for opname, _, _ in ex] + count = ops.count("_GUARD_IS_TRUE_POP") + # Because Each 'if' halves the score, the second branch is + # too much already. + self.assertEqual(count, 1) + + +if __name__ == "__main__": + unittest.main() From 075ab91692d0ee6fcd1c8c519bf982e9a17c15ae Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 2023 13:44:08 -0800 Subject: [PATCH 24/37] In _PyOptimizer_Unanchored, assert not ENTER_EXECUTOR, accept JUMP_BACKWARD --- Python/optimizer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 14a9ee5c173d63..44cd3fd4809614 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -198,9 +198,7 @@ _PyOptimizer_Unanchored( _PyExecutorObject **pexecutor, PyObject **stack_pointer) { - if (instr->op.code == JUMP_BACKWARD || instr->op.code == ENTER_EXECUTOR) { - return 0; - } + assert(instr->op.code != ENTER_EXECUTOR); PyCodeObject *code = _PyFrame_GetCode(frame); assert(PyCode_Check(code)); PyInterpreterState *interp = _PyInterpreterState_GET(); From c54daef2697211e1c88eef82a9b801defaea5649 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 2023 13:51:35 -0800 Subject: [PATCH 25/37] Call DISPATCH() directly from exit_trace --- Python/ceval.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 75ab7452ffa5a3..1897ac8ef89a97 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1066,11 +1066,10 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int // Jump here from _EXIT_TRACE exit_trace: - _PyFrame_SetStackPointer(frame, stack_pointer); - frame->instr_ptr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); + frame->instr_ptr = next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); 
Py_DECREF(current_executor); OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); - goto enter_tier_one; + DISPATCH(); // Jump here from DEOPT_IF() deoptimize: From 934a11529c48a3c1da11e86002b618b9f847b331 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 2023 13:52:29 -0800 Subject: [PATCH 26/37] Correct comment on deoptimize --- Python/ceval.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/ceval.c b/Python/ceval.c index 1897ac8ef89a97..8d82f94c2bea98 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1073,7 +1073,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int // Jump here from DEOPT_IF() deoptimize: - // On DEOPT_IF we must repeat the last instruction. + // On DEOPT_IF we must execute the target instruction. // This presumes nothing was popped from the stack (nor pushed). // There are some other things to take care of first, though. frame->instr_ptr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); From 8f5e623544161721318c43a8e189f6edf24be92b Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 2023 13:55:39 -0800 Subject: [PATCH 27/37] Remove enter_tier_one label --- Python/ceval.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 8d82f94c2bea98..3a7496bae80659 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -752,10 +752,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int goto exit_unwind; } -// Jump here from ENTER_EXECUTOR and exit_trace. 
-enter_tier_one: next_instr = frame->instr_ptr; - resume_frame: stack_pointer = _PyFrame_GetStackPointer(frame); From 10b98f12d9c233a15f1fbfc8e51684d2887ed075 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 2023 14:03:34 -0800 Subject: [PATCH 28/37] Add test --- Lib/test/test_capi/test_opt.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index 9f4731103c9413..a6ce3a22a32aca 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -539,6 +539,27 @@ def testfunc(n): # too much already. self.assertEqual(count, 1) + def test_side_exits(self): + def testfunc(): + for _ in range(100): + for i in range(100): + if i >= 70: + i = 0 + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc() + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_GUARD_IS_FALSE_POP", uops) + subs = [sub for sub in ex.sub_executors() if sub is not None] + self.assertGreater(len(subs), 0) + sub = subs[0] + sub_uops = {opname for opname, _, _ in sub} + self.assertIn("_GUARD_IS_TRUE_POP", sub_uops) + if __name__ == "__main__": unittest.main() From 1450ca627181f4c33e0801bc9c945edca0e5d43d Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 2023 15:11:50 -0800 Subject: [PATCH 29/37] Fix memory leak --- Python/ceval.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/ceval.c b/Python/ceval.c index 3a7496bae80659..5709ef7a2d0cf0 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1112,6 +1112,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int uint16_t threshold = tstate->interp->optimizer_resume_threshold + (1 << 15); if (ucounter <= threshold) { + Py_DECREF(current_executor); goto resume_frame; } From dcde4d3cf83a48f4abe795b72eaf13190ec662d7 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 
2023 15:25:37 -0800 Subject: [PATCH 30/37] Clear sub-executors array upon dealloc --- Python/optimizer.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Python/optimizer.c b/Python/optimizer.c index 44cd3fd4809614..7d23862d4948f5 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -342,6 +342,11 @@ PyUnstable_Optimizer_NewCounter(void) static void uop_dealloc(_PyUOpExecutorObject *self) { _Py_ExecutorClear((_PyExecutorObject *)self); + if (self->executors != NULL) { + for (int i = Py_SIZE(self); --i >= 0; ) { + Py_XDECREF(self->executors[i]); + } + } PyObject_Free(self); } From 15df63fbfc42c3eb21acc660b5fd23948821ced8 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 2023 15:30:25 -0800 Subject: [PATCH 31/37] Add blurb --- .../2023-12-13-15-30-19.gh-issue-112354.Z6yyTb.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-12-13-15-30-19.gh-issue-112354.Z6yyTb.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-12-13-15-30-19.gh-issue-112354.Z6yyTb.rst b/Misc/NEWS.d/next/Core and Builtins/2023-12-13-15-30-19.gh-issue-112354.Z6yyTb.rst new file mode 100644 index 00000000000000..dbc4d9a8309b3c --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-12-13-15-30-19.gh-issue-112354.Z6yyTb.rst @@ -0,0 +1,2 @@ +In the Tier 2 interpreter, add side exits to sub-executors for certain +micro-opcodes (currently only conditional branches). 
From c786418c25e8eaf9fba5206793249b4d64e9b19d Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 2023 15:43:38 -0800 Subject: [PATCH 32/37] Avoid redundant stack frame saves/restores --- Python/ceval.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 5709ef7a2d0cf0..43cf10203517a9 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -755,6 +755,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int next_instr = frame->instr_ptr; resume_frame: stack_pointer = _PyFrame_GetStackPointer(frame); +resume_frame_using_stack_pointer: #ifdef LLTRACE lltrace = maybe_lltrace_resume_frame(frame, &entry_frame, GLOBALS()); @@ -1081,7 +1082,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); UOP_STAT_INC(uopcode, miss); frame->return_offset = 0; // Don't leave this random - _PyFrame_SetStackPointer(frame, stack_pointer); // Check if there is a side-exit executor here already. int pc = next_uop - 1 - current_executor->trace; @@ -1113,7 +1113,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int if (ucounter <= threshold) { Py_DECREF(current_executor); - goto resume_frame; + goto resume_frame_using_stack_pointer; } // Decode instruction to look past EXTENDED_ARG. @@ -1186,7 +1186,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int *pcounter = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff; Py_DECREF(current_executor); - goto resume_frame; + goto resume_frame_using_stack_pointer; } #if defined(__GNUC__) From ee0734b71013983e48d4544685f9883dcab2587d Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 13 Dec 2023 17:14:47 -0800 Subject: [PATCH 33/37] Revert "Disable curses tests in --fast-ci mode (make test)" This has no business being in this PR. 
This reverts commit f1998c0428d3b34f54a93454575eaba8f2b6b375. --- Lib/test/libregrtest/cmdline.py | 2 +- Lib/test/test_regrtest.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/Lib/test/libregrtest/cmdline.py b/Lib/test/libregrtest/cmdline.py index 35971cdb88ee37..0053bce4292f64 100644 --- a/Lib/test/libregrtest/cmdline.py +++ b/Lib/test/libregrtest/cmdline.py @@ -442,7 +442,7 @@ def _parse_args(args, **kwargs): # Similar to: -u "all,-cpu" --timeout=600 if ns.use is None: ns.use = [] - ns.use.insert(0, ['all', '-cpu', '-curses']) + ns.use.insert(0, ['all', '-cpu']) if ns.timeout is None: ns.timeout = 600 # 10 minutes diff --git a/Lib/test/test_regrtest.py b/Lib/test/test_regrtest.py index 3c73ba14bc507e..e828941f6c779d 100644 --- a/Lib/test/test_regrtest.py +++ b/Lib/test/test_regrtest.py @@ -425,7 +425,6 @@ def test_fast_ci(self): args = ['--fast-ci'] use_resources = sorted(cmdline.ALL_RESOURCES) use_resources.remove('cpu') - use_resources.remove('curses') regrtest = self.check_ci_mode(args, use_resources) self.assertEqual(regrtest.timeout, 10 * 60) @@ -433,7 +432,6 @@ def test_fast_ci_python_cmd(self): args = ['--fast-ci', '--python', 'python -X dev'] use_resources = sorted(cmdline.ALL_RESOURCES) use_resources.remove('cpu') - use_resources.remove('curses') regrtest = self.check_ci_mode(args, use_resources, rerun=False) self.assertEqual(regrtest.timeout, 10 * 60) self.assertEqual(regrtest.python_cmd, ('python', '-X', 'dev')) @@ -443,7 +441,6 @@ def test_fast_ci_resource(self): args = ['--fast-ci', '-u-network'] use_resources = sorted(cmdline.ALL_RESOURCES) use_resources.remove('cpu') - use_resources.remove('curses') use_resources.remove('network') self.check_ci_mode(args, use_resources) From f5b317a6a1fcc4c68ad1448f0d36617127a66834 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 14 Dec 2023 09:34:00 -0800 Subject: [PATCH 34/37] Fix compiler warning about int/Py_ssize_t --- Python/optimizer.c | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 7d23862d4948f5..9a38d599cf39da 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -343,7 +343,7 @@ static void uop_dealloc(_PyUOpExecutorObject *self) { _Py_ExecutorClear((_PyExecutorObject *)self); if (self->executors != NULL) { - for (int i = Py_SIZE(self); --i >= 0; ) { + for (Py_ssize_t i = Py_SIZE(self); --i >= 0; ) { Py_XDECREF(self->executors[i]); } } From 4804a3c93d09990a4db029338d09314fc47d34ae Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 15 Dec 2023 20:54:32 -0800 Subject: [PATCH 35/37] Be less casual about incref/decref current executor --- Python/ceval.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 16d283ef9646a1..82fdc88569665d 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1085,9 +1085,9 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int code->co_firstlineno, 2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame)))); #endif + _PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject *)Py_NewRef(*pexecutor); Py_DECREF(current_executor); - current_executor = (_PyUOpExecutorObject *)*pexecutor; - Py_INCREF(current_executor); + current_executor = new_executor; goto enter_tier_two; } @@ -1142,17 +1142,17 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int code->co_firstlineno, 2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame)))); #endif - Py_DECREF(current_executor); - current_executor = (_PyUOpExecutorObject *)*pexecutor; + _PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject *)Py_NewRef(*pexecutor); // Reject trace if it repeats the uop that just deoptimized. 
- int jump_opcode = current_executor->trace[0].opcode; + int jump_opcode = new_executor->trace[0].opcode; if (jump_opcode == _IS_NONE) { - jump_opcode = current_executor->trace[1].opcode; + jump_opcode = new_executor->trace[1].opcode; } if (jump_opcode != uopcode) { *pcounter &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); - Py_INCREF(current_executor); + Py_DECREF(current_executor); + current_executor = new_executor; goto enter_tier_two; // All systems go! } @@ -1160,7 +1160,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int DPRINTF(2, "Alas, it's the same uop again (%s) -- discarding trace\n", _PyUOpName(jump_opcode)); *pexecutor = NULL; - // It will be decref'ed below. + Py_DECREF(new_executor); } } From 46c7d2609451aef876e73bd457084aa8eff210d9 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 16 Dec 2023 15:32:24 -0800 Subject: [PATCH 36/37] Slightly nicer way to handle refcounts --- Python/ceval.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Python/ceval.c b/Python/ceval.c index 82fdc88569665d..b01616724ab371 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1122,7 +1122,8 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int _PyUOpName(uopcode), pc, current_executor, (int)(*pcounter)); DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[opcode]); - int optimized = _PyOptimizer_Unanchored(frame, next_instr, pexecutor, stack_pointer); + _PyExecutorObject *tmp_executor = NULL; + int optimized = _PyOptimizer_Unanchored(frame, next_instr, &tmp_executor, stack_pointer); if (optimized < 0) { goto error_tier_two; } @@ -1142,7 +1143,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int code->co_firstlineno, 2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame)))); #endif - _PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject *)Py_NewRef(*pexecutor); + _PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject 
*)Py_NewRef(tmp_executor); // Reject trace if it repeats the uop that just deoptimized. int jump_opcode = new_executor->trace[0].opcode; @@ -1150,6 +1151,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int jump_opcode = new_executor->trace[1].opcode; } if (jump_opcode != uopcode) { + *pexecutor = tmp_executor; *pcounter &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); Py_DECREF(current_executor); current_executor = new_executor; @@ -1159,7 +1161,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int // The trace is guaranteed to deopt again; forget about it. DPRINTF(2, "Alas, it's the same uop again (%s) -- discarding trace\n", _PyUOpName(jump_opcode)); - *pexecutor = NULL; + Py_DECREF(tmp_executor); Py_DECREF(new_executor); } } From b9912798c89bcd84583f4170953b4266409c4de9 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 16 Dec 2023 16:43:03 -0800 Subject: [PATCH 37/37] Silence compiler warning --- Python/ceval.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/ceval.c b/Python/ceval.c index b01616724ab371..93e2d705d84584 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1074,7 +1074,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int frame->return_offset = 0; // Don't leave this random // Check if there is a side-exit executor here already. - int pc = next_uop - 1 - current_executor->trace; + int pc = (int)(next_uop - 1 - current_executor->trace); _PyExecutorObject **pexecutor = current_executor->executors + pc; if (*pexecutor != NULL) { #ifdef Py_DEBUG