From 530fa9b79bef8185e4da79cdc09e8bb9987fe030 Mon Sep 17 00:00:00 2001
From: Mark Shannon
Date: Mon, 30 May 2022 15:37:18 +0100
Subject: [PATCH 1/2] Use exponential backoff to avoid excessive
 specialization attempts.

---
 Include/internal/pycore_code.h                 | 32 ++++++++++++-
 ...2-05-30-15-35-42.gh-issue-93354.RZk8gs.rst  |  3 ++
 Python/ceval.c                                 | 46 ++++++++++---------
 Python/specialize.c                            | 43 +++++++++--------
 4 files changed, 79 insertions(+), 45 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-05-30-15-35-42.gh-issue-93354.RZk8gs.rst

diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h
index 2cfa319f2d2902..c7142c02fe1d02 100644
--- a/Include/internal/pycore_code.h
+++ b/Include/internal/pycore_code.h
@@ -227,8 +227,10 @@ extern void _PyLineTable_InitAddressRange(
 extern int _PyLineTable_NextAddressRange(PyCodeAddressRange *range);
 extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range);
 
-
-#define ADAPTIVE_CACHE_BACKOFF 64
+/* With a 16-bit counter, we have 12 bits for the counter value, and 4 bits for the backoff */
+#define ADAPTIVE_BACKOFF_BITS 4
+/* The initial counter value is 2**backoff - 1, so a backoff of 5 gives a value of 31 */
+#define ADAPTIVE_BACKOFF_START 5
 
 /* Specialization functions */
 
@@ -423,6 +425,32 @@ write_location_entry_start(uint8_t *ptr, int code, int length)
 }
 
 
+static inline uint16_t
+adaptive_counter_bits(int value, int backoff) {
+    return (value << ADAPTIVE_BACKOFF_BITS) |
+        (backoff & ((1<<ADAPTIVE_BACKOFF_BITS)-1));
+}
+
+static inline uint16_t
+adaptive_counter_start(void) {
+    unsigned int value = (1 << ADAPTIVE_BACKOFF_START) - 1;
+    return adaptive_counter_bits(value, ADAPTIVE_BACKOFF_START);
+}
+
+#define MAX_BACKOFF_VALUE (16 - ADAPTIVE_BACKOFF_BITS)
+
+static inline uint16_t
+adaptive_counter_backoff(uint16_t counter) {
+    unsigned int backoff = counter & ((1<<ADAPTIVE_BACKOFF_BITS)-1);
+    backoff++;
+    if (backoff > MAX_BACKOFF_VALUE) {
+        backoff = MAX_BACKOFF_VALUE;
+    }
+    unsigned int value = (1 << backoff) - 1;
+    return adaptive_counter_bits(value, backoff);
+}
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-05-30-15-35-42.gh-issue-93354.RZk8gs.rst b/Misc/NEWS.d/next/Core and Builtins/2022-05-30-15-35-42.gh-issue-93354.RZk8gs.rst
new file mode 100644
index 00000000000000..dcfe6a9b6ba3a5
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-05-30-15-35-42.gh-issue-93354.RZk8gs.rst
@@ -0,0 +1,3 @@
+Use exponential backoff for specialization counters in the interpreter. This
+can significantly reduce the number of failed specializations and avoids
+slowdown for those parts of a program that are not suitable for specialization.
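To make the new encoding concrete before the interpreter changes below: each inline cache starts with a single 16-bit value whose high 12 bits count down remaining executions and whose low 4 bits hold the backoff exponent. The following standalone sketch is not CPython code; the helper names only mirror the functions added to pycore_code.h, and main() is just an illustrative driver. It shows how repeated specialization failures stretch the wait to 31, 63, 127, ... executions, capped at 4095.

#include <stdint.h>
#include <stdio.h>

/* Mirrors ADAPTIVE_BACKOFF_BITS / ADAPTIVE_BACKOFF_START from the patch. */
#define BACKOFF_BITS  4
#define BACKOFF_START 5
#define MAX_BACKOFF   (16 - BACKOFF_BITS)

/* Pack a countdown value and a backoff exponent into one 16-bit cache entry. */
static uint16_t counter_bits(unsigned value, unsigned backoff) {
    return (uint16_t)((value << BACKOFF_BITS) | (backoff & ((1u << BACKOFF_BITS) - 1)));
}

/* Counter written when an instruction is (re)armed: wait 31 executions. */
static uint16_t counter_start(void) {
    return counter_bits((1u << BACKOFF_START) - 1, BACKOFF_START);
}

/* Counter written after a failed specialization attempt: double the wait,
 * capped so the countdown still fits in the high 12 bits. */
static uint16_t counter_backoff(uint16_t counter) {
    unsigned backoff = counter & ((1u << BACKOFF_BITS) - 1);
    if (++backoff > MAX_BACKOFF) {
        backoff = MAX_BACKOFF;
    }
    return counter_bits((1u << backoff) - 1, backoff);
}

int main(void) {
    uint16_t c = counter_start();
    /* Simulate an instruction whose specialization attempts always fail. */
    for (int fail = 0; fail < 10; fail++) {
        printf("backoff=%2u  executions until next attempt=%4u\n",
               (unsigned)(c & ((1u << BACKOFF_BITS) - 1)),
               (unsigned)(c >> BACKOFF_BITS));
        c = counter_backoff(c);
    }
    return 0;
}

Under the old scheme the counter was simply reset to the fixed ADAPTIVE_CACHE_BACKOFF value of 64 after every failure, so an unspecializable instruction kept retrying forever at the same rate; with the exponential scheme each failure doubles the gap until retries become negligible.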
diff --git a/Python/ceval.c b/Python/ceval.c
index fdbf0d4d6e99de..36546929e19c43 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -1561,7 +1561,11 @@ eval_frame_handle_pending(PyThreadState *tstate)
         dtrace_function_entry(frame); \
     }
 
+#define ADAPTIVE_COUNTER_IS_ZERO(cache) \
+    (cache)->counter < (1<<ADAPTIVE_BACKOFF_BITS)
+#define DECREMENT_ADAPTIVE_COUNTER(cache) \
+    (cache)->counter -= (1<<ADAPTIVE_BACKOFF_BITS)
@@ ... @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
         TARGET(BINARY_SUBSCR_ADAPTIVE) {
             _PyBinarySubscrCache *cache = (_PyBinarySubscrCache *)next_instr;
-            if (cache->counter == 0) {
+            if (ADAPTIVE_COUNTER_IS_ZERO(cache)) {
                 PyObject *sub = TOP();
                 PyObject *container = SECOND();
                 next_instr--;
@@ -2167,7 +2171,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
             }
             else {
                 STAT_INC(BINARY_SUBSCR, deferred);
-                cache->counter--;
+                DECREMENT_ADAPTIVE_COUNTER(cache);
                 JUMP_TO_INSTRUCTION(BINARY_SUBSCR);
             }
         }
@@ -2321,7 +2325,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
 
         TARGET(STORE_SUBSCR_ADAPTIVE) {
             _PyStoreSubscrCache *cache = (_PyStoreSubscrCache *)next_instr;
-            if (cache->counter == 0) {
+            if (ADAPTIVE_COUNTER_IS_ZERO(cache)) {
                 PyObject *sub = TOP();
                 PyObject *container = SECOND();
                 next_instr--;
@@ -2332,7 +2336,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
             }
             else {
                 STAT_INC(STORE_SUBSCR, deferred);
-                cache->counter--;
+                DECREMENT_ADAPTIVE_COUNTER(cache);
                 JUMP_TO_INSTRUCTION(STORE_SUBSCR);
             }
         }
@@ -2815,7 +2819,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
         TARGET(UNPACK_SEQUENCE_ADAPTIVE) {
             assert(cframe.use_tracing == 0);
             _PyUnpackSequenceCache *cache = (_PyUnpackSequenceCache *)next_instr;
-            if (cache->counter == 0) {
+            if (ADAPTIVE_COUNTER_IS_ZERO(cache)) {
                 PyObject *seq = TOP();
                 next_instr--;
                 _Py_Specialize_UnpackSequence(seq, next_instr, oparg);
@@ -2823,7 +2827,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
             }
             else {
                 STAT_INC(UNPACK_SEQUENCE, deferred);
-                cache->counter--;
+                DECREMENT_ADAPTIVE_COUNTER(cache);
                 JUMP_TO_INSTRUCTION(UNPACK_SEQUENCE);
             }
         }
@@ -3056,7 +3060,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
         TARGET(LOAD_GLOBAL_ADAPTIVE) {
             assert(cframe.use_tracing == 0);
             _PyLoadGlobalCache *cache = (_PyLoadGlobalCache *)next_instr;
-            if (cache->counter == 0) {
+            if (ADAPTIVE_COUNTER_IS_ZERO(cache)) {
                 PyObject *name = GETITEM(names, oparg>>1);
                 next_instr--;
                 if (_Py_Specialize_LoadGlobal(GLOBALS(), BUILTINS(), next_instr, name) < 0) {
@@ -3066,7 +3070,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
             }
            else {
                 STAT_INC(LOAD_GLOBAL, deferred);
-                cache->counter--;
+                DECREMENT_ADAPTIVE_COUNTER(cache);
                 JUMP_TO_INSTRUCTION(LOAD_GLOBAL);
             }
         }
@@ -3480,7 +3484,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
         TARGET(LOAD_ATTR_ADAPTIVE) {
             assert(cframe.use_tracing == 0);
             _PyAttrCache *cache = (_PyAttrCache *)next_instr;
-            if (cache->counter == 0) {
+            if (ADAPTIVE_COUNTER_IS_ZERO(cache)) {
                 PyObject *owner = TOP();
                 PyObject *name = GETITEM(names, oparg);
                 next_instr--;
@@ -3491,7 +3495,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
             }
             else {
                 STAT_INC(LOAD_ATTR, deferred);
-                cache->counter--;
+                DECREMENT_ADAPTIVE_COUNTER(cache);
                 JUMP_TO_INSTRUCTION(LOAD_ATTR);
             }
         }
@@ -3589,7 +3593,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
         TARGET(STORE_ATTR_ADAPTIVE) {
             assert(cframe.use_tracing == 0);
             _PyAttrCache *cache = (_PyAttrCache *)next_instr;
-            if (cache->counter == 0) {
+            if (ADAPTIVE_COUNTER_IS_ZERO(cache)) {
                 PyObject *owner = TOP();
                 PyObject *name = GETITEM(names, oparg);
                 next_instr--;
@@ -3600,7 +3604,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
             }
             else {
                 STAT_INC(STORE_ATTR, deferred);
-                cache->counter--;
+                DECREMENT_ADAPTIVE_COUNTER(cache);
                 JUMP_TO_INSTRUCTION(STORE_ATTR);
             }
         }
@@ -3719,7 +3723,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
         TARGET(COMPARE_OP_ADAPTIVE) {
             assert(cframe.use_tracing == 0);
             _PyCompareOpCache *cache = (_PyCompareOpCache *)next_instr;
-            if (cache->counter == 0) {
+            if (ADAPTIVE_COUNTER_IS_ZERO(cache)) {
                 PyObject *right = TOP();
                 PyObject *left = SECOND();
                 next_instr--;
@@ -3728,7 +3732,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
             }
             else {
                 STAT_INC(COMPARE_OP, deferred);
-                cache->counter--;
+                DECREMENT_ADAPTIVE_COUNTER(cache);
                 JUMP_TO_INSTRUCTION(COMPARE_OP);
             }
         }
@@ -4526,7 +4530,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
         TARGET(LOAD_METHOD_ADAPTIVE) {
             assert(cframe.use_tracing == 0);
             _PyLoadMethodCache *cache = (_PyLoadMethodCache *)next_instr;
-            if (cache->counter == 0) {
+            if (ADAPTIVE_COUNTER_IS_ZERO(cache)) {
                 PyObject *owner = TOP();
                 PyObject *name = GETITEM(names, oparg);
                 next_instr--;
@@ -4537,7 +4541,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
             }
             else {
                 STAT_INC(LOAD_METHOD, deferred);
-                cache->counter--;
+                DECREMENT_ADAPTIVE_COUNTER(cache);
                 JUMP_TO_INSTRUCTION(LOAD_METHOD);
             }
         }
@@ -4775,7 +4779,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
 
         TARGET(CALL_ADAPTIVE) {
             _PyCallCache *cache = (_PyCallCache *)next_instr;
-            if (cache->counter == 0) {
+            if (ADAPTIVE_COUNTER_IS_ZERO(cache)) {
                 next_instr--;
                 int is_meth = is_method(stack_pointer, oparg);
                 int nargs = oparg + is_meth;
@@ -4789,7 +4793,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
             }
             else {
                 STAT_INC(CALL, deferred);
-                cache->counter--;
+                DECREMENT_ADAPTIVE_COUNTER(cache);
                 goto call_function;
             }
         }
@@ -5521,7 +5525,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
         TARGET(BINARY_OP_ADAPTIVE) {
             assert(cframe.use_tracing == 0);
             _PyBinaryOpCache *cache = (_PyBinaryOpCache *)next_instr;
-            if (cache->counter == 0) {
+            if (ADAPTIVE_COUNTER_IS_ZERO(cache)) {
                 PyObject *lhs = SECOND();
                 PyObject *rhs = TOP();
                 next_instr--;
@@ -5530,7 +5534,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
             }
             else {
                 STAT_INC(BINARY_OP, deferred);
-                cache->counter--;
+                DECREMENT_ADAPTIVE_COUNTER(cache);
                 JUMP_TO_INSTRUCTION(BINARY_OP);
             }
         }
@@ -5658,7 +5662,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
             assert(adaptive_opcode);
             _Py_SET_OPCODE(next_instr[-1], adaptive_opcode);
             STAT_INC(opcode, deopt);
-            *counter = ADAPTIVE_CACHE_BACKOFF;
+            *counter = adaptive_counter_start();
         }
         next_instr--;
         DISPATCH_GOTO();
diff --git a/Python/specialize.c b/Python/specialize.c
index a2fb460388055d..4d30a2aea5f3be 100644
--- a/Python/specialize.c
+++ b/Python/specialize.c
@@ -321,7 +321,7 @@ _PyCode_Quicken(PyCodeObject *code)
 }
 
 static inline int
-initial_counter_value(void) {
+miss_counter_start(void) {
     /* Starting value for the counter.
      * This value needs to be not too low, otherwise
      * it would cause excessive de-optimization.
@@ -743,12 +743,12 @@ _Py_Specialize_LoadAttr(PyObject *owner, _Py_CODEUNIT *instr, PyObject *name)
 fail:
     STAT_INC(LOAD_ATTR, failure);
     assert(!PyErr_Occurred());
-    cache->counter = ADAPTIVE_CACHE_BACKOFF;
+    cache->counter = adaptive_counter_backoff(cache->counter);
     return 0;
 success:
     STAT_INC(LOAD_ATTR, success);
     assert(!PyErr_Occurred());
-    cache->counter = initial_counter_value();
+    cache->counter = miss_counter_start();
     return 0;
 }
@@ -825,12 +825,12 @@ _Py_Specialize_StoreAttr(PyObject *owner, _Py_CODEUNIT *instr, PyObject *name)
 fail:
     STAT_INC(STORE_ATTR, failure);
     assert(!PyErr_Occurred());
-    cache->counter = ADAPTIVE_CACHE_BACKOFF;
+    cache->counter = adaptive_counter_backoff(cache->counter);
     return 0;
 success:
     STAT_INC(STORE_ATTR, success);
     assert(!PyErr_Occurred());
-    cache->counter = initial_counter_value();
+    cache->counter = miss_counter_start();
     return 0;
 }
@@ -1041,14 +1041,13 @@ _Py_Specialize_LoadMethod(PyObject *owner, _Py_CODEUNIT *instr, PyObject *name)
 success:
     STAT_INC(LOAD_METHOD, success);
     assert(!PyErr_Occurred());
-    cache->counter = initial_counter_value();
+    cache->counter = miss_counter_start();
     return 0;
 fail:
     STAT_INC(LOAD_METHOD, failure);
     assert(!PyErr_Occurred());
-    cache->counter = ADAPTIVE_CACHE_BACKOFF;
+    cache->counter = adaptive_counter_backoff(cache->counter);
     return 0;
-
 }
 
 int
@@ -1124,12 +1123,12 @@ _Py_Specialize_LoadGlobal(
 fail:
     STAT_INC(LOAD_GLOBAL, failure);
     assert(!PyErr_Occurred());
-    cache->counter = ADAPTIVE_CACHE_BACKOFF;
+    cache->counter = adaptive_counter_backoff(cache->counter);
     return 0;
 success:
     STAT_INC(LOAD_GLOBAL, success);
     assert(!PyErr_Occurred());
-    cache->counter = initial_counter_value();
+    cache->counter = miss_counter_start();
     return 0;
 }
@@ -1253,12 +1252,12 @@ _Py_Specialize_BinarySubscr(
 fail:
     STAT_INC(BINARY_SUBSCR, failure);
     assert(!PyErr_Occurred());
-    cache->counter = ADAPTIVE_CACHE_BACKOFF;
+    cache->counter = adaptive_counter_backoff(cache->counter);
     return 0;
 success:
     STAT_INC(BINARY_SUBSCR, success);
     assert(!PyErr_Occurred());
-    cache->counter = initial_counter_value();
+    cache->counter = miss_counter_start();
     return 0;
 }
@@ -1357,12 +1356,12 @@ _Py_Specialize_StoreSubscr(PyObject *container, PyObject *sub, _Py_CODEUNIT *ins
 fail:
     STAT_INC(STORE_SUBSCR, failure);
     assert(!PyErr_Occurred());
-    cache->counter = ADAPTIVE_CACHE_BACKOFF;
+    cache->counter = adaptive_counter_backoff(cache->counter);
     return 0;
 success:
     STAT_INC(STORE_SUBSCR, success);
     assert(!PyErr_Occurred());
-    cache->counter = initial_counter_value();
+    cache->counter = miss_counter_start();
     return 0;
 }
@@ -1674,12 +1673,12 @@ _Py_Specialize_Call(PyObject *callable, _Py_CODEUNIT *instr, int nargs,
     if (fail) {
         STAT_INC(CALL, failure);
         assert(!PyErr_Occurred());
-        cache->counter = ADAPTIVE_CACHE_BACKOFF;
+        cache->counter = adaptive_counter_backoff(cache->counter);
     }
     else {
         STAT_INC(CALL, success);
         assert(!PyErr_Occurred());
-        cache->counter = initial_counter_value();
+        cache->counter = miss_counter_start();
     }
     return 0;
 }
@@ -1827,11 +1826,11 @@ _Py_Specialize_BinaryOp(PyObject *lhs, PyObject *rhs, _Py_CODEUNIT *instr,
     }
     SPECIALIZATION_FAIL(BINARY_OP, binary_op_fail_kind(oparg, lhs, rhs));
     STAT_INC(BINARY_OP, failure);
-    cache->counter = ADAPTIVE_CACHE_BACKOFF;
+    cache->counter = adaptive_counter_backoff(cache->counter);
     return;
 success:
     STAT_INC(BINARY_OP, success);
-    cache->counter = initial_counter_value();
+    cache->counter = miss_counter_start();
 }
 
@@ -1953,11 +1952,11 @@ _Py_Specialize_CompareOp(PyObject *lhs, PyObject *rhs, _Py_CODEUNIT *instr,
     SPECIALIZATION_FAIL(COMPARE_OP, compare_op_fail_kind(lhs, rhs));
 failure:
     STAT_INC(COMPARE_OP, failure);
-    cache->counter = ADAPTIVE_CACHE_BACKOFF;
+    cache->counter = adaptive_counter_backoff(cache->counter);
     return;
 success:
     STAT_INC(COMPARE_OP, success);
-    cache->counter = initial_counter_value();
+    cache->counter = miss_counter_start();
 }
 
 #ifdef Py_STATS
@@ -2003,11 +2002,11 @@ _Py_Specialize_UnpackSequence(PyObject *seq, _Py_CODEUNIT *instr, int oparg)
     SPECIALIZATION_FAIL(UNPACK_SEQUENCE, unpack_sequence_fail_kind(seq));
 failure:
     STAT_INC(UNPACK_SEQUENCE, failure);
-    cache->counter = ADAPTIVE_CACHE_BACKOFF;
+    cache->counter = adaptive_counter_backoff(cache->counter);
     return;
 success:
     STAT_INC(UNPACK_SEQUENCE, success);
-    cache->counter = initial_counter_value();
+    cache->counter = miss_counter_start();
 }
 
 #ifdef Py_STATS

From 89acc3e605211a544b6b2313522e9c3c84c1261f Mon Sep 17 00:00:00 2001
From: Mark Shannon
Date: Mon, 30 May 2022 18:56:58 +0100
Subject: [PATCH 2/2] Add comment explaining counters and exponential backoff.

---
 Include/internal/pycore_code.h | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h
index c7142c02fe1d02..9d33ed71e78907 100644
--- a/Include/internal/pycore_code.h
+++ b/Include/internal/pycore_code.h
@@ -227,11 +227,6 @@ extern void _PyLineTable_InitAddressRange(
 extern int _PyLineTable_NextAddressRange(PyCodeAddressRange *range);
 extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range);
 
-/* With a 16-bit counter, we have 12 bits for the counter value, and 4 bits for the backoff */
-#define ADAPTIVE_BACKOFF_BITS 4
-/* The initial counter value is 2**backoff - 1, so a backoff of 5 gives a value of 31 */
-#define ADAPTIVE_BACKOFF_START 5
-
 /* Specialization functions */
 
 extern int _Py_Specialize_LoadAttr(PyObject *owner, _Py_CODEUNIT *instr,
@@ -425,14 +420,32 @@ write_location_entry_start(uint8_t *ptr, int code, int length)
 }
 
 
+/** Counters
+ * The first 16-bit value in each inline cache is a counter.
+ * When counting misses, the counter is treated as a simple unsigned value.
+ *
+ * When counting executions until the next specialization attempt,
+ * exponential backoff is used to reduce the number of specialization failures.
+ * The high 12 bits store the counter, the low 4 bits store the backoff exponent.
+ * On a specialization failure, the backoff exponent is incremented and the
+ * counter set to (2**backoff - 1).
+ * Backoff == 6 -> starting counter == 63, backoff == 10 -> starting counter == 1023.
+ */
+
+/* With a 16-bit counter, we have 12 bits for the counter value, and 4 bits for the backoff */
+#define ADAPTIVE_BACKOFF_BITS 4
+/* The initial counter value is 31 == 2**ADAPTIVE_BACKOFF_START - 1 */
+#define ADAPTIVE_BACKOFF_START 5
+
+#define MAX_BACKOFF_VALUE (16 - ADAPTIVE_BACKOFF_BITS)
+
+
 static inline uint16_t
 adaptive_counter_bits(int value, int backoff) {
     return (value << ADAPTIVE_BACKOFF_BITS) |
        (backoff & ((1<<ADAPTIVE_BACKOFF_BITS)-1));
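The comment block above describes the packed layout that the two macros added to ceval.c rely on: DECREMENT_ADAPTIVE_COUNTER subtracts 1 << ADAPTIVE_BACKOFF_BITS, so the low 4 backoff bits are never disturbed, and ADAPTIVE_COUNTER_IS_ZERO treats the counter as exhausted as soon as only those 4 bits remain (counter < 1 << ADAPTIVE_BACKOFF_BITS). Below is a minimal self-contained check of that behaviour; it uses local stand-in macros and a made-up packed value rather than the real ceval.c names and cache structs.

#include <assert.h>
#include <stdint.h>

#define BACKOFF_BITS 4

/* Stand-ins for the ceval.c macros; they operate on a bare uint16_t here
 * instead of a cache struct field. */
#define COUNTER_IS_ZERO(c)   ((c) < (1 << BACKOFF_BITS))
#define DECREMENT_COUNTER(c) ((c) -= (1 << BACKOFF_BITS))

int main(void) {
    /* Hypothetical entry: countdown value 2, backoff exponent 6. */
    uint16_t counter = (uint16_t)((2 << BACKOFF_BITS) | 6);

    DECREMENT_COUNTER(counter);               /* countdown 2 -> 1 */
    assert((counter >> BACKOFF_BITS) == 1);
    assert((counter & 0xF) == 6);             /* backoff bits untouched */
    assert(!COUNTER_IS_ZERO(counter));

    DECREMENT_COUNTER(counter);               /* countdown 1 -> 0 */
    assert(COUNTER_IS_ZERO(counter));         /* only the backoff bits remain */
    assert((counter & 0xF) == 6);             /* next failure resumes from backoff 6 */
    return 0;
}

When the specialization attempt that follows also fails, adaptive_counter_backoff() reads those remaining backoff bits, bumps the exponent and writes a fresh, larger countdown back into the cache.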