8000 GH-109369: Add machinery for deoptimizing tier2 executors, both individually and globally. by markshannon · Pull Request #110384 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

GH-109369: Add machinery for deoptimizing tier2 executors, both individually and globally. #110384

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Oct 23, 2023
Merged
Prev Previous commit
Next Next commit
Tidy up code and add test
  • Loading branch information
markshannon committed Oct 4, 2023
commit 60306d818b2e3b1ff2d4d87865762964c9bf23a1
9 changes: 7 additions & 2 deletions Include/cpython/optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ typedef struct _PyExecutorLinkListNode {
struct _PyExecutorObject *previous;
} _PyExecutorLinkListNode;

/* Bloom filter with m = 256 */

/* Bloom filter with m = 256
* https://en.wikipedia.org/wiki/Bloom_filter */
#define BLOOM_FILTER_WORDS 8

typedef struct _bloom_filter {
Expand Down Expand Up @@ -61,10 +63,13 @@ _PyOptimizer_BackEdge(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_

extern _PyOptimizerObject _PyOptimizer_Default;

void _Py_ExecutorInit(_PyExecutorObject *);
void _Py_ExecutorInit(_PyExecutorObject *, _PyBloomFilter *);
void _Py_ExecutorClear(_PyExecutorObject *);
void _Py_BloomFilter_Init(_PyBloomFilter *);
void _Py_BloomFilter_Add(_PyBloomFilter *bloom, void *obj);
PyAPI_FUNC(void) _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj);
PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj);
extern void _Py_Executors_InvalidateAll(PyInterpreterState *interp);

/* For testing */
PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewCounter(void);
Expand Down
43 changes: 30 additions & 13 deletions Lib/test/test_capi/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2387,6 +2387,20 @@ def testfunc(x):
self.assertEqual(code, replace_code)
self.assertEqual(hash(code), hash(replace_code))


def get_first_executor(func):
code = func.__code__
co_code = code.co_code
JUMP_BACKWARD = opcode.opmap["JUMP_BACKWARD"]
for i in range(0, len(co_code), 2):
if co_code[i] == JUMP_BACKWARD:
try:
return _testinternalcapi.get_executor(code, i)
except ValueError:
pass
return None


class TestExecutorInvalidation(unittest.TestCase):

def setUp(self):
Expand Down Expand Up @@ -2431,19 +2445,22 @@ def f{n}():
for exe in executors[:i]:
self.assertTrue(exe.valid)


def get_first_executor(func):
code = func.__code__
co_code = code.co_code
JUMP_BACKWARD = opcode.opmap["JUMP_BACKWARD"]
for i in range(0, len(co_code), 2):
if co_code[i] == JUMP_BACKWARD:
try:
return _testinternalcapi.get_executor(code, i)
except ValueError:
pass
return None

def test_uop_optimizer_invalidation(self):
# Generate a new function at each call
ns = {}
exec(textwrap.dedent("""
def f():
for i in range(1000):
pass
"""), ns, ns)
f = ns['f']
opt = _testinternalcapi.get_uop_optimizer()
with temporary_optimizer(opt):
f()
exe = get_first_executor(f)
self.assertTrue(exe.valid)
_testinternalcapi.invalidate_executors(f.__code__)
self.assertFalse(exe.valid)

class TestUops(unittest.TestCase):

Expand Down
2 changes: 2 additions & 0 deletions Python/instrumentation.c
Original file line number Diff line number Diff line change
Expand Up @@ -1804,6 +1804,7 @@ _PyMonitoring_SetEvents(int tool_id, _PyMonitoringEventSet events)
return -1;
}
set_global_version(interp, new_version);
_Py_Executors_InvalidateAll(interp);
return instrument_all_executing_code_objects(interp);
}

Expand Down Expand Up @@ -1833,6 +1834,7 @@ _PyMonitoring_SetLocalEvents(PyCodeObject *code, int tool_id, _PyMonitoringEvent
/* Force instrumentation update */
code->_co_instrumentation_version -= MONITORING_VERSION_INCREMENT;
}
_Py_Executors_InvalidateDependency(interp, code);
if (_Py_Instrument(code, interp)) {
return -1;
}
Expand Down
69 changes: 43 additions & 26 deletions Python/optimizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,9 @@ counter_optimize(
executor->optimizer = (_PyCounterOptimizerObject *)self;
executor->next_instr = instr;
*exec_ptr = (_PyExecutorObject *)executor;
_Py_ExecutorInit((_PyExecutorObject *)executor);
_PyBloomFilter empty;
_Py_BloomFilter_Init(&empty);
_Py_ExecutorInit((_PyExecutorObject *)executor, &empty);
return 1;
}

Expand Down Expand Up @@ -367,6 +369,12 @@ PySequenceMethods uop_as_sequence = {
.sq_item = (ssizeargfunc)uop_item,
};


static PyMemberDef uop_members[] = {
{ "valid", Py_T_UBYTE, offsetof(_PyUOpExecutorObject, base.vm_data.valid), Py_READONLY, "is valid?" },
{ NULL }
};

static PyTypeObject UOpExecutor_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "uop_executor",
Expand All @@ -375,6 +383,7 @@ static PyTypeObject UOpExecutor_Type = {
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
.tp_dealloc = (destructor)uop_dealloc,
.tp_as_sequence = &uop_as_sequence,
.tp_members = uop_members,
};

static int
Expand Down Expand Up @@ -410,9 +419,11 @@ translate_bytecode_to_trace(
PyCodeObject *code,
_Py_CODEUNIT *instr,
_PyUOpInstruction *trace,
int buffer_size)
int buffer_size,
_PyBloomFilter *dependencies)
{
PyCodeObject *initial_code = code;
_Py_BloomFilter_Add(dependencies, initial_code);
_Py_CODEUNIT *initial_instr = instr;
int trace_length = 0;
int max_length = buffer_size;
Expand Down Expand Up @@ -738,6 +749,7 @@ translate_bytecode_to_trace(
// Increment IP to the return address
instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + 1;
TRACE_STACK_PUSH();
_Py_BloomFilter_Add(dependencies, new_code);
code = new_code;
instr = _PyCode_CODE(code);
DPRINTF(2,
Expand Down Expand Up @@ -896,8 +908,10 @@ uop_optimize(
_PyExecutorObject **exec_ptr,
int curr_stackentries)
{
_PyBloomFilter dependencies;
_Py_BloomFilter_Init(&dependencies);
_PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH);
int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
if (trace_length <= 0) {
// Error or nothing translated
return trace_length;
Expand All @@ -914,7 +928,7 @@ uop_optimize(
}
executor->base.execute = _PyUopExecute;
memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
_Py_ExecutorInit((_PyExecutorObject *)executor);
_Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies);
*exec_ptr = (_PyExecutorObject *)executor;
return 1;
}
Expand All @@ -936,6 +950,8 @@ static PyTypeObject UOpOptimizer_Type = {
PyObject *
PyUnstable_Optimizer_NewUOpOptimizer(void)
{
PyType_Ready(&UOpExecutor_Type);
PyType_Ready(&UOpOptimizer_Type);
_PyOptimizerObject *opt = PyObject_New(_PyOptimizerObject, &UOpOptimizer_Type);
if (opt == NULL) {
return NULL;
Expand Down Expand Up @@ -968,19 +984,25 @@ PyUnstable_Optimizer_NewUOpOptimizer(void)
* giving a false positive of (60/256)**6 == 0.0001
*
* We choose k = 6, rather than a higher number as
* it means the false positive rate goes slower for high n.
* it means the false positive rate grows slower for high n.
*
* n = 5, k = 6 => fp = 2.6e-6
* n = 5, k = 8 => fp = 3.5e-7
* n = 10, k = 6 => fp = 1.6e-4
* n = 10, k = 8 => fp = 0.9e-4
* n = 15, k = 6 => fp = 0.18%
* n = 15, k = 8 => fp = 0.23%
* n = 20, k = 6 => fp = 1.1%
* n = 20, k = 8 => fp = 2.3%
*
* The above analysis assumes perfect hash functions,
* but we don't have those, so it is approximate.
* but those don't exist, so the real false positive
* rates may be worse.
*/

#define K 6

/* TO DO -- Use more modern hash functions with better distribution of bits */
static uint32_t
address_to_hash(void *ptr, uint32_t seed, uint32_t multiplier) {
assert(ptr != NULL);
Expand All @@ -1004,12 +1026,17 @@ static const uint32_t seeds[2] = {
2147483647,
};

static void
address_to_hashes(void *ptr, _PyBloomFilter *bloom)
void
_Py_BloomFilter_Init(_PyBloomFilter *bloom)
{
for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
bloom->bits[i] = 0;
}
}

void
_Py_BloomFilter_Add(_PyBloomFilter *bloom, void *ptr)
{
/* Set k (6) bits */
for (int i = 0; i < 2; i++) {
uint32_t hash = address_to_hash(ptr, seeds[i], multipliers[i]);
Expand All @@ -1021,16 +1048,6 @@ address_to_hashes(void *ptr, _PyBloomFilter *bloom)
}
}

static void
add_to_bloom_filter(_PyBloomFilter *bloom, PyObject *obj)
{
_PyBloomFilter hashes;
address_to_hashes(obj, &hashes);
for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
bloom->bits[i] |= hashes.bits[i];
}
}

static bool
bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes)
{
Expand Down Expand Up @@ -1091,11 +1108,11 @@ unlink_executor(_PyExecutorObject *executor)

/* This must be called by optimizers before using the executor */
void
_Py_ExecutorInit(_PyExecutorObject *executor)
_Py_ExecutorInit(_PyExecutorObject *executor, _PyBloomFilter *dependency_set)
{
executor->vm_data.valid = true;
for (int i = 0; i < BLOOM_FILTER_WORDS; i++) {
executor->vm_data.bloom.bits[i] = 0;
executor->vm_data.bloom.bits[i] = dependency_set->bits[i];
}
link_executor(executor);
}
Expand All @@ -1111,7 +1128,7 @@ void
_Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj)
{
assert(executor->vm_data.valid = true);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not the issue, but something I noticed while combing over this:

Suggested change
assert(executor->vm_data.valid = true);
assert(executor->vm_data.valid == true);

add_to_bloom_filter(&executor->vm_data.bloom, obj);
_Py_BloomFilter_Add(&executor->vm_data.bloom, obj);
}

/* Invalidate all executors that depend on `obj`
Expand All @@ -1120,23 +1137,23 @@ _Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj)
void
_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj)
{
_PyBloomFilter hashes;
address_to_hashes(obj, &hashes);
_PyBloomFilter obj_filter;
_Py_BloomFilter_Init(&obj_filter);
_Py_BloomFilter_Add(&obj_filter, obj);
/* Walk the list of executors */
/* TO DO -- Use a tree to avoid traversing as many objects */
for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
assert(exec->vm_data.valid);
_PyExecutorObject *next = exec->vm_data.links.next;
if (bloom_filter_may_contain(&exec->vm_data.bloom, &hashes)) {
if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter)) {
exec->vm_data.valid = false;
unlink_executor(exec);
}
exec = next;
}
}

/* Invalidate all executors
*/
/* Invalidate all executors */
void
_Py_Executors_InvalidateAll(PyInterpreterState *interp)
{
Expand Down
0