From eb5ec7411fa6e662450ef7a6f79d7de4722951d9 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Tue, 25 Jan 2022 12:23:30 -0800 Subject: [PATCH 1/9] Replace ROT*/DUP* with SWAP/COPY --- Doc/library/dis.rst | 54 ++------ Include/opcode.h | 117 +++++++++-------- Lib/importlib/_bootstrap_external.py | 4 +- Lib/opcode.py | 7 +- Lib/test/test__opcode.py | 1 - Lib/test/test_dis.py | 4 +- Lib/test/test_peepholer.py | 4 +- Python/ceval.c | 68 ++-------- Python/compile.c | 182 +++++++++++---------------- Python/opcode_targets.h | 72 +++++------ 10 files changed, 190 insertions(+), 323 deletions(-) diff --git a/Doc/library/dis.rst b/Doc/library/dis.rst index af28e5c1159342..5b97ee31edf5e8 100644 --- a/Doc/library/dis.rst +++ b/Doc/library/dis.rst @@ -326,7 +326,8 @@ The Python compiler currently generates the following bytecode instructions. .. opcode:: NOP - Do nothing code. Used as a placeholder by the bytecode optimizer. + Do nothing code. Used as a placeholder by the bytecode optimizer, and to + generate line tracing events. .. opcode:: POP_TOP @@ -334,38 +335,19 @@ The Python compiler currently generates the following bytecode instructions. Removes the top-of-stack (TOS) item. -.. opcode:: ROT_TWO - - Swaps the two top-most stack items. - - -.. opcode:: ROT_THREE - - Lifts second and third stack item one position up, moves top down to position - three. - - -.. opcode:: ROT_FOUR - - Lifts second, third and fourth stack items one position up, moves top down - to position four. - - .. versionadded:: 3.8 - - -.. opcode:: DUP_TOP +.. opcode:: COPY (i) - Duplicates the reference on top of the stack. + Push the *i*-th item to the top of the stack. The item is not removed from its + original location. - .. versionadded:: 3.2 + .. versionadded:: 3.11 -.. opcode:: DUP_TOP_TWO +.. opcode:: SWAP (i) - Duplicates the two references on top of the stack, leaving them in the - same order. + Swap TOS with the item at position *i*. - .. versionadded:: 3.2 + .. versionadded:: 3.11 **Unary operations** @@ -669,8 +651,6 @@ iterations of the loop. success (``True``) or failure (``False``). -All of the following opcodes use their arguments. - .. opcode:: STORE_NAME (namei) Implements ``name = TOS``. *namei* is the index of *name* in the attribute @@ -1197,22 +1177,6 @@ All of the following opcodes use their arguments. success (``True``) or failure (``False``). -.. opcode:: ROT_N (count) - - Lift the top *count* stack items one position up, and move TOS down to - position *count*. - - .. versionadded:: 3.10 - - -.. opcode:: COPY (i) - - Push the *i*-th item to the top of the stack. The item is not removed from its - original location. - - .. versionadded:: 3.11 - - .. opcode:: RESUME (where) A no-op. Performs internal tracing, debugging and optimization checks. 
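The semantics of the two new instructions documented above are easy to model in pure Python (an illustrative sketch, not part of the patch: a list stands in for the value stack, index -1 is TOS, and arguments count 1-indexed from the top, as in the opcodes themselves)::

    def copy(stack, i):
        """COPY i: push a new reference to the i-th item from the top."""
        stack.append(stack[-i])

    def swap(stack, i):
        """SWAP i: exchange TOS with the i-th item from the top."""
        stack[-1], stack[-i] = stack[-i], stack[-1]

    stack = ["z", "y", "x"]   # "x" is TOS
    copy(stack, 3)            # push another reference to "z"
    assert stack == ["z", "y", "x", "z"]
    swap(stack, 3)            # exchange TOS with the third item from the top
    assert stack == ["z", "z", "x", "y"]

On an interpreter built with this change, ``dis.dis("a, b = a, b")`` shows the new instruction directly: the peephole optimizer turns the pack/unpack pair into a single ``SWAP 2`` (see the ``test_peepholer`` update below).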
diff --git a/Include/opcode.h b/Include/opcode.h index c0686bd2249ce9..09c3e310ad752d 100644 --- a/Include/opcode.h +++ b/Include/opcode.h @@ -8,11 +8,6 @@ extern "C" { /* Instruction opcodes for compiled code */ #define POP_TOP 1 -#define ROT_TWO 2 -#define ROT_THREE 3 -#define DUP_TOP 4 -#define DUP_TOP_TWO 5 -#define ROT_FOUR 6 #define NOP 9 #define UNARY_POSITIVE 10 #define UNARY_NEGATIVE 11 @@ -56,7 +51,7 @@ extern "C" { #define DELETE_ATTR 96 #define STORE_GLOBAL 97 #define DELETE_GLOBAL 98 -#define ROT_N 99 +#define SWAP 99 #define LOAD_CONST 100 #define LOAD_NAME 101 #define BUILD_TUPLE 102 @@ -116,61 +111,61 @@ extern "C" { #define PRECALL_METHOD 168 #define CALL_NO_KW 169 #define CALL_KW 170 -#define BINARY_OP_ADAPTIVE 7 -#define BINARY_OP_ADD_INT 8 -#define BINARY_OP_ADD_FLOAT 13 -#define BINARY_OP_ADD_UNICODE 14 -#define BINARY_OP_INPLACE_ADD_UNICODE 16 -#define BINARY_OP_MULTIPLY_INT 17 -#define BINARY_OP_MULTIPLY_FLOAT 18 -#define BINARY_OP_SUBTRACT_INT 19 -#define BINARY_OP_SUBTRACT_FLOAT 20 -#define COMPARE_OP_ADAPTIVE 21 -#define COMPARE_OP_FLOAT_JUMP 22 -#define COMPARE_OP_INT_JUMP 23 -#define COMPARE_OP_STR_JUMP 24 -#define BINARY_SUBSCR_ADAPTIVE 26 -#define BINARY_SUBSCR_GETITEM 27 -#define BINARY_SUBSCR_LIST_INT 28 -#define BINARY_SUBSCR_TUPLE_INT 29 -#define BINARY_SUBSCR_DICT 34 -#define STORE_SUBSCR_ADAPTIVE 36 -#define STORE_SUBSCR_LIST_INT 37 -#define STORE_SUBSCR_DICT 38 -#define CALL_NO_KW_ADAPTIVE 39 -#define CALL_NO_KW_BUILTIN_O 40 -#define CALL_NO_KW_BUILTIN_FAST 41 -#define CALL_NO_KW_LEN 42 -#define CALL_NO_KW_ISINSTANCE 43 -#define CALL_NO_KW_PY_SIMPLE 44 -#define CALL_NO_KW_LIST_APPEND 45 -#define CALL_NO_KW_METHOD_DESCRIPTOR_O 46 -#define CALL_NO_KW_TYPE_1 47 -#define CALL_NO_KW_BUILTIN_CLASS_1 48 -#define CALL_NO_KW_METHOD_DESCRIPTOR_FAST 55 -#define JUMP_ABSOLUTE_QUICK 56 -#define LOAD_ATTR_ADAPTIVE 57 -#define LOAD_ATTR_INSTANCE_VALUE 58 -#define LOAD_ATTR_WITH_HINT 59 -#define LOAD_ATTR_SLOT 62 -#define LOAD_ATTR_MODULE 63 -#define LOAD_GLOBAL_ADAPTIVE 64 -#define LOAD_GLOBAL_MODULE 65 -#define LOAD_GLOBAL_BUILTIN 66 -#define LOAD_METHOD_ADAPTIVE 67 -#define LOAD_METHOD_CACHED 72 -#define LOAD_METHOD_CLASS 76 -#define LOAD_METHOD_MODULE 77 -#define LOAD_METHOD_NO_DICT 78 -#define STORE_ATTR_ADAPTIVE 79 -#define STORE_ATTR_INSTANCE_VALUE 80 -#define STORE_ATTR_SLOT 81 -#define STORE_ATTR_WITH_HINT 87 -#define LOAD_FAST__LOAD_FAST 131 -#define STORE_FAST__LOAD_FAST 140 -#define LOAD_FAST__LOAD_CONST 141 -#define LOAD_CONST__LOAD_FAST 143 -#define STORE_FAST__STORE_FAST 150 +#define BINARY_OP_ADAPTIVE 2 +#define BINARY_OP_ADD_INT 3 +#define BINARY_OP_ADD_FLOAT 4 +#define BINARY_OP_ADD_UNICODE 5 +#define BINARY_OP_INPLACE_ADD_UNICODE 6 +#define BINARY_OP_MULTIPLY_INT 7 +#define BINARY_OP_MULTIPLY_FLOAT 8 +#define BINARY_OP_SUBTRACT_INT 13 +#define BINARY_OP_SUBTRACT_FLOAT 14 +#define COMPARE_OP_ADAPTIVE 16 +#define COMPARE_OP_FLOAT_JUMP 17 +#define COMPARE_OP_INT_JUMP 18 +#define COMPARE_OP_STR_JUMP 19 +#define BINARY_SUBSCR_ADAPTIVE 20 +#define BINARY_SUBSCR_GETITEM 21 +#define BINARY_SUBSCR_LIST_INT 22 +#define BINARY_SUBSCR_TUPLE_INT 23 +#define BINARY_SUBSCR_DICT 24 +#define STORE_SUBSCR_ADAPTIVE 26 +#define STORE_SUBSCR_LIST_INT 27 +#define STORE_SUBSCR_DICT 28 +#define CALL_NO_KW_ADAPTIVE 29 +#define CALL_NO_KW_BUILTIN_O 34 +#define CALL_NO_KW_BUILTIN_FAST 36 +#define CALL_NO_KW_LEN 37 +#define CALL_NO_KW_ISINSTANCE 38 +#define CALL_NO_KW_PY_SIMPLE 39 +#define CALL_NO_KW_LIST_APPEND 40 +#define CALL_NO_KW_METHOD_DESCRIPTOR_O 41 +#define 
CALL_NO_KW_TYPE_1 42 +#define CALL_NO_KW_BUILTIN_CLASS_1 43 +#define CALL_NO_KW_METHOD_DESCRIPTOR_FAST 44 +#define JUMP_ABSOLUTE_QUICK 45 +#define LOAD_ATTR_ADAPTIVE 46 +#define LOAD_ATTR_INSTANCE_VALUE 47 +#define LOAD_ATTR_WITH_HINT 48 +#define LOAD_ATTR_SLOT 55 +#define LOAD_ATTR_MODULE 56 +#define LOAD_GLOBAL_ADAPTIVE 57 +#define LOAD_GLOBAL_MODULE 58 +#define LOAD_GLOBAL_BUILTIN 59 +#define LOAD_METHOD_ADAPTIVE 62 +#define LOAD_METHOD_CACHED 63 +#define LOAD_METHOD_CLASS 64 +#define LOAD_METHOD_MODULE 65 +#define LOAD_METHOD_NO_DICT 66 +#define STORE_ATTR_ADAPTIVE 67 +#define STORE_ATTR_INSTANCE_VALUE 72 +#define STORE_ATTR_SLOT 76 +#define STORE_ATTR_WITH_HINT 77 +#define LOAD_FAST__LOAD_FAST 78 +#define STORE_FAST__LOAD_FAST 79 +#define LOAD_FAST__LOAD_CONST 80 +#define LOAD_CONST__LOAD_FAST 81 +#define STORE_FAST__STORE_FAST 87 #define DO_TRACING 255 #ifdef NEED_OPCODE_JUMP_TABLES static uint32_t _PyOpcode_RelativeJump[8] = { diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py index 1560e60dbb9252..bb6ed7c8705b30 100644 --- a/Lib/importlib/_bootstrap_external.py +++ b/Lib/importlib/_bootstrap_external.py @@ -381,6 +381,8 @@ def _write_atomic(path, data, mode=0o666): # Python 3.11a4 3473 (Add POP_JUMP_IF_NOT_NONE/POP_JUMP_IF_NONE opcodes) # Python 3.11a4 3474 (Add RESUME opcode) # Python 3.11a5 3475 (Add RETURN_GENERATOR opcode) +# Python 3.11a5 3476 (Replace DUP_TOP/DUP_TOP_TWO with COPY and +# ROT_TWO/ROT_THREE/ROT_FOUR/ROT_N with SWAP) # Python 3.12 will start with magic number 3500 @@ -394,7 +396,7 @@ def _write_atomic(path, data, mode=0o666): # Whenever MAGIC_NUMBER is changed, the ranges in the magic_values array # in PC/launcher.c must also be updated. -MAGIC_NUMBER = (3475).to_bytes(2, 'little') + b'\r\n' +MAGIC_NUMBER = (3476).to_bytes(2, 'little') + b'\r\n' _RAW_MAGIC_NUMBER = int.from_bytes(MAGIC_NUMBER, 'little') # For import.c _PYCACHE = '__pycache__' diff --git a/Lib/opcode.py b/Lib/opcode.py index 73b41d22df2fc5..e090c16b26abe5 100644 --- a/Lib/opcode.py +++ b/Lib/opcode.py @@ -55,11 +55,6 @@ def jabs_op(name, op): # Blank lines correspond to available opcodes def_op('POP_TOP', 1) -def_op('ROT_TWO', 2) -def_op('ROT_THREE', 3) -def_op('DUP_TOP', 4) -def_op('DUP_TOP_TWO', 5) -def_op('ROT_FOUR', 6) def_op('NOP', 9) def_op('UNARY_POSITIVE', 10) @@ -116,7 +111,7 @@ def jabs_op(name, op): name_op('DELETE_ATTR', 96) # "" name_op('STORE_GLOBAL', 97) # "" name_op('DELETE_GLOBAL', 98) # "" -def_op('ROT_N', 99) +def_op('SWAP', 99) def_op('LOAD_CONST', 100) # Index in const list hasconst.append(100) name_op('LOAD_NAME', 101) # Index in name list diff --git a/Lib/test/test__opcode.py b/Lib/test/test__opcode.py index f6b6b3d3532bd0..7c1c0cfdb069b8 100644 --- a/Lib/test/test__opcode.py +++ b/Lib/test/test__opcode.py @@ -11,7 +11,6 @@ class OpcodeTests(unittest.TestCase): def test_stack_effect(self): self.assertEqual(stack_effect(dis.opmap['POP_TOP']), -1) - self.assertEqual(stack_effect(dis.opmap['DUP_TOP_TWO']), 2) self.assertEqual(stack_effect(dis.opmap['BUILD_SLICE'], 0), -1) self.assertEqual(stack_effect(dis.opmap['BUILD_SLICE'], 1), -1) self.assertEqual(stack_effect(dis.opmap['BUILD_SLICE'], 3), -2) diff --git a/Lib/test/test_dis.py b/Lib/test/test_dis.py index 19a4be2c4132b1..e8e8a3d6244cfe 100644 --- a/Lib/test/test_dis.py +++ b/Lib/test/test_dis.py @@ -1195,8 +1195,8 @@ def _prepare_test_cases(): Instruction(opname='CALL_NO_KW', opcode=169, arg=1, argval=1, argrepr='', offset=156, starts_line=None, is_jump_target=False, positions=None), 
Instruction(opname='POP_TOP', opcode=1, arg=None, argval=None, argrepr='', offset=158, starts_line=None, is_jump_target=False, positions=None), Instruction(opname='LOAD_CONST', opcode=100, arg=0, argval=None, argrepr='None', offset=160, starts_line=25, is_jump_target=False, positions=None), - Instruction(opname='DUP_TOP', opcode=4, arg=None, argval=None, argrepr='', offset=162, starts_line=None, is_jump_target=False, positions=None), - Instruction(opname='DUP_TOP', opcode=4, arg=None, argval=None, argrepr='', offset=164, starts_line=None, is_jump_target=False, positions=None), + Instruction(opname='LOAD_CONST', opcode=100, arg=0, argval=None, argrepr='None', offset=162, starts_line=None, is_jump_target=False, positions=None), + Instruction(opname='LOAD_CONST', opcode=100, arg=0, argval=None, argrepr='None', offset=164, starts_line=None, is_jump_target=False, positions=None), Instruction(opname='CALL_NO_KW', opcode=169, arg=3, argval=3, argrepr='', offset=166, starts_line=None, is_jump_target=False, positions=None), Instruction(opname='POP_TOP', opcode=1, arg=None, argval=None, argrepr='', offset=168, starts_line=None, is_jump_target=False, positions=None), Instruction(opname='JUMP_FORWARD', opcode=110, arg=25, argval=222, argrepr='to 222', offset=170, starts_line=None, is_jump_target=False, positions=None), diff --git a/Lib/test/test_peepholer.py b/Lib/test/test_peepholer.py index 8306c896a57f4e..659f654b5c6769 100644 --- a/Lib/test/test_peepholer.py +++ b/Lib/test/test_peepholer.py @@ -119,8 +119,8 @@ def f(): def test_pack_unpack(self): for line, elem in ( ('a, = a,', 'LOAD_CONST',), - ('a, b = a, b', 'ROT_TWO',), - ('a, b, c = a, b, c', 'ROT_THREE',), + ('a, b = a, b', 'SWAP',), + ('a, b, c = a, b, c', 'SWAP',), ): with self.subTest(line=line): code = compile(line,'','single') diff --git a/Python/ceval.c b/Python/ceval.c index 9aaddd99edacf7..a65f5280117a23 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1434,8 +1434,6 @@ eval_frame_handle_pending(PyThreadState *tstate) #define PEEK(n) (stack_pointer[-(n)]) #define SET_TOP(v) (stack_pointer[-1] = (v)) #define SET_SECOND(v) (stack_pointer[-2] = (v)) -#define SET_THIRD(v) (stack_pointer[-3] = (v)) -#define SET_FOURTH(v) (stack_pointer[-4] = (v)) #define BASIC_STACKADJ(n) (stack_pointer += n) #define BASIC_PUSH(v) (*stack_pointer++ = (v)) #define BASIC_POP() (*--stack_pointer) @@ -1920,54 +1918,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, InterpreterFrame *frame, int thr DISPATCH(); } - TARGET(ROT_TWO) { - PyObject *top = TOP(); - PyObject *second = SECOND(); - SET_TOP(second); - SET_SECOND(top); - DISPATCH(); - } - - TARGET(ROT_THREE) { - PyObject *top = TOP(); - PyObject *second = SECOND(); - PyObject *third = THIRD(); - SET_TOP(second); - SET_SECOND(third); - SET_THIRD(top); - DISPATCH(); - } - - TARGET(ROT_FOUR) { - PyObject *top = TOP(); - PyObject *second = SECOND(); - PyObject *third = THIRD(); - PyObject *fourth = FOURTH(); - SET_TOP(second); - SET_SECOND(third); - SET_THIRD(fourth); - SET_FOURTH(top); - DISPATCH(); - } - - TARGET(DUP_TOP) { - PyObject *top = TOP(); - Py_INCREF(top); - PUSH(top); - DISPATCH(); - } - - TARGET(DUP_TOP_TWO) { - PyObject *top = TOP(); - PyObject *second = SECOND(); - Py_INCREF(top); - Py_INCREF(second); - STACK_GROW(2); - SET_TOP(top); - SET_SECOND(second); - DISPATCH(); - } - TARGET(UNARY_POSITIVE) { PyObject *value = TOP(); PyObject *res = PyNumber_Positive(value); @@ -5165,14 +5115,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, InterpreterFrame *frame, int thr DISPATCH(); } - 
TARGET(ROT_N) { - PyObject *top = TOP(); - memmove(&PEEK(oparg - 1), &PEEK(oparg), - sizeof(PyObject*) * (oparg - 1)); - PEEK(oparg) = top; - DISPATCH(); - } - TARGET(COPY) { assert(oparg != 0); PyObject *peek = PEEK(oparg); @@ -5216,6 +5158,14 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, InterpreterFrame *frame, int thr } } + TARGET(SWAP) { + assert(oparg != 0); + PyObject *top = TOP(); + SET_TOP(PEEK(oparg)); + PEEK(oparg) = top; + DISPATCH(); + } + TARGET(EXTENDED_ARG) { int oldoparg = oparg; NEXTOPARG(); @@ -7375,7 +7325,7 @@ format_awaitable_error(PyThreadState *tstate, PyTypeObject *type, int prevprevop "that does not implement __await__: %.100s", type->tp_name); } - else if (prevopcode == WITH_EXCEPT_START || (prevopcode == CALL_NO_KW && prevprevopcode == DUP_TOP)) { + else if (prevopcode == WITH_EXCEPT_START || (prevopcode == CALL_NO_KW && prevprevopcode == LOAD_CONST)) { _PyErr_Format(tstate, PyExc_TypeError, "'async with' received an object from __aexit__ " "that does not implement __await__: %.100s", diff --git a/Python/compile.c b/Python/compile.c index 5d32959db3b65b..04749c73978b65 100644 --- a/Python/compile.c +++ b/Python/compile.c @@ -867,14 +867,8 @@ stack_effect(int opcode, int oparg, int jump) /* Stack manipulation */ case POP_TOP: return -1; - case ROT_TWO: - case ROT_THREE: - case ROT_FOUR: + case SWAP: return 0; - case DUP_TOP: - return 1; - case DUP_TOP_TWO: - return 2; /* Unary operators */ case UNARY_POSITIVE: @@ -1093,8 +1087,6 @@ stack_effect(int opcode, int oparg, int jump) case MATCH_SEQUENCE: case MATCH_KEYS: return 1; - case ROT_N: - return 0; case COPY: return 1; case BINARY_OP: @@ -1825,8 +1817,8 @@ compiler_pop_fblock(struct compiler *c, enum fblocktype t, basicblock *b) static int compiler_call_exit_with_nones(struct compiler *c) { ADDOP_LOAD_CONST(c, Py_None); - ADDOP(c, DUP_TOP); - ADDOP(c, DUP_TOP); + ADDOP_LOAD_CONST(c, Py_None); + ADDOP_LOAD_CONST(c, Py_None); ADDOP_I(c, CALL_NO_KW, 3); return 1; } @@ -1885,7 +1877,7 @@ compiler_unwind_fblock(struct compiler *c, struct fblockinfo *info, case FOR_LOOP: /* Pop the iterator */ if (preserve_tos) { - ADDOP(c, ROT_TWO); + ADDOP_I(c, SWAP, 2); } ADDOP(c, POP_TOP); return 1; @@ -1915,11 +1907,11 @@ compiler_unwind_fblock(struct compiler *c, struct fblockinfo *info, case FINALLY_END: if (preserve_tos) { - ADDOP(c, ROT_TWO); + ADDOP_I(c, SWAP, 2); } ADDOP(c, POP_TOP); /* exc_value */ if (preserve_tos) { - ADDOP(c, ROT_TWO); + ADDOP_I(c, SWAP, 2); } ADDOP(c, POP_BLOCK); ADDOP(c, POP_EXCEPT); @@ -1930,7 +1922,7 @@ compiler_unwind_fblock(struct compiler *c, struct fblockinfo *info, SET_LOC(c, (stmt_ty)info->fb_datum); ADDOP(c, POP_BLOCK); if (preserve_tos) { - ADDOP(c, ROT_TWO); + ADDOP_I(c, SWAP, 2); } if(!compiler_call_exit_with_nones(c)) { return 0; @@ -1952,7 +1944,7 @@ compiler_unwind_fblock(struct compiler *c, struct fblockinfo *info, ADDOP(c, POP_BLOCK); } if (preserve_tos) { - ADDOP(c, ROT_TWO); + ADDOP_I(c, SWAP, 2); } ADDOP(c, POP_BLOCK); ADDOP(c, POP_EXCEPT); @@ -1965,7 +1957,7 @@ compiler_unwind_fblock(struct compiler *c, struct fblockinfo *info, case POP_VALUE: if (preserve_tos) { - ADDOP(c, ROT_TWO); + ADDOP_I(c, SWAP, 2); } ADDOP(c, POP_TOP); return 1; @@ -2642,7 +2634,7 @@ compiler_class(struct compiler *c, stmt_ty s) assert(i == 0); ADDOP_I(c, LOAD_CLOSURE, i); - ADDOP(c, DUP_TOP); + ADDOP_I(c, COPY, 1); str = PyUnicode_InternFromString("__classcell__"); if (!str || !compiler_nameop(c, str, Store)) { Py_XDECREF(str); @@ -2838,8 +2830,8 @@ compiler_jump_if(struct compiler *c, expr_ty e, 
basicblock *next, int cond) for (i = 0; i < n; i++) { VISIT(c, expr, (expr_ty)asdl_seq_GET(e->v.Compare.comparators, i)); - ADDOP(c, DUP_TOP); - ADDOP(c, ROT_THREE); + ADDOP_I(c, SWAP, 2); + ADDOP_I(c, COPY, 2); ADDOP_COMPARE(c, asdl_seq_GET(e->v.Compare.ops, i)); ADDOP_JUMP(c, POP_JUMP_IF_FALSE, cleanup); NEXT_BLOCK(c); @@ -3495,9 +3487,9 @@ compiler_try_except(struct compiler *c, stmt_ty s) [] POP_BLOCK [] JUMP_FORWARD L0 - [exc] L1: DUP_TOP ) save copy of the original exception + [exc] L1: COPY 1 ) save copy of the original exception [orig, exc] BUILD_LIST ) list for raised/reraised excs ("result") - [orig, exc, res] ROT_TWO + [orig, exc, res] SWAP 2 [orig, res, exc] [orig, res, exc, E1] JUMP_IF_NOT_EG_MATCH L2 @@ -3517,12 +3509,12 @@ compiler_try_except(struct compiler *c, stmt_ty s) [orig, res, rest] Ln+1: LIST_APPEND 1 ) add unhandled exc to res (could be None) [orig, res] PREP_RERAISE_STAR - [exc] DUP_TOP + [exc] COPY 1 [exc, exc] POP_JUMP_IF_NOT_NONE RER [exc] POP_TOP [] JUMP_FORWARD L0 - [exc] RER: ROT_TWO + [exc] RER: SWAP 2 [exc, prev_exc_info] POP_EXCEPT [exc] RERAISE 0 @@ -3587,19 +3579,19 @@ compiler_try_star_except(struct compiler *c, stmt_ty s) if (i == 0) { /* Push the original EG into the stack */ /* - [exc] DUP_TOP + [exc] COPY 1 [orig, exc] */ - ADDOP(c, DUP_TOP); + ADDOP_I(c, COPY, 1); /* create empty list for exceptions raised/reraise in the except* blocks */ /* [orig, exc] BUILD_LIST - [orig, exc, []] ROT_TWO + [orig, exc, []] SWAP 2 [orig, [], exc] */ ADDOP_I(c, BUILD_LIST, 0); - ADDOP(c, ROT_TWO); + ADDOP_I(c, SWAP, 2); } if (handler->v.ExceptHandler.type) { VISIT(c, expr, handler->v.ExceptHandler.type); @@ -3687,7 +3679,7 @@ compiler_try_star_except(struct compiler *c, stmt_ty s) compiler_use_next_block(c, reraise_star); ADDOP(c, PREP_RERAISE_STAR); - ADDOP(c, DUP_TOP); + ADDOP_I(c, COPY, 1); ADDOP_JUMP(c, POP_JUMP_IF_NOT_NONE, reraise); NEXT_BLOCK(c); @@ -3698,7 +3690,7 @@ compiler_try_star_except(struct compiler *c, stmt_ty s) ADDOP_JUMP(c, JUMP_FORWARD, end); compiler_use_next_block(c, reraise); ADDOP(c, POP_BLOCK); - ADDOP(c, ROT_TWO); + ADDOP_I(c, SWAP, 2); ADDOP(c, POP_EXCEPT); ADDOP_I(c, RERAISE, 0); compiler_use_next_block(c, cleanup); @@ -3756,7 +3748,7 @@ compiler_import_as(struct compiler *c, identifier name, identifier asname) if (dot == -1) { break; } - ADDOP(c, ROT_TWO); + ADDOP_I(c, SWAP, 2); ADDOP(c, POP_TOP); } if (!compiler_nameop(c, asname, Store)) { @@ -3956,8 +3948,9 @@ compiler_visit_stmt(struct compiler *c, stmt_ty s) n = asdl_seq_LEN(s->v.Assign.targets); VISIT(c, expr, s->v.Assign.value); for (i = 0; i < n; i++) { - if (i < n - 1) - ADDOP(c, DUP_TOP); + if (i < n - 1) { + ADDOP_I(c, COPY, 1); + } VISIT(c, expr, (expr_ty)asdl_seq_GET(s->v.Assign.targets, i)); } @@ -4516,8 +4509,8 @@ compiler_compare(struct compiler *c, expr_ty e) for (i = 0; i < n; i++) { VISIT(c, expr, (expr_ty)asdl_seq_GET(e->v.Compare.comparators, i)); - ADDOP(c, DUP_TOP); - ADDOP(c, ROT_THREE); + ADDOP_I(c, SWAP, 2); + ADDOP_I(c, COPY, 2); ADDOP_COMPARE(c, asdl_seq_GET(e->v.Compare.ops, i)); ADDOP_JUMP(c, JUMP_IF_FALSE_OR_POP, cleanup); NEXT_BLOCK(c); @@ -4529,7 +4522,7 @@ compiler_compare(struct compiler *c, expr_ty e) return 0; ADDOP_JUMP_NOLINE(c, JUMP_FORWARD, end); compiler_use_next_block(c, cleanup); - ADDOP(c, ROT_TWO); + ADDOP_I(c, SWAP, 2); ADDOP(c, POP_TOP); compiler_use_next_block(c, end); } @@ -5675,7 +5668,7 @@ compiler_visit_expr1(struct compiler *c, expr_ty e) switch (e->kind) { case NamedExpr_kind: VISIT(c, expr, e->v.NamedExpr.value); - ADDOP(c, 
DUP_TOP); + ADDOP_I(c, COPY, 1); VISIT(c, expr, e->v.NamedExpr.target); break; case BoolOp_kind: @@ -5841,7 +5834,7 @@ compiler_augassign(struct compiler *c, stmt_ty s) switch (e->kind) { case Attribute_kind: VISIT(c, expr, e->v.Attribute.value); - ADDOP(c, DUP_TOP); + ADDOP_I(c, COPY, 1); int old_lineno = c->u->u_lineno; c->u->u_lineno = e->end_lineno; ADDOP_NAME(c, LOAD_ATTR, e->v.Attribute.attr, names); @@ -5850,7 +5843,8 @@ compiler_augassign(struct compiler *c, stmt_ty s) case Subscript_kind: VISIT(c, expr, e->v.Subscript.value); VISIT(c, expr, e->v.Subscript.slice); - ADDOP(c, DUP_TOP_TWO); + ADDOP_I(c, COPY, 2); + ADDOP_I(c, COPY, 2); ADDOP(c, BINARY_SUBSCR); break; case Name_kind: @@ -5877,11 +5871,12 @@ compiler_augassign(struct compiler *c, stmt_ty s) switch (e->kind) { case Attribute_kind: c->u->u_lineno = e->end_lineno; - ADDOP(c, ROT_TWO); + ADDOP_I(c, SWAP, 2); ADDOP_NAME(c, STORE_ATTR, e->v.Attribute.attr, names); break; case Subscript_kind: - ADDOP(c, ROT_THREE); + ADDOP_I(c, SWAP, 3); + ADDOP_I(c, SWAP, 2); ADDOP(c, STORE_SUBSCR); break; case Name_kind: @@ -6233,6 +6228,15 @@ compiler_error_duplicate_store(struct compiler *c, identifier n) return compiler_error(c, "multiple assignments to name %R in pattern", n); } +static int +pattern_helper_rotate(struct compiler *c, int count) +{ + while (1 < count) { + ADDOP_I(c, SWAP, count--); + } + return 1; +} + static int pattern_helper_store_name(struct compiler *c, identifier n, pattern_context *pc) { @@ -6252,7 +6256,8 @@ pattern_helper_store_name(struct compiler *c, identifier n, pattern_context *pc) return compiler_error_duplicate_store(c, n); } // Rotate this object underneath any items we need to preserve: - ADDOP_I(c, ROT_N, pc->on_top + PyList_GET_SIZE(pc->stores) + 1); + Py_ssize_t rotations = pc->on_top + PyList_GET_SIZE(pc->stores) + 1; + RETURN_IF_FALSE(pattern_helper_rotate(c, rotations)); return !PyList_Append(pc->stores, n); } @@ -6321,7 +6326,7 @@ pattern_helper_sequence_subscr(struct compiler *c, asdl_pattern_seq *patterns, assert(WILDCARD_STAR_CHECK(pattern)); continue; } - ADDOP(c, DUP_TOP); + ADDOP_I(c, COPY, 1); if (i < star) { ADDOP_LOAD_CONST_NEW(c, PyLong_FromSsize_t(i)); } @@ -6370,7 +6375,7 @@ compiler_pattern_as(struct compiler *c, pattern_ty p, pattern_context *pc) } // Need to make a copy for (possibly) storing later: pc->on_top++; - ADDOP(c, DUP_TOP); + ADDOP_I(c, COPY, 1); RETURN_IF_FALSE(compiler_pattern(c, p->v.MatchAs.pattern, pc)); // Success! 
Store it: pc->on_top--; @@ -6445,7 +6450,7 @@ compiler_pattern_class(struct compiler *c, pattern_ty p, pattern_context *pc) } ADDOP_LOAD_CONST_NEW(c, attr_names); ADDOP_I(c, MATCH_CLASS, nargs); - ADDOP(c, DUP_TOP); + ADDOP_I(c, COPY, 1); ADDOP_LOAD_CONST(c, Py_None); ADDOP_I(c, IS_OP, 1); // TOS is now a tuple of (nargs + nattrs) attributes (or None): @@ -6563,7 +6568,7 @@ compiler_pattern_mapping(struct compiler *c, pattern_ty p, pattern_context *pc) ADDOP(c, MATCH_KEYS); // There's now a tuple of keys and a tuple of values on top of the subject: pc->on_top += 2; - ADDOP(c, DUP_TOP); + ADDOP_I(c, COPY, 1); ADDOP_LOAD_CONST(c, Py_None); ADDOP_I(c, IS_OP, 1); RETURN_IF_FALSE(jump_to_fail_pop(c, pc, POP_JUMP_IF_FALSE)); @@ -6587,13 +6592,12 @@ compiler_pattern_mapping(struct compiler *c, pattern_ty p, pattern_context *pc) // for key in TOS: // del rest[key] ADDOP_I(c, BUILD_MAP, 0); // [subject, keys, empty] - ADDOP(c, ROT_THREE); // [empty, subject, keys] - ADDOP(c, ROT_TWO); // [empty, keys, subject] + ADDOP_I(c, SWAP, 3); // [empty, keys, subject] ADDOP_I(c, DICT_UPDATE, 2); // [copy, keys] ADDOP_I(c, UNPACK_SEQUENCE, size); // [copy, keys...] while (size) { ADDOP_I(c, COPY, 1 + size--); // [copy, keys..., copy] - ADDOP(c, ROT_TWO); // [copy, keys..., copy, key] + ADDOP_I(c, SWAP, 2); // [copy, keys..., copy, key] ADDOP(c, DELETE_SUBSCR); // [copy, keys...] } RETURN_IF_FALSE(pattern_helper_store_name(c, star_target, pc)); @@ -6638,7 +6642,7 @@ compiler_pattern_or(struct compiler *c, pattern_ty p, pattern_context *pc) pc->fail_pop = NULL; pc->fail_pop_size = 0; pc->on_top = 0; - if (!compiler_addop(c, DUP_TOP) || !compiler_pattern(c, alt, pc)) { + if (!compiler_addop_i(c, COPY, 1) || !compiler_pattern(c, alt, pc)) { goto error; } // Success! @@ -6689,9 +6693,9 @@ compiler_pattern_or(struct compiler *c, pattern_ty p, pattern_context *pc) // rotated = pc_stores[:rotations] // del pc_stores[:rotations] // pc_stores[icontrol-istores:icontrol-istores] = rotated - // Do the same thing to the stack, using several ROT_Ns: + // Do the same thing to the stack, using several SWAPs: while (rotations--) { - if (!compiler_addop_i(c, ROT_N, icontrol + 1)) { + if (!pattern_helper_rotate(c, icontrol + 1)){ goto error; } } @@ -6717,7 +6721,7 @@ compiler_pattern_or(struct compiler *c, pattern_ty p, pattern_context *pc) } compiler_use_next_block(c, end); Py_ssize_t nstores = PyList_GET_SIZE(control); - // There's a bunch of stuff on the stack between any where the new stores + // There's a bunch of stuff on the stack between where the new stores // are and where they need to be: // - The other stores. // - A copy of the subject. 
@@ -6726,7 +6730,7 @@ compiler_pattern_or(struct compiler *c, pattern_ty p, pattern_context *pc) Py_ssize_t nrots = nstores + 1 + pc->on_top + PyList_GET_SIZE(pc->stores); for (Py_ssize_t i = 0; i < nstores; i++) { // Rotate this capture to its proper place on the stack: - if (!compiler_addop_i(c, ROT_N, nrots)) { + if (!pattern_helper_rotate(c, nrots)) { goto error; } // Update the list of previous stores with this new name, checking for @@ -6885,7 +6889,7 @@ compiler_match_inner(struct compiler *c, stmt_ty s, pattern_context *pc) SET_LOC(c, m->pattern); // Only copy the subject if we're *not* on the last case: if (i != cases - has_default - 1) { - ADDOP(c, DUP_TOP); + ADDOP_I(c, COPY, 1); } RETURN_IF_FALSE(pc->stores = PyList_New(0)); // Irrefutable cases must be either guarded, last, or both: @@ -8421,35 +8425,11 @@ fold_tuple_on_constants(struct compiler *c, } -// Eliminate n * ROT_N(n). -static void -fold_rotations(struct instr *inst, int n) +static int +swaptimize(struct instr *inst) { - for (int i = 0; i < n; i++) { - int rot; - switch (inst[i].i_opcode) { - case ROT_N: - rot = inst[i].i_oparg; - break; - case ROT_FOUR: - rot = 4; - break; - case ROT_THREE: - rot = 3; - break; - case ROT_TWO: - rot = 2; - break; - default: - return; - } - if (rot != n) { - return; - } - } - for (int i = 0; i < n; i++) { - inst[i].i_opcode = NOP; - } + // TODO + return 0; } // Attempt to eliminate jumps to jumps by updating inst to jump to @@ -8578,12 +8558,16 @@ optimize_basic_block(struct compiler *c, basicblock *bb, PyObject *consts) bb->b_instr[i+1].i_opcode = NOP; break; case 2: - inst->i_opcode = ROT_TWO; + inst->i_opcode = SWAP; + inst->i_oparg = 2; bb->b_instr[i+1].i_opcode = NOP; + i--; break; case 3: - inst->i_opcode = ROT_THREE; - bb->b_instr[i+1].i_opcode = ROT_TWO; + inst->i_opcode = SWAP; + inst->i_oparg = 3; + bb->b_instr[i+1].i_opcode = NOP; + i--; } break; } @@ -8691,30 +8675,8 @@ optimize_basic_block(struct compiler *c, basicblock *bb, PyObject *consts) i -= jump_thread(inst, target, FOR_ITER); } break; - case ROT_N: - switch (oparg) { - case 0: - case 1: - inst->i_opcode = NOP; - continue; - case 2: - inst->i_opcode = ROT_TWO; - break; - case 3: - inst->i_opcode = ROT_THREE; - break; - case 4: - inst->i_opcode = ROT_FOUR; - break; - } - if (i >= oparg - 1) { - fold_rotations(inst - oparg + 1, oparg); - } - break; - case COPY: - if (oparg == 1) { - inst->i_opcode = DUP_TOP; - } + case SWAP: + i += swaptimize(inst); break; default: /* All HAS_CONST opcodes should be handled with LOAD_CONST */ diff --git a/Python/opcode_targets.h b/Python/opcode_targets.h index 11ac0e975fdcd8..86ffdb05d80a28 100644 --- a/Python/opcode_targets.h +++ b/Python/opcode_targets.h @@ -1,45 +1,40 @@ static void *opcode_targets[256] = { &&_unknown_opcode, &&TARGET_POP_TOP, - &&TARGET_ROT_TWO, - &&TARGET_ROT_THREE, - &&TARGET_DUP_TOP, - &&TARGET_DUP_TOP_TWO, - &&TARGET_ROT_FOUR, &&TARGET_BINARY_OP_ADAPTIVE, &&TARGET_BINARY_OP_ADD_INT, - &&TARGET_NOP, - &&TARGET_UNARY_POSITIVE, - &&TARGET_UNARY_NEGATIVE, - &&TARGET_UNARY_NOT, &&TARGET_BINARY_OP_ADD_FLOAT, &&TARGET_BINARY_OP_ADD_UNICODE, - &&TARGET_UNARY_INVERT, &&TARGET_BINARY_OP_INPLACE_ADD_UNICODE, &&TARGET_BINARY_OP_MULTIPLY_INT, &&TARGET_BINARY_OP_MULTIPLY_FLOAT, + &&TARGET_NOP, + &&TARGET_UNARY_POSITIVE, + &&TARGET_UNARY_NEGATIVE, + &&TARGET_UNARY_NOT, &&TARGET_BINARY_OP_SUBTRACT_INT, &&TARGET_BINARY_OP_SUBTRACT_FLOAT, + &&TARGET_UNARY_INVERT, &&TARGET_COMPARE_OP_ADAPTIVE, &&TARGET_COMPARE_OP_FLOAT_JUMP, &&TARGET_COMPARE_OP_INT_JUMP, 
&&TARGET_COMPARE_OP_STR_JUMP, - &&TARGET_BINARY_SUBSCR, &&TARGET_BINARY_SUBSCR_ADAPTIVE, &&TARGET_BINARY_SUBSCR_GETITEM, &&TARGET_BINARY_SUBSCR_LIST_INT, &&TARGET_BINARY_SUBSCR_TUPLE_INT, - &&TARGET_GET_LEN, - &&TARGET_MATCH_MAPPING, - &&TARGET_MATCH_SEQUENCE, - &&TARGET_MATCH_KEYS, &&TARGET_BINARY_SUBSCR_DICT, - &&TARGET_PUSH_EXC_INFO, + &&TARGET_BINARY_SUBSCR, &&TARGET_STORE_SUBSCR_ADAPTIVE, &&TARGET_STORE_SUBSCR_LIST_INT, &&TARGET_STORE_SUBSCR_DICT, &&TARGET_CALL_NO_KW_ADAPTIVE, + &&TARGET_GET_LEN, + &&TARGET_MATCH_MAPPING, + &&TARGET_MATCH_SEQUENCE, + &&TARGET_MATCH_KEYS, &&TARGET_CALL_NO_KW_BUILTIN_O, + &&TARGET_PUSH_EXC_INFO, &&TARGET_CALL_NO_KW_BUILTIN_FAST, &&TARGET_CALL_NO_KW_LEN, &&TARGET_CALL_NO_KW_ISINSTANCE, @@ -48,45 +43,50 @@ static void *opcode_targets[256] = { &&TARGET_CALL_NO_KW_METHOD_DESCRIPTOR_O, &&TARGET_CALL_NO_KW_TYPE_1, &&TARGET_CALL_NO_KW_BUILTIN_CLASS_1, + &&TARGET_CALL_NO_KW_METHOD_DESCRIPTOR_FAST, + &&TARGET_JUMP_ABSOLUTE_QUICK, + &&TARGET_LOAD_ATTR_ADAPTIVE, + &&TARGET_LOAD_ATTR_INSTANCE_VALUE, + &&TARGET_LOAD_ATTR_WITH_HINT, &&TARGET_WITH_EXCEPT_START, &&TARGET_GET_AITER, &&TARGET_GET_ANEXT, &&TARGET_BEFORE_ASYNC_WITH, &&TARGET_BEFORE_WITH, &&TARGET_END_ASYNC_FOR, - &&TARGET_CALL_NO_KW_METHOD_DESCRIPTOR_FAST, - &&TARGET_JUMP_ABSOLUTE_QUICK, - &&TARGET_LOAD_ATTR_ADAPTIVE, - &&TARGET_LOAD_ATTR_INSTANCE_VALUE, - &&TARGET_LOAD_ATTR_WITH_HINT, - &&TARGET_STORE_SUBSCR, - &&TARGET_DELETE_SUBSCR, &&TARGET_LOAD_ATTR_SLOT, &&TARGET_LOAD_ATTR_MODULE, &&TARGET_LOAD_GLOBAL_ADAPTIVE, &&TARGET_LOAD_GLOBAL_MODULE, &&TARGET_LOAD_GLOBAL_BUILTIN, + &&TARGET_STORE_SUBSCR, + &&TARGET_DELETE_SUBSCR, &&TARGET_LOAD_METHOD_ADAPTIVE, + &&TARGET_LOAD_METHOD_CACHED, + &&TARGET_LOAD_METHOD_CLASS, + &&TARGET_LOAD_METHOD_MODULE, + &&TARGET_LOAD_METHOD_NO_DICT, + &&TARGET_STORE_ATTR_ADAPTIVE, &&TARGET_GET_ITER, &&TARGET_GET_YIELD_FROM_ITER, &&TARGET_PRINT_EXPR, &&TARGET_LOAD_BUILD_CLASS, - &&TARGET_LOAD_METHOD_CACHED, + &&TARGET_STORE_ATTR_INSTANCE_VALUE, &&TARGET_GET_AWAITABLE, &&TARGET_LOAD_ASSERTION_ERROR, &&TARGET_RETURN_GENERATOR, - &&TARGET_LOAD_METHOD_CLASS, - &&TARGET_LOAD_METHOD_MODULE, - &&TARGET_LOAD_METHOD_NO_DICT, - &&TARGET_STORE_ATTR_ADAPTIVE, - &&TARGET_STORE_ATTR_INSTANCE_VALUE, &&TARGET_STORE_ATTR_SLOT, + &&TARGET_STORE_ATTR_WITH_HINT, + &&TARGET_LOAD_FAST__LOAD_FAST, + &&TARGET_STORE_FAST__LOAD_FAST, + &&TARGET_LOAD_FAST__LOAD_CONST, + &&TARGET_LOAD_CONST__LOAD_FAST, &&TARGET_LIST_TO_TUPLE, &&TARGET_RETURN_VALUE, &&TARGET_IMPORT_STAR, &&TARGET_SETUP_ANNOTATIONS, &&TARGET_YIELD_VALUE, - &&TARGET_STORE_ATTR_WITH_HINT, + &&TARGET_STORE_FAST__STORE_FAST, &&TARGET_PREP_RERAISE_STAR, &&TARGET_POP_EXCEPT, &&TARGET_STORE_NAME, @@ -98,7 +98,7 @@ static void *opcode_targets[256] = { &&TARGET_DELETE_ATTR, &&TARGET_STORE_GLOBAL, &&TARGET_DELETE_GLOBAL, - &&TARGET_ROT_N, + &&TARGET_SWAP, &&TARGET_LOAD_CONST, &&TARGET_LOAD_NAME, &&TARGET_BUILD_TUPLE, @@ -130,7 +130,7 @@ static void *opcode_targets[256] = { &&TARGET_POP_JUMP_IF_NOT_NONE, &&TARGET_POP_JUMP_IF_NONE, &&TARGET_RAISE_VARARGS, - &&TARGET_LOAD_FAST__LOAD_FAST, + &&_unknown_opcode, &&TARGET_MAKE_FUNCTION, &&TARGET_BUILD_SLICE, &&TARGET_JUMP_NO_INTERRUPT, @@ -139,17 +139,17 @@ static void *opcode_targets[256] = { &&TARGET_LOAD_DEREF, &&TARGET_STORE_DEREF, &&TARGET_DELETE_DEREF, - &&TARGET_STORE_FAST__LOAD_FAST, - &&TARGET_LOAD_FAST__LOAD_CONST, + &&_unknown_opcode, + &&_unknown_opcode, &&TARGET_CALL_FUNCTION_EX, - &&TARGET_LOAD_CONST__LOAD_FAST, + &&_unknown_opcode, &&TARGET_EXTENDED_ARG, &&TARGET_LIST_APPEND, &&TARGET_SET_ADD, 
&&TARGET_MAP_ADD, &&TARGET_LOAD_CLASSDEREF, &&TARGET_COPY_FREE_VARS, - &&TARGET_STORE_FAST__STORE_FAST, + &&_unknown_opcode, &&TARGET_RESUME, &&TARGET_MATCH_CLASS, &&_unknown_opcode, From 96f0ce52b7af46c8d7363ab8dc7e957759ecfa62 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Tue, 25 Jan 2022 15:09:48 -0800 Subject: [PATCH 2/9] Implement a "swaptimizer" --- Python/compile.c | 104 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 98 insertions(+), 6 deletions(-) diff --git a/Python/compile.c b/Python/compile.c index 3d9f72fb0004a2..9e3bea304e0952 100644 --- a/Python/compile.c +++ b/Python/compile.c @@ -8438,11 +8438,103 @@ fold_tuple_on_constants(struct compiler *c, } -static int -swaptimize(struct instr *inst) -{ - // TODO - return 0; +// Replace an arbitrary run of SWAPs and NOPs with an optimal one that has the +// same effect. +static int +swaptimize(basicblock *block, int start) +{ + // NOTE: Running "python -m test test_patma" serves as a good, quick stress + // test for this function. Make sure to blow away cached *.pyc files first! + assert(block->b_instr[start].i_opcode == SWAP); + // Find the end of the current run: + int stop = start; + while (++stop < block->b_iused && (block->b_instr[stop].i_opcode == SWAP || + block->b_instr[stop].i_opcode == NOP)); + // Find the maximum "depth" of the stack changes: + int depth = 0; + for (int i = start; i < stop; i++) { + if (block->b_instr[i].i_opcode == SWAP) { + depth = Py_MAX(depth, block->b_instr[i].i_oparg); + } + } + // Create an array of {0, 1, 2, ..., depth - 1}: + int stack[depth]; + for (int i = 0; i < depth; i++) { + stack[i] = i; + } + // Simulate the combined effect of these instructions by "running" them on + // our "stack": + for (int i = start; i < stop; i++) { + if (block->b_instr[i].i_opcode == SWAP) { + int oparg = block->b_instr[i].i_oparg; + int top = stack[0]; + stack[0] = stack[oparg - 1]; + stack[oparg - 1] = top; + } + } + // Now we can begin! Our approach here is based on a solution to a closely + // related problem (https://cs.stackexchange.com/a/13938). It's easiest to + // think of this algorithm as determining the steps needed to efficiently + // "un-shuffle" our stack. By performing the moves in *reverse* order, + // though, we can efficiently *shuffle* it! For this reason, we will be + // replacing instructions starting from the *end* of the run, and moving + // towards the *start*. Since the solution is optimal, we don't need to + // worry about running out of space: + int i = stop; + for (int item = 0; item < depth; item++) { + // We will be replacing items with -1 to mark them as visited: + if (stack[item] < 0) { + continue; + } + // Okay, we've found an item that hasn't been visited. It forms a cycle + // with zero or more other items; traversing the cycle and swapping each + // item by its value will put them all in the right place: + int previous_i = i; + int size = 0; + while (0 <= stack[item]) { + // Skip the actual swap if our item is zero (this happens exactly + // twice, at the beginning and end of the first cycle). Swapping the + // top item with itself is pointless: + if (item) { + // SWAP(item + 1), since swaps are 1-indexed. 
We'll update the + // actual opcode later: + assert(start < i); + block->b_instr[--i].i_oparg = item + 1; + } + size++; + // Mark the current item as visited, and move on to the next one: + int next_item = stack[item]; + stack[item] = -1; + item = next_item; + } + assert(size); + // Visit the original item to complete the cycle: + if (item && start < i) { + block->b_instr[--i].i_oparg = item + 1; + } + // If there was only one item in this cycle, then we just swapped it + // with itself twice! No meaningful work was done, so reset the current + // position (we'll overwrite those useless swaps on the next traversal): + if (size == 1) { + i = previous_i; + } + // If i == start before anchor == depth, it means that all remaining + // items form cycles with themselves, and are already in the correct + // location. Just bail if that happens: + if (i == start) { + break; + } + } + // NOP out any unused instructions: + while (start < i) { + block->b_instr[start++].i_opcode = NOP; + } + // Ensure that any used instructions are indeed SWAPs: + while (start < stop) { + block->b_instr[start++].i_opcode = SWAP; + } + // Nothing more to be done here! Jump ahead to the end of the original run: + return stop - 1; } // Attempt to eliminate jumps to jumps by updating inst to jump to @@ -8689,7 +8781,7 @@ optimize_basic_block(struct compiler *c, basicblock *bb, PyObject *consts) } break; case SWAP: - i += swaptimize(inst); + i = swaptimize(bb, i); break; default: /* All HAS_CONST opcodes should be handled with LOAD_CONST */ From 1ab5f21836814dd7c11a8c8fa739d86f8b9436b2 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Tue, 25 Jan 2022 17:05:59 -0800 Subject: [PATCH 3/9] Simplify things further --- Python/compile.c | 126 +++++++++++++++++++++-------------------------- 1 file changed, 57 insertions(+), 69 deletions(-) diff --git a/Python/compile.c b/Python/compile.c index 9e3bea304e0952..14d09e5e1b4553 100644 --- a/Python/compile.c +++ b/Python/compile.c @@ -6241,6 +6241,7 @@ compiler_error_duplicate_store(struct compiler *c, identifier n) return compiler_error(c, "multiple assignments to name %R in pattern", n); } +// Duplicate the effect of 3.10's ROT_* instructions using SWAPs. static int pattern_helper_rotate(struct compiler *c, int count) { @@ -6687,7 +6688,8 @@ compiler_pattern_or(struct compiler *c, pattern_ty p, pattern_context *pc) // this; the current solution is potentially very // inefficient when each alternative subpattern binds lots // of names in different orders. It's fine for reasonable - // cases, though. + // cases, though, and the peephole optimizer will ensure + // that the final code is as efficient as possible. assert(istores < icontrol); Py_ssize_t rotations = istores + 1; // Perform the same rotation on pc->stores: @@ -6706,7 +6708,8 @@ compiler_pattern_or(struct compiler *c, pattern_ty p, pattern_context *pc) // rotated = pc_stores[:rotations] // del pc_stores[:rotations] // pc_stores[icontrol-istores:icontrol-istores] = rotated - // Do the same thing to the stack, using several SWAPs: + // Do the same thing to the stack, using several + // rotations: while (rotations--) { if (!pattern_helper_rotate(c, icontrol + 1)){ goto error; @@ -8437,37 +8440,43 @@ fold_tuple_on_constants(struct compiler *c, return 0; } +#define VISITED (-1) // Replace an arbitrary run of SWAPs and NOPs with an optimal one that has the -// same effect. 
-static int -swaptimize(basicblock *block, int start) -{ - // NOTE: Running "python -m test test_patma" serves as a good, quick stress - // test for this function. Make sure to blow away cached *.pyc files first! - assert(block->b_instr[start].i_opcode == SWAP); - // Find the end of the current run: - int stop = start; - while (++stop < block->b_iused && (block->b_instr[stop].i_opcode == SWAP || - block->b_instr[stop].i_opcode == NOP)); - // Find the maximum "depth" of the stack changes: - int depth = 0; - for (int i = start; i < stop; i++) { - if (block->b_instr[i].i_opcode == SWAP) { - depth = Py_MAX(depth, block->b_instr[i].i_oparg); +// same effect. Return the number of instructions that were optimized. +static int +swaptimize(basicblock *block, int ix) +{ + // NOTE: "./python -m test test_patma" serves as a good, quick stress test + // for this function. Make sure to blow away cached *.pyc files first! + assert(ix < block->b_iused); + struct instr *instructions = &block->b_instr[ix]; + // Find the length of the current sequence of SWAPs and NOPs, and record the + // maximum depth of the stack manipulations: + assert(instructions[0].i_opcode == SWAP); + int depth = instructions[0].i_oparg; + int len = 0; + while (++len < block->b_iused - ix) { + int opcode = instructions[len].i_opcode; + if (opcode == SWAP) { + depth = Py_MAX(depth, instructions[len].i_oparg); + } + else if (opcode != NOP) { + break; } } - // Create an array of {0, 1, 2, ..., depth - 1}: + // Create an array with elements {0, 1, 2, ..., depth - 1}: int stack[depth]; for (int i = 0; i < depth; i++) { stack[i] = i; } // Simulate the combined effect of these instructions by "running" them on // our "stack": - for (int i = start; i < stop; i++) { - if (block->b_instr[i].i_opcode == SWAP) { - int oparg = block->b_instr[i].i_oparg; + for (int i = 0; i < len; i++) { + if (instructions[i].i_opcode == SWAP) { + int oparg = instructions[i].i_oparg; int top = stack[0]; + // SWAPs are 1-indexed: stack[0] = stack[oparg - 1]; stack[oparg - 1] = top; } @@ -8477,64 +8486,43 @@ swaptimize(basicblock *block, int start) // think of this algorithm as determining the steps needed to efficiently // "un-shuffle" our stack. By performing the moves in *reverse* order, // though, we can efficiently *shuffle* it! For this reason, we will be - // replacing instructions starting from the *end* of the run, and moving - // towards the *start*. Since the solution is optimal, we don't need to - // worry about running out of space: - int i = stop; + // replacing instructions starting from the *end* of the run. Since the + // solution is optimal, we don't need to worry about running out of space: + int i = len - 1; for (int item = 0; item < depth; item++) { - // We will be replacing items with -1 to mark them as visited: - if (stack[item] < 0) { + // Skip items that have already been visited, or just happen to be in + // the correct location: + if (stack[item] == VISITED || stack[item] == item) { continue; } // Okay, we've found an item that hasn't been visited. It forms a cycle - // with zero or more other items; traversing the cycle and swapping each - // item by its value will put them all in the right place: - int previous_i = i; - int size = 0; - while (0 <= stack[item]) { - // Skip the actual swap if our item is zero (this happens exactly - // twice, at the beginning and end of the first cycle). 
Swapping the - // top item with itself is pointless: + // with other items; traversing the cycle and swapping each item with + // the next will put them all in the correct place. The weird + // loop-and-a-half is necessary to insert 0 into every cycle, since + // we can only swap from that position. + while (true) { + // Skip the actual swap if our item is zero, since swapping the top + // item with itself is pointless. if (item) { - // SWAP(item + 1), since swaps are 1-indexed. We'll update the - // actual opcode later: - assert(start < i); - block->b_instr[--i].i_oparg = item + 1; + assert(0 <= i); + // SWAPs are 1-indexed: + instructions[i].i_opcode = SWAP; + instructions[i--].i_oparg = item + 1; + } + if (stack[item] == VISITED) { + break; } - size++; - // Mark the current item as visited, and move on to the next one: int next_item = stack[item]; - stack[item] = -1; + stack[item] = VISITED; item = next_item; } - assert(size); - // Visit the original item to complete the cycle: - if (item && start < i) { - block->b_instr[--i].i_oparg = item + 1; - } - // If there was only one item in this cycle, then we just swapped it - // with itself twice! No meaningful work was done, so reset the current - // position (we'll overwrite those useless swaps on the next traversal): - if (size == 1) { - i = previous_i; - } - // If i == start before anchor == depth, it means that all remaining - // items form cycles with themselves, and are already in the correct - // location. Just bail if that happens: - if (i == start) { - break; - } } // NOP out any unused instructions: - while (start < i) { - block->b_instr[start++].i_opcode = NOP; - } - // Ensure that any used instructions are indeed SWAPs: - while (start < stop) { - block->b_instr[start++].i_opcode = SWAP; + while (0 <= i) { + instructions[i--].i_opcode = NOP; } - // Nothing more to be done here! Jump ahead to the end of the original run: - return stop - 1; + // Done! Return the number of optimized instructions: + return len - 1; } // Attempt to eliminate jumps to jumps by updating inst to jump to @@ -8781,7 +8769,7 @@ optimize_basic_block(struct compiler *c, basicblock *bb, PyObject *consts) } break; case SWAP: - i = swaptimize(bb, i); + i += swaptimize(bb, i); break; default: /* All HAS_CONST opcodes should be handled with LOAD_CONST */ From d85d65bff6b7526290602f892149a3f3923c252b Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Tue, 25 Jan 2022 17:42:10 -0800 Subject: [PATCH 4/9] blurb add --- Doc/whatsnew/3.11.rst | 5 +++-- .../2022-01-25-17-40-07.bpo-46528.2Qmni9.rst | 3 +++ 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-01-25-17-40-07.bpo-46528.2Qmni9.rst diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 4328ee6a5030cd..6796ff0ed13f8c 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -411,8 +411,9 @@ CPython bytecode changes indicate failure with :const:`None` (where a tuple of extracted values would otherwise be). -* Added :opcode:`COPY`, which pushes the *i*-th item to the top of the stack. - The item is not removed from its original location. +* Replace several stack manipulation instructions (``DUP_TOP``, ``DUP_TOP_TWO``, + ``ROT_TWO``, ``ROT_THREE``, ``ROT_FOUR``, and ``ROT_N``) with new + :opcode:`COPY` and new :opcode:`SWAP` instructions. * Add :opcode:`POP_JUMP_IF_NOT_NONE` and :opcode:`POP_JUMP_IF_NONE` opcodes to speed up conditional jumps. 
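For reference, the lowering used throughout the ``compile.c`` changes in this series is mechanical (a summary sketch, not part of the patch; ``SWAP``/``COPY`` arguments are 1-indexed from TOS)::

    DUP_TOP      ->  COPY 1
    DUP_TOP_TWO  ->  COPY 2; COPY 2
    ROT_TWO      ->  SWAP 2
    ROT_THREE    ->  SWAP 3; SWAP 2
    ROT_FOUR     ->  SWAP 4; SWAP 3; SWAP 2
    ROT_N(n)     ->  SWAP n; SWAP n-1; ...; SWAP 2

The last three follow the rotation rule implemented by ``pattern_helper_rotate``. A quick pure-Python equivalence check for that rule (illustrative only)::

    def rot_n(stack, n):
        """Old ROT_N: move TOS down to position n, lifting the others up one."""
        stack[-n:] = stack[-1:] + stack[-n:-1]

    def rot_n_as_swaps(stack, n):
        """New lowering: SWAP n, SWAP n-1, ..., SWAP 2."""
        for i in range(n, 1, -1):
            stack[-1], stack[-i] = stack[-i], stack[-1]

    a, b = list(range(10)), list(range(10))
    rot_n(a, 4)
    rot_n_as_swaps(b, 4)
    assert a == b

The compiler sometimes does better than this mechanical mapping: in ``compiler_compare`` and ``compiler_jump_if``, the old ``DUP_TOP; ROT_THREE`` pair becomes ``SWAP 2; COPY 2`` (two instructions instead of three), and the ``swaptimize`` pass added later in this series cleans up any remaining redundant runs of ``SWAP``.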
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-01-25-17-40-07.bpo-46528.2Qmni9.rst b/Misc/NEWS.d/next/Core and Builtins/2022-01-25-17-40-07.bpo-46528.2Qmni9.rst new file mode 100644 index 00000000000000..2dbf765a1bb7a5 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-01-25-17-40-07.bpo-46528.2Qmni9.rst @@ -0,0 +1,3 @@ +Replace several stack manipulation instructions (``DUP_TOP``, +``DUP_TOP_TWO``, ``ROT_TWO``, ``ROT_THREE``, ``ROT_FOUR``, and ``ROT_N``) +with the existing :opcode:`COPY` and new :opcode:`SWAP` instructions. From 1f67c08f40def410c4875e8c22b9f259c3c77e3c Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Tue, 25 Jan 2022 17:43:41 -0800 Subject: [PATCH 5/9] Fix typo --- Doc/whatsnew/3.11.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst index 6796ff0ed13f8c..edb2f6b89bbae6 100644 --- a/Doc/whatsnew/3.11.rst +++ b/Doc/whatsnew/3.11.rst @@ -413,7 +413,7 @@ CPython bytecode changes * Replace several stack manipulation instructions (``DUP_TOP``, ``DUP_TOP_TWO``, ``ROT_TWO``, ``ROT_THREE``, ``ROT_FOUR``, and ``ROT_N``) with new - :opcode:`COPY` and new :opcode:`SWAP` instructions. + :opcode:`COPY` and :opcode:`SWAP` instructions. * Add :opcode:`POP_JUMP_IF_NOT_NONE` and :opcode:`POP_JUMP_IF_NONE` opcodes to speed up conditional jumps. From c4ef99f225b9ded28a7b91da136739847d6eac1a Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Tue, 25 Jan 2022 17:46:31 -0800 Subject: [PATCH 6/9] Clean up wording --- .../Core and Builtins/2022-01-25-17-40-07.bpo-46528.2Qmni9.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-01-25-17-40-07.bpo-46528.2Qmni9.rst b/Misc/NEWS.d/next/Core and Builtins/2022-01-25-17-40-07.bpo-46528.2Qmni9.rst index 2dbf765a1bb7a5..f1639f8b3f06ee 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2022-01-25-17-40-07.bpo-46528.2Qmni9.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2022-01-25-17-40-07.bpo-46528.2Qmni9.rst @@ -1,3 +1,3 @@ Replace several stack manipulation instructions (``DUP_TOP``, ``DUP_TOP_TWO``, ``ROT_TWO``, ``ROT_THREE``, ``ROT_FOUR``, and ``ROT_N``) -with the existing :opcode:`COPY` and new :opcode:`SWAP` instructions. +with new :opcode:`COPY` and :opcode:`SWAP` instructions. From 6274309d1e972d4f7afb83dcc690c6c50d88ab56 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Tue, 25 Jan 2022 19:54:24 -0800 Subject: [PATCH 7/9] Remove VLA usage --- Python/compile.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Python/compile.c b/Python/compile.c index 14d09e5e1b4553..9a8f4aa617eb07 100644 --- a/Python/compile.c +++ b/Python/compile.c @@ -8465,8 +8465,12 @@ swaptimize(basicblock *block, int ix) break; } } + // It's already optimal if there's only one instruction: + if (len == 1) { + return 0; + } // Create an array with elements {0, 1, 2, ..., depth - 1}: - int stack[depth]; + int *stack = PyMem_Malloc(depth * sizeof(int)); for (int i = 0; i < depth; i++) { stack[i] = i; } @@ -8522,6 +8526,7 @@ swaptimize(basicblock *block, int ix) instructions[i--].i_opcode = NOP; } // Done! 
Return the number of optimized instructions: + PyMem_Free(stack); return len - 1; } From aa51caadf5d964c930d50bcde53aa1ae88e8a47a Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 26 Jan 2022 11:50:11 -0800 Subject: [PATCH 8/9] Apply feedback from code review --- Python/compile.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/Python/compile.c b/Python/compile.c index 9a8f4aa617eb07..b4f8a52793260e 100644 --- a/Python/compile.c +++ b/Python/compile.c @@ -6243,7 +6243,7 @@ compiler_error_duplicate_store(struct compiler *c, identifier n) // Duplicate the effect of 3.10's ROT_* instructions using SWAPs. static int -pattern_helper_rotate(struct compiler *c, int count) +pattern_helper_rotate(struct compiler *c, Py_ssize_t count) { while (1 < count) { ADDOP_I(c, SWAP, count--); @@ -8456,17 +8456,19 @@ swaptimize(basicblock *block, int ix) assert(instructions[0].i_opcode == SWAP); int depth = instructions[0].i_oparg; int len = 0; + int more = false; while (++len < block->b_iused - ix) { int opcode = instructions[len].i_opcode; if (opcode == SWAP) { depth = Py_MAX(depth, instructions[len].i_oparg); + more = true; } else if (opcode != NOP) { break; } } - // It's already optimal if there's only one instruction: - if (len == 1) { + // It's already optimal if there's only one SWAP: + if (!more) { return 0; } // Create an array with elements {0, 1, 2, ..., depth - 1}: @@ -8492,28 +8494,30 @@ swaptimize(basicblock *block, int ix) // though, we can efficiently *shuffle* it! For this reason, we will be // replacing instructions starting from the *end* of the run. Since the // solution is optimal, we don't need to worry about running out of space: - int i = len - 1; - for (int item = 0; item < depth; item++) { + int j = len - 1; + for (int i = 0; i < depth; i++) { // Skip items that have already been visited, or just happen to be in // the correct location: - if (stack[item] == VISITED || stack[item] == item) { + if (stack[i] == VISITED || stack[i] == i) { continue; } // Okay, we've found an item that hasn't been visited. It forms a cycle // with other items; traversing the cycle and swapping each item with // the next will put them all in the correct place. The weird - // loop-and-a-half is necessary to insert 0 into every cycle, since - // we can only swap from that position. + // loop-and-a-half is necessary to insert 0 into every cycle, since we + // can only swap from that position. + int item = i; while (true) { // Skip the actual swap if our item is zero, since swapping the top // item with itself is pointless. if (item) { - assert(0 <= i); + assert(0 <= j); // SWAPs are 1-indexed: - instructions[i].i_opcode = SWAP; - instructions[i--].i_oparg = item + 1; + instructions[j].i_opcode = SWAP; + instructions[j--].i_oparg = item + 1; } if (stack[item] == VISITED) { + assert(item == i); break; } int next_item = stack[item]; @@ -8522,8 +8526,8 @@ swaptimize(basicblock *block, int ix) } } // NOP out any unused instructions: - while (0 <= i) { - instructions[i--].i_opcode = NOP; + while (0 <= j) { + instructions[j--].i_opcode = NOP; } // Done! 
Return the number of optimized instructions: PyMem_Free(stack); @@ -8774,6 +8778,10 @@ optimize_basic_block(struct compiler *c, basicblock *bb, PyObject *consts) } break; case SWAP: + if (oparg == 1) { + inst->i_opcode = NOP; + break; + } i += swaptimize(bb, i); break; default: From 1537cdaf10865dd3413fc1623bdc38dc44ac8568 Mon Sep 17 00:00:00 2001 From: Brandt Bucher Date: Wed, 26 Jan 2022 12:07:06 -0800 Subject: [PATCH 9/9] Naming is hard --- Python/compile.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/Python/compile.c b/Python/compile.c index b4f8a52793260e..f1049fd931e143 100644 --- a/Python/compile.c +++ b/Python/compile.c @@ -8494,7 +8494,7 @@ swaptimize(basicblock *block, int ix) // though, we can efficiently *shuffle* it! For this reason, we will be // replacing instructions starting from the *end* of the run. Since the // solution is optimal, we don't need to worry about running out of space: - int j = len - 1; + int current = len - 1; for (int i = 0; i < depth; i++) { // Skip items that have already been visited, or just happen to be in // the correct location: @@ -8505,29 +8505,30 @@ swaptimize(basicblock *block, int ix) // with other items; traversing the cycle and swapping each item with // the next will put them all in the correct place. The weird // loop-and-a-half is necessary to insert 0 into every cycle, since we - // can only swap from that position. - int item = i; + // can only swap from that position: + int j = i; while (true) { // Skip the actual swap if our item is zero, since swapping the top - // item with itself is pointless. - if (item) { - assert(0 <= j); + // item with itself is pointless: + if (j) { + assert(0 <= current); // SWAPs are 1-indexed: - instructions[j].i_opcode = SWAP; - instructions[j--].i_oparg = item + 1; + instructions[current].i_opcode = SWAP; + instructions[current--].i_oparg = j + 1; } - if (stack[item] == VISITED) { - assert(item == i); + if (stack[j] == VISITED) { + // Completed the cycle: + assert(j == i); break; } - int next_item = stack[item]; - stack[item] = VISITED; - item = next_item; + int next_j = stack[j]; + stack[j] = VISITED; + j = next_j; } } // NOP out any unused instructions: - while (0 <= j) { - instructions[j--].i_opcode = NOP; + while (0 <= current) { + instructions[current--].i_opcode = NOP; } // Done! Return the number of optimized instructions: PyMem_Free(stack);
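The effect of the finished pass is easiest to see in a pure-Python model (an illustrative sketch mirroring the C logic above, not part of the patch): simulate the whole run of ``SWAP``\ s on an abstract stack of ``depth`` items, then re-emit a minimal run that produces the same permutation, one cycle at a time::

    VISITED = -1

    def apply_swaps(opargs, depth):
        """Run SWAPs (1-indexed opargs) on the stack [0, 1, ..., depth-1]."""
        stack = list(range(depth))
        for oparg in opargs:
            stack[0], stack[oparg - 1] = stack[oparg - 1], stack[0]
        return stack

    def swaptimize(opargs):
        """Return a minimal run of SWAP opargs with the same net effect."""
        depth = max(opargs)
        # Simulate the combined effect of the run:
        stack = apply_swaps(opargs, depth)
        out = []
        for i in range(depth):
            # Skip items already visited, or already in the correct place:
            if stack[i] in (VISITED, i):
                continue
            # Traverse the cycle containing i, swapping each item into place;
            # the loop-and-a-half inserts position 0 into every cycle, since
            # SWAP can only exchange other items with TOS:
            j = i
            while True:
                if j:  # swapping TOS with itself is pointless
                    out.append(j + 1)
                if stack[j] == VISITED:
                    break  # completed the cycle
                next_j = stack[j]
                stack[j] = VISITED
                j = next_j
        # The C code writes these into the tail of the run, moving backwards
        # (un-shuffling in reverse order shuffles), so program order is the
        # reverse of generation order:
        return out[::-1]

    run = [2, 3, 2, 3]        # SWAP 2; SWAP 3; SWAP 2; SWAP 3
    best = swaptimize(run)    # -> [3, 2]
    assert apply_swaps(best, 3) == apply_swaps(run, 3)
    assert len(best) <= len(run)

The C version additionally handles bookkeeping the model omits: it bails out early when the run contains only a single ``SWAP``, converts the now-unneeded leading instructions to ``NOP`` in place, and frees the scratch array.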