Implement LOAD_ATTR inline caching with adaptive specialization by youknowone · Pull Request #7292 · RustPython/RustPython · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
8000
Diff view
1 change: 1 addition & 0 deletions .cspell.dict/cpython.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ copyslot
cpucount
defaultdict
denom
deopt
dictbytype
DICTFLAG
dictoffset
Expand Down
2 changes: 2 additions & 0 deletions .cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
"dedentations",
"dedents",
"deduped",
"deoptimize",
"downcastable",
" 10BC0 ;downcasted",
"dumpable",
Expand All @@ -73,6 +74,7 @@
"interps",
"jitted",
"jitting",
"kwonly",
"lossily",
"makeunicodedata",
"microbenchmark",
Expand Down
12 changes: 7 additions & 5 deletions crates/codegen/src/ir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -457,11 +457,13 @@ impl CodeInfo {
.map(|byte| CodeUnit::new(Instruction::ExtendedArg, byte))
.chain([CodeUnit { op, arg: lo_arg }]),
);
// Emit CACHE code units after the instruction
instructions.extend(core::iter::repeat_n(
CodeUnit::new(Instruction::Cache, 0.into()),
cache_count,
));
// Emit CACHE code units after the instruction (all zeroed)
if cache_count > 0 {
instructions.extend(core::iter::repeat_n(
CodeUnit::new(Instruction::Cache, 0.into()),
cache_count,
));
}
current_offset = offset_after;
}
next_block = block.next;
Expand Down
145 changes: 144 additions & 1 deletion crates/compiler-core/src/bytecode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,11 @@ pub struct CodeUnit {

const _: () = assert!(mem::size_of::<CodeUnit>() == 2);

/// Adaptive specialization: number of executions before attempting specialization.
pub const ADAPTIVE_WARMUP_VALUE: u8 = 50;
/// Adaptive specialization: backoff counter after de-optimization.
pub const ADAPTIVE_BACKOFF_VALUE: u8 = 250;

impl CodeUnit {
pub const fn new(op: Instruction, arg: OpArgByte) -> Self {
Self { op, arg }
Expand Down Expand Up @@ -391,7 +396,11 @@ impl TryFrom<&[u8]> for CodeUnits {
return Err(Self::Error::InvalidBytecode);
}

value.chunks_exact(2).map(CodeUnit::try_from).collect()
let units: Self = value
.chunks_exact(2)
.map(CodeUnit::try_from)
.collect::<Result<_, _>>()?;
Ok(units)
}
}

Expand Down Expand Up @@ -441,6 +450,140 @@ impl CodeUnits {
core::ptr::write(op_ptr, new_op.into());
}
}

/// Store a u16 value into the CACHE code unit at `index`.
/// A CodeUnit is two bytes (`#[repr(C)]`: op u8 + arg u8), so one u16
/// occupies exactly one unit; the write may be unaligned.
///
/// # Safety
/// - `index` must be in bounds and refer to a CACHE entry.
/// - No other thread may read or write the same slot concurrently.
pub unsafe fn write_cache_u16(&self, index: usize, value: u16) {
    // SAFETY: bounds and exclusivity are upheld by the caller per the
    // contract above; the unaligned write handles the 2-byte packing.
    unsafe {
        let slot = (*self.0.get()).as_mut_ptr().add(index);
        core::ptr::write_unaligned(slot.cast::<u16>(), value);
    }
}

/// Load the u16 value stored in the CACHE code unit at `index`.
///
/// # Panics
/// Panics if `index` is out of bounds.
pub fn read_cache_u16(&self, index: usize) -> u16 {
    // SAFETY: shared read of the interior buffer; callers must not race
    // this against a concurrent cache write to the same slot.
    let units = unsafe { &*self.0.get() };
    assert!(index < units.len(), "read_cache_u16: index out of bounds");
    let src = units[index..].as_ptr().cast::<u16>();
    // SAFETY: `index` was just bounds-checked; the read may be unaligned.
    unsafe { core::ptr::read_unaligned(src) }
}

/// Store a u32 value across two consecutive CACHE code units starting at
/// `index` (little-endian halves: low u16 first).
///
/// # Safety
/// Same requirements as [`Self::write_cache_u16`], for both slots.
pub unsafe fn write_cache_u32(&self, index: usize, value: u32) {
    let lo = value as u16;
    let hi = (value >> 16) as u16;
    // SAFETY: forwarded from the caller's contract.
    unsafe {
        self.write_cache_u16(index, lo);
        self.write_cache_u16(index + 1, hi);
    }
}

/// Load a u32 value from two consecutive CACHE code units starting at
/// `index` (low u16 first, matching [`Self::write_cache_u32`]).
///
/// # Panics
/// Panics if `index + 1` is out of bounds.
pub fn read_cache_u32(&self, index: usize) -> u32 {
    let halves = [self.read_cache_u16(index), self.read_cache_u16(index + 1)];
    u32::from(halves[0]) | (u32::from(halves[1]) << 16)
}

/// Store a u64 value across four consecutive CACHE code units starting at
/// `index` (low u32 first, each half laid out as in `write_cache_u32`).
///
/// # Safety
/// Same requirements as [`Self::write_cache_u16`], for all four slots.
pub unsafe fn write_cache_u64(&self, index: usize, value: u64) {
    let lo = value as u32;
    let hi = (value >> 32) as u32;
    // SAFETY: forwarded from the caller's contract.
    unsafe {
        self.write_cache_u32(index, lo);
        self.write_cache_u32(index + 2, hi);
    }
}

/// Load a u64 value from four consecutive CACHE code units starting at
/// `index` (low u32 first, matching [`Self::write_cache_u64`]).
///
/// # Panics
/// Panics if `index + 3` is out of bounds.
pub fn read_cache_u64(&self, index: usize) -> u64 {
    let halves = [self.read_cache_u32(index), self.read_cache_u32(index + 2)];
    u64::from(halves[0]) | (u64::from(halves[1]) << 32)
}

/// Fetch the adaptive counter from the `arg` byte of the CACHE entry at
/// `index`. Unlike `read_cache_u16`, this touches only the `arg` byte,
/// leaving `op = Instruction::Cache` untouched.
///
/// # Panics
/// Panics if `index` is out of bounds (slice indexing).
pub fn read_adaptive_counter(&self, index: usize) -> u8 {
    // SAFETY: shared read; callers must not race a concurrent counter write.
    let units = unsafe { &*self.0.get() };
    units[index].arg.into()
}

/// Write the adaptive counter to the first CACHE entry's `arg` byte.
/// This preserves `op = Instruction::Cache`, unlike `write_cache_u16`.
///
/// # Safety
/// - `index` must be in bounds and point to a CACHE entry.
pub unsafe fn write_adaptive_counter(&self, index: usize, value: u8) {
let units = unsafe { &mut *self.0.get() };
units[index].arg = OpArgByte::from(value);
}

/// Produce a clean copy of the bytecode suitable for serialization
/// (marshal) and `co_code`. Specialized opcodes are mapped back to their
/// base variants via `deoptimize()` and all CACHE entries are zeroed.
///
/// The output is always at most `instructions.len() * 2` bytes, matching
/// the length prefix written by `serialize_code`.
pub fn original_bytes(&self) -> Vec<u8> {
    // SAFETY: shared read of the interior buffer; serialization must not
    // race a thread that is rewriting the same code units.
    let units = unsafe { &*self.0.get() };
    let len = units.len();
    let mut out = Vec::with_capacity(len * 2);
    let mut i = 0;
    while i < len {
        let op = units[i].op.deoptimize();
        out.push(u8::from(op));
        out.push(u8::from(units[i].arg));
        // Zero-fill the CACHE entries (counter + cached data). Clamp to the
        // units actually remaining so truncated or corrupt bytecode can
        // never produce output longer than `len * 2` bytes — the original
        // unclamped loop could overrun the marshal length prefix.
        let caches = op.cache_entries().min(len - i - 1);
        for _ in 0..caches {
            out.push(0); // op = Cache = 0
            out.push(0); // arg = 0
        }
        i += 1 + caches;
    }
    out
}

/// Initialize adaptive warmup counters for all cacheable instructions.
/// Called lazily at RESUME (first execution of a code object).
/// Writes only the `arg` byte of the first CACHE entry of each cacheable
/// instruction, preserving `op = Instruction::Cache`.
pub fn quicken(&self) {
    // SAFETY: exclusive access is assumed during first-execution setup —
    // NOTE(review): confirm no other thread executes this code object yet.
    let units = unsafe { &mut *self.0.get() };
    let len = units.len();
    let mut i = 0;
    while i < len {
        let op = units[i].op;
        let caches = op.cache_entries();
        if caches == 0 {
            i += 1;
            continue;
        }
        // Instrumented opcodes never specialize while monitoring is
        // active, so they get no warmup counter.
        if !op.is_instrumented() {
            // `get_mut` doubles as the bounds check on the first CACHE slot.
            if let Some(first_cache) = units.get_mut(i + 1) {
                first_cache.arg = OpArgByte::from(ADAPTIVE_WARMUP_VALUE);
            }
        }
        i += 1 + caches;
    }
}
}

/// A Constant (which usually encapsulates data within it)
Expand Down
127 changes: 125 additions & 2 deletions crates/compiler-core/src/bytecode/instruction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,126 @@ impl Instruction {
})
}

/// Map a specialized opcode back to its adaptive (base) variant.
///
/// Mirrors CPython's `_PyOpcode_Deopt` table: every specialized form of an
/// adaptive instruction family (LOAD_ATTR, BINARY_OP, CALL, …) maps to the
/// generic family opcode, and instrumented opcodes map to their base via
/// `to_base()`. Non-specialized, non-instrumented opcodes return themselves.
///
/// Oparg fields are rebuilt with `Arg::marker()` — the caller is expected
/// to carry the real arg byte separately (see `original_bytes`).
pub fn deoptimize(self) -> Self {
    match self {
        // LOAD_ATTR specializations
        Self::LoadAttrClass
        | Self::LoadAttrClassWithMetaclassCheck
        | Self::LoadAttrGetattributeOverridden
        | Self::LoadAttrInstanceValue
        | Self::LoadAttrMethodLazyDict
        | Self::LoadAttrMethodNoDict
        | Self::LoadAttrMethodWithValues
        | Self::LoadAttrModule
        | Self::LoadAttrNondescriptorNoDict
        | Self::LoadAttrNondescriptorWithValues
        | Self::LoadAttrProperty
        | Self::LoadAttrSlot
        | Self::LoadAttrWithHint => Self::LoadAttr { idx: Arg::marker() },
        // BINARY_OP specializations
        Self::BinaryOpAddFloat
        | Self::BinaryOpAddInt
        | Self::BinaryOpAddUnicode
        | Self::BinaryOpExtend
        | Self::BinaryOpInplaceAddUnicode
        | Self::BinaryOpMultiplyFloat
        | Self::BinaryOpMultiplyInt
        | Self::BinaryOpSubscrDict
        | Self::BinaryOpSubscrGetitem
        | Self::BinaryOpSubscrListInt
        | Self::BinaryOpSubscrListSlice
        | Self::BinaryOpSubscrStrInt
        | Self::BinaryOpSubscrTupleInt
        | Self::BinaryOpSubtractFloat
        | Self::BinaryOpSubtractInt => Self::BinaryOp { op: Arg::marker() },
        // CALL specializations
        Self::CallAllocAndEnterInit
        | Self::CallBoundMethodExactArgs
        | Self::CallBoundMethodGeneral
        | Self::CallBuiltinClass
        | Self::CallBuiltinFast
        | Self::CallBuiltinFastWithKeywords
        | Self::CallBuiltinO
        | Self::CallIsinstance
        | Self::CallLen
        | Self::CallListAppend
        | Self::CallMethodDescriptorFast
        | Self::CallMethodDescriptorFastWithKeywords
        | Self::CallMethodDescriptorNoargs
        | Self::CallMethodDescriptorO
        | Self::CallNonPyGeneral
        | Self::CallPyExactArgs
        | Self::CallPyGeneral
        | Self::CallStr1
        | Self::CallTuple1
        | Self::CallType1 => Self::Call {
            nargs: Arg::marker(),
        },
        // CALL_KW specializations
        Self::CallKwBoundMethod | Self::CallKwNonPy | Self::CallKwPy => Self::CallKw {
            nargs: Arg::marker(),
        },
        // TO_BOOL specializations
        Self::ToBoolAlwaysTrue
        | Self::ToBoolBool
        | Self::ToBoolInt
        | Self::ToBoolList
        | Self::ToBoolNone
        | Self::ToBoolStr => Self::ToBool,
        // COMPARE_OP specializations
        Self::CompareOpFloat | Self::CompareOpInt | Self::CompareOpStr => {
            Self::CompareOp { op: Arg::marker() }
        }
        // CONTAINS_OP specializations
        Self::ContainsOpDict | Self::ContainsOpSet => Self::ContainsOp(Arg::marker()),
        // FOR_ITER specializations
        Self::ForIterGen | Self::ForIterList | Self::ForIterRange | Self::ForIterTuple => {
            Self::ForIter {
                target: Arg::marker(),
            }
        }
        // LOAD_GLOBAL specializations
        Self::LoadGlobalBuiltin | Self::LoadGlobalModule => Self::LoadGlobal(Arg::marker()),
        // STORE_ATTR specializations
        Self::StoreAttrInstanceValue | Self::StoreAttrSlot | Self::StoreAttrWithHint => {
            Self::StoreAttr { idx: Arg::marker() }
        }
        // LOAD_SUPER_ATTR specializations
        Self::LoadSuperAttrAttr | Self::LoadSuperAttrMethod => {
            Self::LoadSuperAttr { arg: Arg::marker() }
        }
        // STORE_SUBSCR specializations
        Self::StoreSubscrDict | Self::StoreSubscrListInt => Self::StoreSubscr,
        // UNPACK_SEQUENCE specializations
        Self::UnpackSequenceList | Self::UnpackSequenceTuple | Self::UnpackSequenceTwoTuple => {
            Self::UnpackSequence {
                size: Arg::marker(),
            }
        }
        // SEND specializations
        Self::SendGen => Self::Send {
            target: Arg::marker(),
        },
        // LOAD_CONST specializations
        Self::LoadConstImmortal | Self::LoadConstMortal => {
            Self::LoadConst { idx: Arg::marker() }
        }
        // RESUME specializations
        Self::ResumeCheck => Self::Resume { arg: Arg::marker() },
        // JUMP_BACKWARD specializations
        Self::JumpBackwardJit | Self::JumpBackwardNoJit => Self::JumpBackward {
            target: Arg::marker(),
        },
        // Instrumented opcodes map back to their base; anything else is
        // already a base opcode and passes through unchanged.
        _ => match self.to_base() {
            Some(base) => base,
            None => self,
        },
    }
}

/// Number of CACHE code units that follow this instruction.
/// _PyOpcode_Caches
pub fn cache_entries(self) -> usize {
Expand Down Expand Up @@ -626,8 +746,11 @@ impl Instruction {
| Self::UnpackSequenceTuple
| Self::UnpackSequenceTwoTuple => 1,

// Everything else: 0 cache entries
_ => 0,
// Instrumented opcodes have the same cache entries as their base
_ => match self.to_base() {
Some(base) => base.cache_entries(),
None => 0,
},
}
}
}
Expand Down
5 changes: 2 additions & 3 deletions crates/compiler-core/src/marshal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -662,9 +662,8 @@ pub fn serialize_value<W: Write, D: Dumpable>(

pub fn serialize_code<W: Write, C: Constant>(buf: &mut W, code: &CodeObject<C>) {
write_len(buf, code.instructions.len());
// SAFETY: it's ok to transmute CodeUnit to [u8; 2]
let (_, instructions_bytes, _) = unsafe { code.instructions.align_to() };
buf.write_slice(instructions_bytes);
let original = code.instructions.original_bytes();
buf.write_slice(&original);

write_len(buf, code.locations.len());
for (start, end) in &*code.locations {
Expand Down
Loading
Loading
0