Implement LOAD_ATTR inline caching with adaptive specialization by youknowone · Pull Request #7292 · RustPython/RustPython · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
8000
Diff view
1 change: 1 addition & 0 deletions .cspell.dict/cpython.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ copyslot
cpucount
defaultdict
denom
deopt
dictbytype
DICTFLAG
dictoffset
Expand Down
2 changes: 2 additions & 0 deletions .cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
"dedentations",
"dedents",
"deduped",
"deoptimize",
"downcastable",
" 10BC0 ;downcasted",
"dumpable",
Expand All @@ -73,6 +74,7 @@
"interps",
"jitted",
"jitting",
"kwonly",
"lossily",
"makeunicodedata",
"microbenchmark",
Expand Down
12 changes: 7 additions & 5 deletions crates/codegen/src/ir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -457,11 +457,13 @@ impl CodeInfo {
.map(|byte| CodeUnit::new(Instruction::ExtendedArg, byte))
.chain([CodeUnit { op, arg: lo_arg }]),
);
// Emit CACHE code units after the instruction
instructions.extend(core::iter::repeat_n(
CodeUnit::new(Instruction::Cache, 0.into()),
cache_count,
));
// Emit CACHE code units after the instruction (all zeroed)
if cache_count > 0 {
instructions.extend(core::iter::repeat_n(
CodeUnit::new(Instruction::Cache, 0.into()),
cache_count,
));
}
current_offset = offset_after;
}
next_block = block.next;
Expand Down
145 changes: 144 additions & 1 deletion crates/compiler-core/src/bytecode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,11 @@ pub struct CodeUnit {

const _: () = assert!(mem::size_of::<CodeUnit>() == 2);

/// Adaptive specialization: number of executions before attempting specialization.
pub const ADAPTIVE_WARMUP_VALUE: u8 = 50;
/// Adaptive specialization: backoff counter after de-optimization.
pub const ADAPTIVE_BACKOFF_VALUE: u8 = 250;

impl CodeUnit {
pub const fn new(op: Instruction, arg: OpArgByte) -> Self {
Self { op, arg }
Expand Down Expand Up @@ -391,7 +396,11 @@ impl TryFrom<&[u8]> for CodeUnits {
return Err(Self::Error::InvalidBytecode);
}

value.chunks_exact(2).map(CodeUnit::try_from).collect()
let units: Self = value
.chunks_exact(2)
.map(CodeUnit::try_from)
.collect::<Result<_, _>>()?;
Ok(units)
}
}

Expand Down Expand Up @@ -441,6 +450,140 @@ impl CodeUnits {
core::ptr::write(op_ptr, new_op.into());
}
}

/// Store a u16 value into the CACHE code unit at `index`.
/// A CodeUnit is two bytes (`#[repr(C)]`: op u8 + arg u8), so one u16
/// occupies exactly one unit; the write may be unaligned.
///
/// # Safety
/// - `index` must be in bounds and refer to a CACHE entry.
/// - No other thread may read or write the same slot concurrently.
pub unsafe fn write_cache_u16(&self, index: usize, value: u16) {
    // SAFETY: bounds and exclusivity are upheld by the caller per the
    // contract above; the unaligned write handles the 2-byte packing.
    unsafe {
        let slot = (*self.0.get()).as_mut_ptr().add(index);
        core::ptr::write_unaligned(slot.cast::<u16>(), value);
    }
}

/// Load the u16 value stored in the CACHE code unit at `index`.
///
/// # Panics
/// Panics if `index` is out of bounds.
pub fn read_cache_u16(&self, index: usize) -> u16 {
    // SAFETY: shared read of the interior buffer; callers must not race
    // this against a concurrent cache write to the same slot.
    let units = unsafe { &*self.0.get() };
    assert!(index < units.len(), "read_cache_u16: index out of bounds");
    let src = units[index..].as_ptr().cast::<u16>();
    // SAFETY: `index` was just bounds-checked; the read may be unaligned.
    unsafe { core::ptr::read_unaligned(src) }
}

/// Store a u32 value across two consecutive CACHE code units starting at
/// `index` (little-endian halves: low u16 first).
///
/// # Safety
/// Same requirements as [`Self::write_cache_u16`], for both slots.
pub unsafe fn write_cache_u32(&self, index: usize, value: u32) {
    let lo = value as u16;
    let hi = (value >> 16) as u16;
    // SAFETY: forwarded from the caller's contract.
    unsafe {
        self.write_cache_u16(index, lo);
        self.write_cache_u16(index + 1, hi);
    }
}

/// Load a u32 value from two consecutive CACHE code units starting at
/// `index` (low u16 first, matching [`Self::write_cache_u32`]).
///
/// # Panics
/// Panics if `index + 1` is out of bounds.
pub fn read_cache_u32(&self, index: usize) -> u32 {
    let halves = [self.read_cache_u16(index), self.read_cache_u16(index + 1)];
    u32::from(halves[0]) | (u32::from(halves[1]) << 16)
}

/// Store a u64 value across four consecutive CACHE code units starting at
/// `index` (low u32 first, each half laid out as in `write_cache_u32`).
///
/// # Safety
/// Same requirements as [`Self::write_cache_u16`], for all four slots.
pub unsafe fn write_cache_u64(&self, index: usize, value: u64) {
    let lo = value as u32;
    let hi = (value >> 32) as u32;
    // SAFETY: forwarded from the caller's contract.
    unsafe {
        self.write_cache_u32(index, lo);
        self.write_cache_u32(index + 2, hi);
    }
}

/// Load a u64 value from four consecutive CACHE code units starting at
/// `index` (low u32 first, matching [`Self::write_cache_u64`]).
///
/// # Panics
/// Panics if `index + 3` is out of bounds.
pub fn read_cache_u64(&self, index: usize) -> u64 {
    let halves = [self.read_cache_u32(index), self.read_cache_u32(index + 2)];
    u64::from(halves[0]) | (u64::from(halves[1]) << 32)
}

/// Fetch the adaptive counter from the `arg` byte of the CACHE entry at
/// `index`. Unlike `read_cache_u16`, this touches only the `arg` byte,
/// leaving `op = Instruction::Cache` untouched.
///
/// # Panics
/// Panics if `index` is out of bounds (slice indexing).
pub fn read_adaptive_counter(&self, index: usize) -> u8 {
    // SAFETY: shared read; callers must not race a concurrent counter write.
    let units = unsafe { &*self.0.get() };
    units[index].arg.into()
}

/// Write the adaptive counter to the first CACHE entry's `arg` byte.
/// This preserves `op = Instruction::Cache`, unlike `write_cache_u16`.
///
/// # Safety
/// - `index` must be in bounds and point to a CACHE entry.
pub unsafe fn write_adaptive_counter(&self, index: usize, value: u8) {
let units = unsafe { &mut *self.0.get() };
units[index].arg = OpArgByte::from(value);
}

/// Produce a clean copy of the bytecode suitable for serialization
/// (marshal) and `co_code`. Specialized opcodes are mapped back to their
/// base variants via `deoptimize()` and all CACHE entries are zeroed.
///
/// The output is always at most `instructions.len() * 2` bytes, matching
/// the length prefix written by `serialize_code`.
pub fn original_bytes(&self) -> Vec<u8> {
    // SAFETY: shared read of the interior buffer; serialization must not
    // race a thread that is rewriting the same code units.
    let units = unsafe { &*self.0.get() };
    let len = units.len();
    let mut out = Vec::with_capacity(len * 2);
    let mut i = 0;
    while i < len {
        let op = units[i].op.deoptimize();
        out.push(u8::from(op));
        out.push(u8::from(units[i].arg));
        // Zero-fill the CACHE entries (counter + cached data). Clamp to the
        // units actually remaining so truncated or corrupt bytecode can
        // never produce output longer than `len * 2` bytes — the original
        // unclamped loop could overrun the marshal length prefix.
        let caches = op.cache_entries().min(len - i - 1);
        for _ in 0..caches {
            out.push(0); // op = Cache = 0
            out.push(0); // arg = 0
        }
        i += 1 + caches;
    }
    out
}

/// Initialize adaptive warmup counters for all cacheable instructions.
/// Called lazily at RESUME (first execution of a code object).
/// Writes only the `arg` byte of the first CACHE entry of each cacheable
/// instruction, preserving `op = Instruction::Cache`.
pub fn quicken(&self) {
    // SAFETY: exclusive access is assumed during first-execution setup —
    // NOTE(review): confirm no other thread executes this code object yet.
    let units = unsafe { &mut *self.0.get() };
    let len = units.len();
    let mut i = 0;
    while i < len {
        let op = units[i].op;
        let caches = op.cache_entries();
        if caches == 0 {
            i += 1;
            continue;
        }
        // Instrumented opcodes never specialize while monitoring is
        // active, so they get no warmup counter.
        if !op.is_instrumented() {
            // `get_mut` doubles as the bounds check on the first CACHE slot.
            if let Some(first_cache) = units.get_mut(i + 1) {
                first_cache.arg = OpArgByte::from(ADAPTIVE_WARMUP_VALUE);
            }
        }
        i += 1 + caches;
    }
}
}

/// A Constant (which usually encapsulates data within it)
Expand Down
127 changes: 125 additions & 2 deletions crates/compiler-core/src/bytecode/instruction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,126 @@ impl Instruction {
})
}

/// Map a specialized opcode back to its adaptive (base) variant.
///
/// Mirrors CPython's `_PyOpcode_Deopt` table: every specialized form of an
/// adaptive instruction family (LOAD_ATTR, BINARY_OP, CALL, …) maps to the
/// generic family opcode, and instrumented opcodes map to their base via
/// `to_base()`. Non-specialized, non-instrumented opcodes return themselves.
///
/// Oparg fields are rebuilt with `Arg::marker()` — the caller is expected
/// to carry the real arg byte separately (see `original_bytes`).
pub fn deoptimize(self) -> Self {
    match self {
        // LOAD_ATTR specializations
        Self::LoadAttrClass
        | Self::LoadAttrClassWithMetaclassCheck
        | Self::LoadAttrGetattributeOverridden
        | Self::LoadAttrInstanceValue
        | Self::LoadAttrMethodLazyDict
        | Self::LoadAttrMethodNoDict
        | Self::LoadAttrMethodWithValues
        | Self::LoadAttrModule
        | Self::LoadAttrNondescriptorNoDict
        | Self::LoadAttrNondescriptorWithValues
        | Self::LoadAttrProperty
        | Self::LoadAttrSlot
        | Self::LoadAttrWithHint => Self::LoadAttr { idx: Arg::marker() },
        // BINARY_OP specializations
        Self::BinaryOpAddFloat
        | Self::BinaryOpAddInt
        | Self::BinaryOpAddUnicode
        | Self::BinaryOpExtend
        | Self::BinaryOpInplaceAddUnicode
        | Self::BinaryOpMultiplyFloat
        | Self::BinaryOpMultiplyInt
        | Self::BinaryOpSubscrDict
        | Self::BinaryOpSubscrGetitem
        | Self::BinaryOpSubscrListInt
        | Self::BinaryOpSubscrListSlice
        | Self::BinaryOpSubscrStrInt
        | Self::BinaryOpSubscrTupleInt
        | Self::BinaryOpSubtractFloat
        | Self::BinaryOpSubtractInt => Self::BinaryOp { op: Arg::marker() },
        // CALL specializations
        Self::CallAllocAndEnterInit
        | Self::CallBoundMethodExactArgs
        | Self::CallBoundMethodGeneral
        | Self::CallBuiltinClass
        | Self::CallBuiltinFast
        | Self::CallBuiltinFastWithKeywords
        | Self::CallBuiltinO
        | Self::CallIsinstance
        | Self::CallLen
        | Self::CallListAppend
        | Self::CallMethodDescriptorFast
        | Self::CallMethodDescriptorFastWithKeywords
        | Self::CallMethodDescriptorNoargs
        | Self::CallMethodDescriptorO
        | Self::CallNonPyGeneral
        | Self::CallPyExactArgs
        | Self::CallPyGeneral
        | Self::CallStr1
        | Self::CallTuple1
        | Self::CallType1 => Self::Call {
            nargs: Arg::marker(),
        },
        // CALL_KW specializations
        Self::CallKwBoundMethod | Self::CallKwNonPy | Self::CallKwPy => Self::CallKw {
            nargs: Arg::marker(),
        },
        // TO_BOOL specializations
        Self::ToBoolAlwaysTrue
        | Self::ToBoolBool
        | Self::ToBoolInt
        | Self::ToBoolList
        | Self::ToBoolNone
        | Self::ToBoolStr => Self::ToBool,
        // COMPARE_OP specializations
        Self::CompareOpFloat | Self::CompareOpInt | Self::CompareOpStr => {
            Self::CompareOp { op: Arg::marker() }
        }
        // CONTAINS_OP specializations
        Self::ContainsOpDict | Self::ContainsOpSet => Self::ContainsOp(Arg::marker()),
        // FOR_ITER specializations
        Self::ForIterGen | Self::ForIterList | Self::ForIterRange | Self::ForIterTuple => {
            Self::ForIter {
                target: Arg::marker(),
            }
        }
        // LOAD_GLOBAL specializations
        Self::LoadGlobalBuiltin | Self::LoadGlobalModule => Self::LoadGlobal(Arg::marker()),
        // STORE_ATTR specializations
        Self::StoreAttrInstanceValue | Self::StoreAttrSlot | Self::StoreAttrWithHint => {
            Self::StoreAttr { idx: Arg::marker() }
        }
        // LOAD_SUPER_ATTR specializations
        Self::LoadSuperAttrAttr | Self::LoadSuperAttrMethod => {
            Self::LoadSuperAttr { arg: Arg::marker() }
        }
        // STORE_SUBSCR specializations
        Self::StoreSubscrDict | Self::StoreSubscrListInt => Self::StoreSubscr,
        // UNPACK_SEQUENCE specializations
        Self::UnpackSequenceList | Self::UnpackSequenceTuple | Self::UnpackSequenceTwoTuple => {
            Self::UnpackSequence {
                size: Arg::marker(),
            }
        }
        // SEND specializations
        Self::SendGen => Self::Send {
            target: Arg::marker(),
        },
        // LOAD_CONST specializations
        Self::LoadConstImmortal | Self::LoadConstMortal => {
            Self::LoadConst { idx: Arg::marker() }
        }
        // RESUME specializations
        Self::ResumeCheck => Self::Resume { arg: Arg::marker() },
        // JUMP_BACKWARD specializations
        Self::JumpBackwardJit | Self::JumpBackwardNoJit => Self::JumpBackward {
            target: Arg::marker(),
        },
        // Instrumented opcodes map back to their base; anything else is
        // already a base opcode and passes through unchanged.
        _ => match self.to_base() {
            Some(base) => base,
            None => self,
        },
    }
}

/// Number of CACHE code units that follow this instruction.
/// _PyOpcode_Caches
pub fn cache_entries(self) -> usize {
Expand Down Expand Up @@ -626,8 +746,11 @@ impl Instruction {
| Self::UnpackSequenceTuple
| Self::UnpackSequenceTwoTuple => 1,

// Everything else: 0 cache entries
_ => 0,
// Instrumented opcodes have the same cache entries as their base
_ => match self.to_base() {
Some(base) => base.cache_entries(),
None => 0,
},
}
}
}
Expand Down
5 changes: 2 additions & 3 deletions crates/compiler-core/src/marshal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -662,9 +662,8 @@ pub fn serialize_value<W: Write, D: Dumpable>(

pub fn serialize_code<W: Write, C: Constant>(buf: &mut W, code: &CodeObject<C>) {
write_len(buf, code.instructions.len());
// SAFETY: it's ok to transmute CodeUnit to [u8; 2]
let (_, instructions_bytes, _) = unsafe { code.instructions.align_to() };
buf.write_slice(instructions_bytes);
let original = code.instructions.original_bytes();
buf.write_slice(&original);

write_len(buf, code.locations.len());
for (start, end) in &*code.locations {
Expand Down
Loading
Loading
0