E5EE Add vectorcall (PEP 590) dispatch for function calls by youknowone · Pull Request #7329 · RustPython/RustPython · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions crates/vm/src/builtins/builtin_func.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,44 @@ impl fmt::Debug for PyNativeMethod {
}
}

/// Vectorcall for builtin functions (PEP 590).
/// Avoids `prepend_arg` O(n) shift by building args with self at front.
fn vectorcall_native_function(
zelf_obj: &PyObject,
args: Vec<PyObjectRef>,
nargs: usize,
kwnames: Option<&[PyObjectRef]>,
vm: &VirtualMachine,
) -> PyResult {
let zelf: &Py<PyNativeFunction> = zelf_obj.downcast_ref().unwrap();

// Build FuncArgs with self already at position 0 (no insert(0) needed)
let needs_self = zelf
.zelf
.as_ref()
.is_some_and(|_| !zelf.value.flags.contains(PyMethodFlags::STATIC));

let func_args = if needs_self {
let self_obj = zelf.zelf.as_ref().unwrap().clone();
let mut all_args = Vec::with_capacity(args.len() + 1);
all_args.push(self_obj);
all_args.extend(args);
FuncArgs::from_vectorcall(&all_args, nargs + 1, kwnames)
} else {
FuncArgs::from_vectorcall(&args, nargs, kwnames)
};

(zelf.value.func)(vm, func_args)
}

pub fn init(context: &'static Context) {
PyNativeFunction::extend_class(context, context.types.builtin_function_or_method_type);
context
.types
.builtin_function_or_method_type
.slots
.vectorcall
.store(Some(vectorcall_native_function));
}

/// Wrapper that provides access to the common PyNativeFunction data
Expand Down
107 changes: 103 additions & 4 deletions crates/vm/src/builtins/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,7 @@ impl Py<PyFunction> {
/// Skips FuncArgs allocation, prepend_arg, and fill_locals_from_args.
/// Only valid when: no VARARGS, no VARKEYWORDS, no kwonlyargs, not generator/coroutine,
/// and nargs == co_argcount.
pub fn invoke_exact_args(&self, args: &[PyObjectRef], vm: &VirtualMachine) -> PyResult {
pub fn invoke_exact_args(&self, mut args: Vec<PyObjectRef>, vm: &VirtualMachine) -> PyResult {
let code: PyRef<PyCode> = (*self.code).to_owned();

debug_assert_eq!(args.len(), code.arg_count as usize);
Expand All @@ -671,11 +671,11 @@ impl Py<PyFunction> {
)
.into_ref(&vm.ctx);

// Copy args directly into fastlocals
// Move args directly into fastlocals (no clone/refcount needed)
{
let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
for (i, arg) in args.iter().enumerate() {
fastlocals[i] = Some(arg.clone());
for (slot, arg) in fastlocals.iter_mut().zip(args.drain(..)) {
*slot = Some(arg);
}
}

Expand Down Expand Up @@ -1253,8 +1253,107 @@ impl PyCell {
}
}

/// Vectorcall implementation for PyFunction (PEP 590).
/// Takes owned args to avoid cloning when filling fastlocals.
pub(crate) fn vectorcall_function(
zelf_obj: &PyObject,
mut args: Vec<PyObjectRef>,
nargs: usize,
kwnames: Option<&[PyObjectRef]>,
vm: &VirtualMachine,
) -> PyResult {
let zelf: &Py<PyFunction> = zelf_obj.downcast_ref().unwrap();
let code: &Py<PyCode> = &zelf.code;

let has_kwargs = kwnames.is_some_and(|kw| !kw.is_empty());
let is_simple = !has_kwargs
&& !code.flags.contains(bytecode::CodeFlags::VARARGS)
&& !code.flags.contains(bytecode::CodeFlags::VARKEYWORDS)
&& code.kwonlyarg_count == 0
&& !code
.flags
.intersects(bytecode::CodeFlags::GENERATOR | bytecode::CodeFlags::COROUTINE);

if is_simple && nargs == code.arg_count as usize {
// FAST PATH: simple positional-only call, exact arg count.
// Move owned args directly into fastlocals — no clone needed.
let locals = if code.flags.contains(bytecode::CodeFlags::NEWLOCALS) {
ArgMapping::from_dict_exact(vm.ctx.new_dict())
} else {
ArgMapping::from_dict_exact(zelf.globals.clone())
};

let frame = Frame::new(
code.to_owned(),
Scope::new(Some(locals), zelf.globals.clone()),
zelf.builtins.clone(),
Comment on lines +1280 to +1289
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# First, let's examine the code at the specified lines and nearby functions
head -n 1300 crates/vm/src/builtins/function.rs | tail -n 100

Repository: RustPython/RustPython

Length of output: 3133


🏁 Script executed:

#!/bin/bash
# Search for Scope::new() calls to understand the pattern
rg -n "Scope::new\(" crates/vm/src/builtins/function.rs -B 2 -A 2

Repository: RustPython/RustPython

Length of output: 770


🏁 Script executed:

#!/bin/bash
# Examine invoke_with_locals around line 557
sed -n '540,580p' crates/vm/src/builtins/function.rs

Repository: RustPython/RustPython

Length of output: 1528


🏁 Script executed:

#!/bin/bash
# Examine invoke_exact_args around line 666
sed -n '650,690p' crates/vm/src/builtins/function.rs

Repository: RustPython/RustPython

Length of output: 1686


🏁 Script executed:

#!/bin/bash
# Find the Scope struct definition to understand Scope::new() signature
rg -n "impl.*Scope|fn new" crates/vm/src/ -A 3 | grep -A 3 "Scope\|fn new"

Repository: RustPython/RustPython

Length of output: 50377


🏁 Script executed:

#!/bin/bash
# Find and examine the Scope struct definition
sed -n '1,100p' crates/vm/src/scope.rs

Repository: RustPython/RustPython

Length of output: 3555


🏁 Script executed:

#!/bin/bash
# Look for how Scope.locals is used in the frame execution
rg -n "scope\.locals|frame\.scope\.locals" crates/vm/src/ -A 2 -B 2 | head -100

Repository: RustPython/RustPython

Length of output: 497


🏁 Script executed:

#!/bin/bash
# Compare how NEWLOCALS is handled in all three paths more carefully
echo "=== vectorcall_function (lines 1270-1305) ===" && sed -n '1270,1305p' crates/vm/src/builtins/function.rs
echo ""
echo "=== invoke_with_locals (lines 540-575) ===" && sed -n '540,575p' crates/vm/src/builtins/function.rs
echo ""
echo "=== invoke_exact_args (lines 650-685) ===" && sed -n '650,685p' crates/vm/src/builtins/function.rs

Repository: RustPython/RustPython

Length of output: 4488


🏁 Script executed:

#!/bin/bash
# Examine FrameLocals enum and its lazy() method
rg -n "enum FrameLocals|fn lazy|fn with_locals" crates/vm/src/frame.rs -A 5 | head -80

Repository: RustPython/RustPython

Length of output: 362


🏁 Script executed:

#!/bin/bash
# Get more detail on FrameLocals implementation
sed -n '1,150p' crates/vm/src/frame.rs | grep -A 30 "enum FrameLocals"

Repository: RustPython/RustPython

Length of output: 47


🏁 Script executed:

#!/bin/bash
# Find FrameLocals struct definition
sed -n '100,200p' crates/vm/src/frame.rs

Repository: RustPython/RustPython

Length of output: 3120


🏁 Script executed:

#!/bin/bash
# Look at how FrameLocals::lazy() is used in frame execution
rg -n "FrameLocals::lazy" crates/vm/src/frame.rs -B 5 -A 10

Repository: RustPython/RustPython

Length of output: 756


🏁 Script executed:

#!/bin/bash
# Check if there's lazy initialization for NEWLOCALS
rg -n "NEWLOCALS" crates/vm/src/frame.rs -B 3 -A 3

Repository: RustPython/RustPython

Length of output: 1218


Use lazy allocation for NEWLOCALS scope in vectorcall fast path for consistency.

The vectorcall fast path (line 1282) eagerly allocates a new dict for NEWLOCALS via vm.ctx.new_dict(), while both invoke_with_locals (line 554) and invoke_exact_args (line 666) pass None to enable lazy allocation. When Scope::new() receives None with NEWLOCALS, Frame initialization uses FrameLocals::lazy() (frame.rs:356), deferring dict creation until first access. This approach is more efficient since most function frames never access the locals dict, and aligns with Rust memory management best practices.

🔧 Proposed fix
-        let locals = if code.flags.contains(bytecode::CodeFlags::NEWLOCALS) {
-            ArgMapping::from_dict_exact(vm.ctx.new_dict())
-        } else {
-            ArgMapping::from_dict_exact(zelf.globals.clone())
-        };
+        let locals = if code.flags.contains(bytecode::CodeFlags::NEWLOCALS) {
+            None
+        } else {
+            Some(ArgMapping::from_dict_exact(zelf.globals.clone()))
+        };

         let frame = Frame::new(
             code.to_owned(),
-            Scope::new(Some(locals), zelf.globals.clone()),
+            Scope::new(locals, zelf.globals.clone()),
             zelf.builtins.clone(),
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@crates/vm/src/builtins/function.rs` around lines 1280 - 1289, The vectorcall
fast path currently eagerly allocates a new dict for NEWLOCALS (via
vm.ctx.new_dict() and ArgMapping::from_dict_exact) before creating the Frame;
change this to use a None-local mapping so Scope::new receives None when
code.flags contains bytecode::CodeFlags::NEWLOCALS, matching
invoke_with_locals/invoke_exact_args behavior and allowing Frame initialization
to use FrameLocals::lazy() (defer allocation). Update the locals binding used by
Frame::new (the locals variable) to be an Option that is None for NEWLOCALS
instead of constructing vm.ctx.new_dict().

zelf.closure.as_ref().map_or(&[], |c| c.as_slice()),
Some(zelf.to_owned().into()),
vm,
)
.into_ref(&vm.ctx);

{
let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
for (slot, arg) in fastlocals.iter_mut().zip(args.drain(..nargs)) {
*slot = Some(arg);
}
}

if let Some(cell2arg) = code.cell2arg.as_deref() {
let fastlocals = unsafe { frame.fastlocals.borrow_mut() };
for (cell_idx, arg_idx) in cell2arg.iter().enumerate().filter(|(_, i)| **i != -1) {
let x = fastlocals[*arg_idx as usize].take();
frame.set_cell_contents(cell_idx, x);
}
}

return vm.run_frame(frame);
}

// SLOW PATH: construct FuncArgs from owned Vec and delegate to invoke()
let func_args = if has_kwargs {
FuncArgs::from_vectorcall(&args, nargs, kwnames)
} else {
args.truncate(nargs);
FuncArgs::from(args)
};
zelf.invoke(func_args, vm)
}

/// Vectorcall implementation for PyBoundMethod (PEP 590).
fn vectorcall_bound_method(
zelf_obj: &PyObject,
mut args: Vec<PyObjectRef>,
nargs: usize,
kwnames: Option<&[PyObjectRef]>,
vm: &VirtualMachine,
) -> PyResult {
let zelf: &Py<PyBoundMethod> = zelf_obj.downcast_ref().unwrap();

// Insert self at front of existing Vec (avoids 2nd allocation).
// O(n) memmove is cheaper than a 2nd heap alloc+dealloc for typical arg counts.
args.insert(0, zelf.object.clone());
let new_nargs = nargs + 1;
zelf.function.vectorcall(args, new_nargs, kwnames, vm)
}

pub fn init(context: &'static Context) {
PyFunction::extend_class(context, context.types.function_type);
context
.types
.function_type
.slots
.vectorcall
.store(Some(vectorcall_function));

PyBoundMethod::extend_class(context, context.types.bound_method_type);
context
.types
.bound_method_type
.slots
.vectorcall
.store(Some(vectorcall_bound_method));

PyCell::extend_class(context, context.types.cell_type);
}
Loading
Loading
0