Skip to content

Commit dcaf17a

Browse files
authored
Specialized ops (RustPython#7301)
* Add debug_assert to invoke_exact_args, lazy func_version reassignment
  - Add debug_assert preconditions in invoke_exact_args
  - Add get_version_for_current_state() for lazy version reassignment after func_version invalidation
  - Document NEXT_TYPE_VERSION overflow policy
* working
* Add COMPARE_OP, TO_BOOL, FOR_ITER, LOAD_GLOBAL specialization
  - COMPARE_OP: CompareOpInt, CompareOpFloat, CompareOpStr
  - TO_BOOL: ToBoolBool, ToBoolInt, ToBoolNone, ToBoolList, ToBoolStr
  - FOR_ITER: ForIterRange, ForIterList, ForIterTuple with fast_next()
  - LOAD_GLOBAL: LoadGlobalModule, LoadGlobalBuiltin with dict version guard
  - Add version counter to Dict for mutation tracking
* Add BINARY_SUBSCR, CONTAINS_OP, UNPACK_SEQUENCE, STORE_ATTR specialization
  - BinaryOpSubscrListInt, BinaryOpSubscrTupleInt, BinaryOpSubscrDict
  - ContainsOpDict, ContainsOpSet
  - UnpackSequenceTwoTuple, UnpackSequenceTuple, UnpackSequenceList
  - StoreAttrInstanceValue with type_version guard
  - Deoptimize bytecode for marshal serialization (original_bytes)
  - Separate co_code (deoptimized) from _co_code_adaptive (quickened)
* Add STORE_SUBSCR, BinaryOpAddUnicode, ToBoolAlwaysTrue, CallLen, CallIsinstance, CallType1 specialization
* Add BinaryOpSubscrStrInt, CallStr1, CallTuple1 specialization
* Add BinaryOpInplaceAddUnicode specialization
* Add LoadAttrModule, CallBuiltinO, CallPyGeneral, CallBoundMethodGeneral, ForIterGen, CallListAppend specialization
* Add LoadAttrNondescriptor*, CallMethodDescriptor* specialization
  - LoadAttrNondescriptorNoDict: plain class attr on objects without dict
  - LoadAttrNondescriptorWithValues: plain class attr with dict fallback
  - LoadAttrClass: handler for type attribute access (not yet routed)
  - CallMethodDescriptorNoargs: method descriptor with 0 args
  - CallMethodDescriptorO: method descriptor with 1 arg
  - CallMethodDescriptorFast: method descriptor with multiple args
  - Use HAS_DICT flag instead of obj.dict().is_some() for method/nondescriptor routing
* Add CallBuiltinFast, CallNonPyGeneral specialization
  - CallBuiltinFast: native function calls with arbitrary positional args
  - CallNonPyGeneral: fallback for unmatched callables (custom __call__, etc.)
  - All builtin function calls now specialize (CallBuiltinFast as default)
  - specialize_call now always produces a specialized instruction
* Add SendGen specialization for generator/coroutine send
  - SendGen: direct coro.send() for generator/coroutine receivers
  - Add adaptive counter to Send instruction
  - specialize_send checks builtin_coro for PyGenerator/PyCoroutine
* Add LoadAttrSlot, StoreAttrSlot specialization for __slots__ access
  - LoadAttrSlot: direct obj.get_slot(offset) bypassing descriptor protocol
  - StoreAttrSlot: direct obj.set_slot(offset, value) bypassing descriptor protocol
  - Detect PyMemberDescriptor with MemberGetter::Offset in specialize_load_attr/store_attr
  - Cache slot offset in cache_base+3
* Add LoadSuperAttrAttr, LoadSuperAttrMethod, CallBuiltinClass, CallBuiltinFastWithKeywords, CallMethodDescriptorFastWithKeywords specialization
* Add LoadAttrProperty specialization for property descriptor access
* Add LoadAttrClass specialization for class attribute access
* Add BinaryOpSubscrListSlice specialization
* Add CallKwPy, CallKwBoundMethod, CallKwNonPy specialization
  Fix LoadSuperAttrMethod to push unbound descriptor + self instead of bound method + self which caused double self binding.
  Fix LoadSuperAttrAttr obj_arg condition for classmethod detection.
* Clean up comments in specialization code
  Remove unnecessary CPython references, FIXME→TODO, redundant Note: prefix, and "Same as" cross-references.
* fix check_signals
* fix import
1 parent 6fbb71f commit dcaf17a

10 files changed

Lines changed: 4329 additions & 1580 deletions

File tree

crates/vm/src/builtins/dict.rs

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,11 @@ impl PyDict {
7979
&self.entries
8080
}
8181

82+
/// Monotonically increasing version for mutation tracking.
83+
pub(crate) fn version(&self) -> u64 {
84+
self.entries.version()
85+
}
86+
8287
/// Returns all keys as a Vec, atomically under a single read lock.
8388
/// Thread-safe: prevents "dictionary changed size during iteration" errors.
8489
pub fn keys_vec(&self) -> Vec<PyObjectRef> {

crates/vm/src/builtins/function.rs

Lines changed: 35 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,14 @@ pub struct PyFunction {
8080

8181
static FUNC_VERSION_COUNTER: AtomicU32 = AtomicU32::new(1);
8282

83+
/// Atomically allocate the next function version, returning 0 if exhausted.
84+
/// Once the counter wraps to 0, it stays at 0 permanently.
85+
fn next_func_version() -> u32 {
86+
FUNC_VERSION_COUNTER
87+
.fetch_update(Relaxed, Relaxed, |v| (v != 0).then(|| v.wrapping_add(1)))
88+
.unwrap_or(0)
89+
}
90+
8391
unsafe impl Traverse for PyFunction {
8492
fn traverse(&self, tracer_fn: &mut TraverseFn<'_>) {
8593
self.globals.traverse(tracer_fn);
@@ -204,7 +212,7 @@ impl PyFunction {
204212
annotate: PyMutex::new(None),
205213
module: PyMutex::new(module),
206214
doc: PyMutex::new(doc),
207-
func_version: AtomicU32::new(FUNC_VERSION_COUNTER.fetch_add(1, Relaxed)),
215+
func_version: AtomicU32::new(next_func_version()),
208216
#[cfg(feature = "jit")]
209217
jitted_code: OnceCell::new(),
210218
};
@@ -603,6 +611,22 @@ impl Py<PyFunction> {
603611
self.func_version.load(Relaxed)
604612
}
605613

614+
/// Returns the current version, assigning a fresh one if previously invalidated.
615+
/// Returns 0 if the version counter has overflowed.
616+
/// `_PyFunction_GetVersionForCurrentState`
617+
pub fn get_version_for_current_state(&self) -> u32 {
618+
let v = self.func_version.load(Relaxed);
619+
if v != 0 {
620+
return v;
621+
}
622+
let new_v = next_func_version();
623+
if new_v == 0 {
624+
return 0;
625+
}
626+
self.func_version.store(new_v, Relaxed);
627+
new_v
628+
}
629+
606630
/// Check if this function is eligible for exact-args call specialization.
607631
/// Returns true if: no VARARGS, no VARKEYWORDS, no kwonly args, not generator/coroutine,
608632
/// and effective_nargs matches co_argcount.
@@ -627,6 +651,16 @@ impl Py<PyFunction> {
627651
pub fn invoke_exact_args(&self, args: &[PyObjectRef], vm: &VirtualMachine) -> PyResult {
628652
let code: PyRef<PyCode> = (*self.code).to_owned();
629653

654+
debug_assert_eq!(args.len(), code.arg_count as usize);
655+
debug_assert!(code.flags.contains(bytecode::CodeFlags::NEWLOCALS));
656+
debug_assert!(!code.flags.intersects(
657+
bytecode::CodeFlags::VARARGS
658+
| bytecode::CodeFlags::VARKEYWORDS
659+
| bytecode::CodeFlags::GENERATOR
660+
| bytecode::CodeFlags::COROUTINE
661+
));
662+
debug_assert_eq!(code.kwonlyarg_count, 0);
663+
630664
let frame = Frame::new(
631665
code.clone(),
632666
Scope::new(None, self.globals.clone()),

crates/vm/src/builtins/list.rs

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -637,6 +637,23 @@ impl PyListIterator {
637637
}
638638
}
639639

640+
impl PyListIterator {
641+
/// Fast path for FOR_ITER specialization.
642+
pub(crate) fn fast_next(&self) -> Option<PyObjectRef> {
643+
self.internal
644+
.lock()
645+
.next(|list, pos| {
646+
let vec = list.borrow_vec();
647+
Ok(PyIterReturn::from_result(vec.get(pos).cloned().ok_or(None)))
648+
})
649+
.ok()
650+
.and_then(|r| match r {
651+
PyIterReturn::Return(v) => Some(v),
652+
PyIterReturn::StopIteration(_) => None,
653+
})
654+
}
655+
}
656+
640657
impl SelfIter for PyListIterator {}
641658
impl IterNext for PyListIterator {
642659
fn next(zelf: &Py<Self>, _vm: &VirtualMachine) -> PyResult<PyIterReturn> {

crates/vm/src/builtins/object.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -365,7 +365,7 @@ impl PyBaseObject {
365365
}
366366

367367
#[pyslot]
368-
fn slot_setattro(
368+
pub(crate) fn slot_setattro(
369369
obj: &PyObject,
370370
attr_name: &Py<PyStr>,
371371
value: PySetterValue,

crates/vm/src/builtins/property.rs

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,10 @@ impl PyProperty {
133133
self.getter.read().clone()
134134
}
135135

136+
pub(crate) fn get_fget(&self) -> Option<PyObjectRef> {
137+
self.getter.read().clone()
138+
}
139+
136140
#[pygetset]
137141
fn fset(&self) -> Option<PyObjectRef> {
138142
self.setter.read().clone()

crates/vm/src/builtins/range.rs

Lines changed: 16 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -613,19 +613,6 @@ pub struct PyRangeIterator {
613613
length: usize,
614614
}
615615

616-
impl PyRangeIterator {
617-
/// Advance and return next value without going through the iterator protocol.
618-
#[inline]
619-
pub(crate) fn next_fast(&self) -> Option<isize> {
620-
let index = self.index.fetch_add(1);
621-
if index < self.length {
622-
Some(self.start + (index as isize) * self.step)
623-
} else {
624-
None
625-
}
626-
}
627-
}
628-
629616
impl PyPayload for PyRangeIterator {
630617
#[inline]
631618
fn class(ctx: &Context) -> &'static Py<PyType> {
@@ -660,18 +647,25 @@ impl PyRangeIterator {
660647
}
661648
}
662649

650+
impl PyRangeIterator {
651+
/// Fast path for FOR_ITER specialization. Returns the next isize value
652+
/// without allocating PyInt or PyIterReturn.
653+
pub(crate) fn fast_next(&self) -> Option<isize> {
654+
let index = self.index.fetch_add(1);
655+
if index < self.length {
656+
Some(self.start + (index as isize) * self.step)
657+
} else {
658+
None
659+
}
660+
}
661+
}
662+
663663
impl SelfIter for PyRangeIterator {}
664664
impl IterNext for PyRangeIterator {
665665
fn next(zelf: &Py<Self>, vm: &VirtualMachine) -> PyResult<PyIterReturn> {
666-
// TODO: In pathological case (index == usize::MAX) this can wrap around
667-
// (since fetch_add wraps). This would result in the iterator spinning again
668-
// from the beginning.
669-
let index = zelf.index.fetch_add(1);
670-
let r = if index < zelf.length {
671-
let value = zelf.start + (index as isize) * zelf.step;
672-
PyIterReturn::Return(vm.ctx.new_int(value).into())
673-
} else {
674-
PyIterReturn::StopIteration(None)
666+
let r = match zelf.fast_next() {
667+
Some(value) => PyIterReturn::Return(vm.ctx.new_int(value).into()),
668+
None => PyIterReturn::StopIteration(None),
675669
};
676670
Ok(r)
677671
}

crates/vm/src/builtins/tuple.rs

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -572,6 +572,24 @@ impl PyTupleIterator {
572572
}
573573
}
574574

575+
impl PyTupleIterator {
576+
/// Fast path for FOR_ITER specialization.
577+
pub(crate) fn fast_next(&self) -> Option<PyObjectRef> {
578+
self.internal
579+
.lock()
580+
.next(|tuple, pos| {
581+
Ok(PyIterReturn::from_result(
582+
tuple.get(pos).cloned().ok_or(None),
583+
))
584+
})
585+
.ok()
586+
.and_then(|r| match r {
587+
PyIterReturn::Return(v) => Some(v),
588+
PyIterReturn::StopIteration(_) => None,
589+
})
590+
}
591+
}
592+
575593
impl SelfIter for PyTupleIterator {}
576594
impl IterNext for PyTupleIterator {
577595
fn next(zelf: &Py<Self>, _vm: &VirtualMachine) -> PyResult<PyIterReturn> {

crates/vm/src/builtins/type.rs

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,10 @@ pub struct PyType {
5555
pub tp_version_tag: AtomicU32,
5656
}
5757

58+
/// Monotonic counter for type version tags. Once it reaches `u32::MAX`,
59+
/// `assign_version_tag()` returns 0 permanently, disabling new inline-cache
60+
/// entries but not invalidating correctness (cache misses fall back to the
61+
/// generic path).
5862
static NEXT_TYPE_VERSION: AtomicU32 = AtomicU32::new(1);
5963

6064
unsafe impl crate::object::Traverse for PyType {
@@ -199,7 +203,8 @@ fn is_subtype_with_mro(a_mro: &[PyTypeRef], a: &Py<PyType>, b: &Py<PyType>) -> b
199203
}
200204

201205
impl PyType {
202-
/// Assign a fresh version tag. Returns 0 on overflow (all caches invalidated).
206+
/// Assign a fresh version tag. Returns 0 if the version counter has been
207+
/// exhausted, in which case no new cache entries can be created.
203208
pub fn assign_version_tag(&self) -> u32 {
204209
loop {
205210
let current = NEXT_TYPE_VERSION.load(Ordering::Relaxed);

crates/vm/src/dict_inner.rs

Lines changed: 24 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,9 @@ use crate::{
1717
object::{Traverse, TraverseFn},
1818
};
1919
use alloc::fmt;
20-
use core::{mem::size_of, ops::ControlFlow};
20+
use core::mem::size_of;
21+
use core::ops::ControlFlow;
22+
use core::sync::atomic::{AtomicU64, Ordering::Relaxed};
2123
use num_traits::ToPrimitive;
2224

2325
// HashIndex is intended to be same size with hash::PyHash
@@ -34,6 +36,7 @@ type EntryIndex = usize;
3436

3537
pub struct Dict<T = PyObjectRef> {
3638
inner: PyRwLock<DictInner<T>>,
39+
version: AtomicU64,
3740
}
3841

3942
unsafe impl<T: Traverse> Traverse for Dict<T> {
@@ -98,6 +101,7 @@ impl<T: Clone> Clone for Dict<T> {
98101
fn clone(&self) -> Self {
99102
Self {
100103
inner: PyRwLock::new(self.inner.read().clone()),
104+
version: AtomicU64::new(0),
101105
}
102106
}
103107
}
@@ -111,6 +115,7 @@ impl<T> Default for Dict<T> {
111115
indices: vec![IndexEntry::FREE; 8],
112116
entries: Vec::new(),
113117
}),
118+
version: AtomicU64::new(0),
114119
}
115120
}
116121
}
@@ -254,6 +259,16 @@ impl<T> DictInner<T> {
254259
type PopInnerResult<T> = ControlFlow<Option<DictEntry<T>>>;
255260

256261
impl<T: Clone> Dict<T> {
262+
/// Monotonically increasing version counter for mutation tracking.
263+
pub fn version(&self) -> u64 {
264+
self.version.load(Relaxed)
265+
}
266+
267+
/// Bump the version counter after any mutation.
268+
fn bump_version(&self) {
269+
self.version.fetch_add(1, Relaxed);
270+
}
271+
257272
fn read(&self) -> PyRwLockReadGuard<'_, DictInner<T>> {
258273
self.inner.read()
259274
}
@@ -283,6 +298,7 @@ impl<T: Clone> Dict<T> {
283298
};
284299
if entry.index == index_index {
285300
let removed = core::mem::replace(&mut entry.value, value);
301+
self.bump_version();
286302
// defer dec RC
287303
break Some(removed);
288304
} else {
@@ -298,6 +314,7 @@ impl<T: Clone> Dict<T> {
298314
continue;
299315
}
300316
inner.unchecked_push(index_index, hash, key.to_pyobject(vm), value, entry_index);
317+
self.bump_version();
301318
break None;
302319
}
303320
};
@@ -361,6 +378,7 @@ impl<T: Clone> Dict<T> {
361378
inner.indices.resize(8, IndexEntry::FREE);
362379
inner.used = 0;
363380
inner.filled = 0;
381+
self.bump_version();
364382
// defer dec rc
365383
core::mem::take(&mut inner.entries)
366384
};
@@ -439,6 +457,7 @@ impl<T: Clone> Dict<T> {
439457
continue;
440458
}
441459
inner.unchecked_push(index_index, hash, key.to_owned(), value, entry);
460+
self.bump_version();
442461
break None;
443462
}
444463
};
@@ -475,6 +494,7 @@ impl<T: Clone> Dict<T> {
475494
value.clone(),
476495
index_entry,
477496
);
497+
self.bump_version();
478498
return Ok(value);
479499
}
480500
}
@@ -511,6 +531,7 @@ impl<T: Clone> Dict<T> {
511531
let key_obj = key.to_pyobject(vm);
512532
let ret = (key_obj.clone(), value.clone());
513533
inner.unchecked_push(index_index, hash, key_obj, value, index_entry);
534+
self.bump_version();
514535
return Ok(ret);
515536
}
516537
}
@@ -698,6 +719,7 @@ impl<T: Clone> Dict<T> {
698719
} = IndexEntry::DUMMY;
699720
inner.used -= 1;
700721
let removed = slot.take();
722+
self.bump_version();
701723
Ok(ControlFlow::Break(removed))
702724
}
703725

@@ -727,6 +749,7 @@ impl<T: Clone> Dict<T> {
727749
// entry.index always refers valid index
728750
inner.indices.get_unchecked_mut(entry.index)
729751
} = IndexEntry::DUMMY;
752+
self.bump_version();
730753
Some((entry.key, entry.value))
731754
}
732755

0 commit comments

Comments (0)