From 70169e8858fa7062a1b0dcc244688ef2fc39b377 Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Thu, 5 Mar 2026 10:23:48 +0900 Subject: [PATCH 1/6] Fix thread-safety in GC, type cache, and instruction cache GC / refcount: - Add safe_inc() check for strong()==0 in RefCount - Add try_to_owned() to PyObject for atomic refcount acquire - Replace strong_count()+to_owned() with try_to_owned() in GC collection and weakref callback paths to prevent TOCTOU races Type cache: - Add proper SeqLock (sequence counter) to TypeCacheEntry - Readers spin-wait on odd sequence, validate after read - Writers bracket updates with begin_write/end_write - Use try_to_owned + pointer revalidation on read path - Call modified() BEFORE attribute modification in set_attr Instruction cache: - Add pointer_cache (AtomicUsize array) to CodeUnits for single atomic pointer load/store (prevents torn reads) - Add try_read_cached_descriptor with try_to_owned + pointer and version revalidation after increment - Add write_cached_descriptor with version-bracketed writes RLock: - Fix release() to check is_owned_by_current_thread - Add _release_save/_acquire_restore methods --- Lib/test/_test_multiprocessing.py | 2 - crates/common/src/refcount.rs | 2 +- crates/compiler-core/src/bytecode.rs | 46 ++++--- crates/vm/src/builtins/type.rs | 141 +++++++++++++------- crates/vm/src/frame.rs | 191 +++++++++++++++++---------- crates/vm/src/gc_state.rs | 25 ++-- crates/vm/src/object/core.rs | 35 +++-- crates/vm/src/stdlib/thread.rs | 30 ++++- crates/vm/src/vm/mod.rs | 6 +- 9 files changed, 309 insertions(+), 169 deletions(-) diff --git a/Lib/test/_test_multiprocessing.py b/Lib/test/_test_multiprocessing.py index d05ed68144b..311cdebf44d 100644 --- a/Lib/test/_test_multiprocessing.py +++ b/Lib/test/_test_multiprocessing.py @@ -4815,8 +4815,6 @@ def test_finalize(self): result = [obj for obj in iter(conn.recv, 'STOP')] self.assertEqual(result, ['a', 'b', 'd10', 'd03', 'd02', 'd01', 'e']) - # TODO: RUSTPYTHON - 
gc.get_threshold() and gc.set_threshold() not implemented - @unittest.expectedFailure @support.requires_resource('cpu') def test_thread_safety(self): # bpo-24484: _run_finalizers() should be thread-safe diff --git a/crates/common/src/refcount.rs b/crates/common/src/refcount.rs index 18905d00f3e..4d871a67a0d 100644 --- a/crates/common/src/refcount.rs +++ b/crates/common/src/refcount.rs @@ -123,7 +123,7 @@ impl RefCount { pub fn safe_inc(&self) -> bool { let mut old = State::from_raw(self.state.load(Ordering::Relaxed)); loop { - if old.destructed() { + if old.destructed() || old.strong() == 0 { return false; } if (old.strong() as usize) >= STRONG { diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs index 90b58116164..3c919dd0ca4 100644 --- a/crates/compiler-core/src/bytecode.rs +++ b/crates/compiler-core/src/bytecode.rs @@ -12,7 +12,7 @@ use core::{ cell::UnsafeCell, hash, mem, ops::Deref, - sync::atomic::{AtomicU8, AtomicU16, Ordering}, + sync::atomic::{AtomicU8, AtomicU16, AtomicUsize, Ordering}, }; use itertools::Itertools; use malachite_bigint::BigInt; @@ -411,6 +411,10 @@ impl TryFrom<&[u8]> for CodeUnit { pub struct CodeUnits { units: UnsafeCell>, adaptive_counters: Box<[AtomicU16]>, + /// Pointer-sized cache entries for descriptor pointers. + /// Single atomic load/store prevents torn reads when multiple threads + /// specialize the same instruction concurrently. + pointer_cache: Box<[AtomicUsize]>, } // SAFETY: All cache operations use atomic read/write instructions. 
@@ -432,9 +436,15 @@ impl Clone for CodeUnits { .iter() .map(|c| AtomicU16::new(c.load(Ordering::Relaxed))) .collect(); + let pointer_cache = self + .pointer_cache + .iter() + .map(|c| AtomicUsize::new(c.load(Ordering::Relaxed))) + .collect(); Self { units: UnsafeCell::new(units), adaptive_counters, + pointer_cache, } } } @@ -472,13 +482,19 @@ impl From<[CodeUnit; N]> for CodeUnits { impl From> for CodeUnits { fn from(value: Vec) -> Self { let units = value.into_boxed_slice(); - let adaptive_counters = (0..units.len()) + let len = units.len(); + let adaptive_counters = (0..len) .map(|_| AtomicU16::new(0)) .collect::>() .into_boxed_slice(); + let pointer_cache = (0..len) + .map(|_| AtomicUsize::new(0)) + .collect::>() + .into_boxed_slice(); Self { units: UnsafeCell::new(units), adaptive_counters, + pointer_cache, } } } @@ -600,25 +616,25 @@ impl CodeUnits { lo | (hi << 16) } - /// Write a u64 value across four consecutive CACHE code units starting at `index`. + /// Store a pointer-sized value atomically in the pointer cache at `index`. + /// + /// Uses a single `AtomicUsize` store to prevent torn writes when + /// multiple threads specialize the same instruction concurrently. /// /// # Safety - /// Same requirements as `write_cache_u16`. - pub unsafe fn write_cache_u64(&self, index: usize, value: u64) { - unsafe { - self.write_cache_u32(index, value as u32); - self.write_cache_u32(index + 2, (value >> 32) as u32); - } + /// - `index` must be in bounds. + pub unsafe fn write_cache_ptr(&self, index: usize, value: usize) { + self.pointer_cache[index].store(value, Ordering::Relaxed); } - /// Read a u64 value from four consecutive CACHE code units starting at `index`. + /// Load a pointer-sized value atomically from the pointer cache at `index`. + /// + /// Uses a single `AtomicUsize` load to prevent torn reads. /// /// # Panics - /// Panics if `index + 3` is out of bounds. 
- pub fn read_cache_u64(&self, index: usize) -> u64 { - let lo = self.read_cache_u32(index) as u64; - let hi = self.read_cache_u32(index + 2) as u64; - lo | (hi << 32) + /// Panics if `index` is out of bounds. + pub fn read_cache_ptr(&self, index: usize) -> usize { + self.pointer_cache[index].load(Ordering::Relaxed) } /// Read adaptive counter bits for instruction at `index`. diff --git a/crates/vm/src/builtins/type.rs b/crates/vm/src/builtins/type.rs index 5bc68d38c0f..9e6b15b2323 100644 --- a/crates/vm/src/builtins/type.rs +++ b/crates/vm/src/builtins/type.rs @@ -64,10 +64,9 @@ static NEXT_TYPE_VERSION: AtomicU32 = AtomicU32::new(1); // Method cache (type_cache / MCACHE): direct-mapped cache keyed by // (tp_version_tag, interned_name_ptr). // -// Uses a lock-free SeqLock pattern: -// - version acts as both cache key AND sequence counter -// - Read: load version (Acquire), read value ptr, re-check version -// - Write: set version=0 (invalidate), store value, store version (Release) +// Uses a lock-free SeqLock pattern for the read/write protocol: +// - Readers validate sequence/version/name before and after the value read. +// - Writers bracket updates with sequence odd/even transitions. // No mutex needed on the hot path (cache hit). const TYPE_CACHE_SIZE_EXP: u32 = 12; @@ -75,6 +74,8 @@ const TYPE_CACHE_SIZE: usize = 1 << TYPE_CACHE_SIZE_EXP; const TYPE_CACHE_MASK: usize = TYPE_CACHE_SIZE - 1; struct TypeCacheEntry { + /// Sequence lock (odd = write in progress, even = quiescent). + sequence: AtomicU32, /// tp_version_tag at cache time. 0 = empty/invalid. version: AtomicU32, /// Interned attribute name pointer (pointer equality check). 
@@ -94,12 +95,39 @@ unsafe impl Sync for TypeCacheEntry {} impl TypeCacheEntry { fn new() -> Self { Self { + sequence: AtomicU32::new(0), version: AtomicU32::new(0), name: AtomicPtr::new(core::ptr::null_mut()), value: AtomicPtr::new(core::ptr::null_mut()), } } + #[inline] + fn begin_write(&self) { + self.sequence.fetch_add(1, Ordering::AcqRel); + } + + #[inline] + fn end_write(&self) { + self.sequence.fetch_add(1, Ordering::Release); + } + + #[inline] + fn begin_read(&self) -> u32 { + let mut sequence = self.sequence.load(Ordering::Acquire); + while (sequence & 1) != 0 { + core::hint::spin_loop(); + sequence = self.sequence.load(Ordering::Acquire); + } + sequence + } + + #[inline] + fn end_read(&self, previous: u32) -> bool { + core::sync::atomic::fence(Ordering::Acquire); + self.sequence.load(Ordering::Relaxed) == previous + } + /// Take the value out of this entry, returning the owned PyObjectRef. /// Caller must ensure no concurrent reads can observe this entry /// (version should be set to 0 first). @@ -137,10 +165,14 @@ fn type_cache_clear_version(version: u32) { let mut to_drop = Vec::new(); for entry in TYPE_CACHE.iter() { if entry.version.load(Ordering::Relaxed) == version { - entry.version.store(0, Ordering::Release); - if let Some(v) = entry.take_value() { - to_drop.push(v); + entry.begin_write(); + if entry.version.load(Ordering::Relaxed) == version { + entry.version.store(0, Ordering::Release); + if let Some(v) = entry.take_value() { + to_drop.push(v); + } } + entry.end_write(); } } drop(to_drop); @@ -158,10 +190,12 @@ pub fn type_cache_clear() { // Invalidate all entries and collect values. 
let mut to_drop = Vec::new(); for entry in TYPE_CACHE.iter() { + entry.begin_write(); entry.version.store(0, Ordering::Release); if let Some(v) = entry.take_value() { to_drop.push(v); } + entry.end_write(); } drop(to_drop); TYPE_CACHE_CLEARING.store(false, Ordering::Release); @@ -701,8 +735,11 @@ impl PyType { } pub fn set_attr(&self, attr_name: &'static PyStrInterned, value: PyObjectRef) { - self.attributes.write().insert(attr_name, value); + // Invalidate caches BEFORE modifying attributes so that cached + // descriptor pointers are still alive when type_cache_clear_version + // drops the cache's strong references. self.modified(); + self.attributes.write().insert(attr_name, value); } /// Internal get_attr implementation for fast lookup on a class. @@ -718,41 +755,44 @@ impl PyType { /// find_name_in_mro with method cache (MCACHE). /// Looks in tp_dict of types in MRO, bypasses descriptors. /// - /// Uses a lock-free SeqLock pattern keyed by version: - /// Read: load version → check name → load value → clone → re-check version - /// Write: version=0 → swap value → set name → version=assigned + /// Uses a lock-free SeqLock-style pattern: + /// Read: load sequence/version/name → load value + try_to_owned → + /// validate value pointer + sequence + /// Write: sequence(begin) → version=0 → swap value/name → version=assigned → sequence(end) fn find_name_in_mro(&self, name: &'static PyStrInterned) -> Option { let version = self.tp_version_tag.load(Ordering::Acquire); if version != 0 { let idx = type_cache_hash(version, name); let entry = &TYPE_CACHE[idx]; - let v1 = entry.version.load(Ordering::Acquire); - if v1 == version - && core::ptr::eq( - entry.name.load(Ordering::Relaxed), - name as *const _ as *mut _, - ) - { + let name_ptr = name as *const _ as *mut _; + loop { + let seq1 = entry.begin_read(); + let v1 = entry.version.load(Ordering::Acquire); + let type_version = self.tp_version_tag.load(Ordering::Acquire); + if v1 != type_version + || 
!core::ptr::eq(entry.name.load(Ordering::Relaxed), name_ptr) + { + break; + } let ptr = entry.value.load(Ordering::Acquire); - if !ptr.is_null() { - // SAFETY: The value pointer was stored via PyObjectRef::into_raw - // and is valid as long as the version hasn't changed. We create - // a temporary reference (ManuallyDrop prevents decrement), clone - // it to get our own strong reference, then re-check the version - // to confirm the entry wasn't invalidated during our read. - let cloned = unsafe { - let tmp = core::mem::ManuallyDrop::new(PyObjectRef::from_raw( - NonNull::new_unchecked(ptr), - )); - (*tmp).clone() - }; - // SeqLock validation: if version changed, discard our clone - let v2 = entry.version.load(Ordering::Acquire); - if v2 == v1 { + if ptr.is_null() { + if entry.end_read(seq1) { + break; + } + continue; + } + // _Py_TryIncrefCompare-style validation: + // safe_inc, then ensure the source pointer is unchanged. + let obj: &PyObject = unsafe { &*ptr }; + if let Some(cloned) = obj.try_to_owned() { + let same_ptr = core::ptr::eq(entry.value.load(Ordering::Relaxed), ptr); + if same_ptr && entry.end_read(seq1) { return Some(cloned); } drop(cloned); + continue; } + break; } } @@ -777,16 +817,17 @@ impl PyType { { let idx = type_cache_hash(assigned, name); let entry = &TYPE_CACHE[idx]; + let name_ptr = name as *const _ as *mut _; + entry.begin_write(); // Invalidate first to prevent readers from seeing partial state entry.version.store(0, Ordering::Release); // Swap in new value (refcount held by cache) let new_ptr = found.clone().into_raw().as_ptr(); let old_ptr = entry.value.swap(new_ptr, Ordering::Relaxed); - entry - .name - .store(name as *const _ as *mut _, Ordering::Relaxed); + entry.name.store(name_ptr, Ordering::Relaxed); // Activate entry — Release ensures value/name writes are visible entry.version.store(assigned, Ordering::Release); + entry.end_write(); // Drop previous occupant (its version was already invalidated) if !old_ptr.is_null() { unsafe { 
@@ -832,20 +873,24 @@ impl PyType { if version != 0 { let idx = type_cache_hash(version, name); let entry = &TYPE_CACHE[idx]; - let v1 = entry.version.load(Ordering::Acquire); - if v1 == version - && core::ptr::eq( - entry.name.load(Ordering::Relaxed), - name as *const _ as *mut _, - ) - { + let name_ptr = name as *const _ as *mut _; + loop { + let seq1 = entry.begin_read(); + let v1 = entry.version.load(Ordering::Acquire); + let type_version = self.tp_version_tag.load(Ordering::Acquire); + if v1 != type_version + || !core::ptr::eq(entry.name.load(Ordering::Relaxed), name_ptr) + { + break; + } let ptr = entry.value.load(Ordering::Acquire); - if !ptr.is_null() { - let v2 = entry.version.load(Ordering::Acquire); - if v2 == v1 { + if entry.end_read(seq1) { + if !ptr.is_null() { return true; } + break; } + continue; } } @@ -1498,8 +1543,8 @@ impl PyType { PySetterValue::Assign(ref val) => { let key = identifier!(vm, __type_params__); self.check_set_special_type_attr(key, vm)?; - self.attributes.write().insert(key, val.clone().into()); self.modified(); + self.attributes.write().insert(key, val.clone().into()); } PySetterValue::Delete => { // For delete, we still need to check if the type is immutable @@ -1510,8 +1555,8 @@ impl PyType { ))); } let key = identifier!(vm, __type_params__); - self.attributes.write().shift_remove(&key); self.modified(); + self.attributes.write().shift_remove(&key); } } Ok(()) diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index 8032cf2802d..8d18e64e46b 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -3257,10 +3257,10 @@ impl ExecutingFrame<'_> { let owner = self.top_value(); let type_version = self.code.instructions.read_cache_u32(cache_base + 1); - if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version { - // Cache hit: load the cached method descriptor - let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); - let func = unsafe { &*(descr_ptr as *const PyObject) 
}.to_owned(); + if type_version != 0 + && owner.class().tp_version_tag.load(Acquire) == type_version + && let Some(func) = self.try_read_cached_descriptor(cache_base, type_version) + { let owner = self.pop_value(); self.push_value(func); self.push_value(owner); @@ -3287,9 +3287,8 @@ impl ExecutingFrame<'_> { if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version && owner.dict().is_none() + && let Some(func) = self.try_read_cached_descriptor(cache_base, type_version) { - let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); - let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned(); let owner = self.pop_value(); self.push_value(func); self.push_value(owner); @@ -3345,10 +3344,10 @@ impl ExecutingFrame<'_> { false }; - if !shadowed { - // Cache hit: load the cached method descriptor - let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); - let func = unsafe { &*(descr_ptr as *const PyObject) }.to_owned(); + if !shadowed + && let Some(func) = + self.try_read_cached_descriptor(cache_base, type_version) + { let owner = self.pop_value(); self.push_value(func); self.push_value(owner); @@ -3475,10 +3474,10 @@ impl ExecutingFrame<'_> { let owner = self.top_value(); let type_version = self.code.instructions.read_cache_u32(cache_base + 1); - if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version { - // Load cached class attribute directly (no dict, no data descriptor) - let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); - let attr = unsafe { &*(descr_ptr as *const PyObject) }.to_owned(); + if type_version != 0 + && owner.class().tp_version_tag.load(Acquire) == type_version + && let Some(attr) = self.try_read_cached_descriptor(cache_base, type_version) + { self.pop_value(); if oparg.is_method() { self.push_value(attr); @@ -3528,8 +3527,17 @@ impl ExecutingFrame<'_> { return Ok(None); } // Not in instance dict — use cached class attr - let descr_ptr = 
self.code.instructions.read_cache_u64(cache_base + 5); - let attr = unsafe { &*(descr_ptr as *const PyObject) }.to_owned(); + let Some(attr) = self.try_read_cached_descriptor(cache_base, type_version) + else { + self.deoptimize_at( + Instruction::LoadAttr { + namei: Arg::marker(), + }, + instr_idx, + cache_base, + ); + return self.load_attr_slow(vm, oparg); + }; self.pop_value(); if oparg.is_method() { self.push_value(attr); @@ -3566,9 +3574,8 @@ impl ExecutingFrame<'_> { if type_version != 0 && let Some(owner_type) = owner.downcast_ref::() && owner_type.tp_version_tag.load(Acquire) == type_version + && let Some(attr) = self.try_read_cached_descriptor(cache_base, type_version) { - let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); - let attr = unsafe { &*(descr_ptr as *const PyObject) }.to_owned(); self.pop_value(); if oparg.is_method() { self.push_value(attr); @@ -3608,9 +3615,8 @@ impl ExecutingFrame<'_> { && let Some(owner_type) = owner.downcast_ref::() && owner_type.tp_version_tag.load(Acquire) == type_version && owner.class().tp_version_tag.load(Acquire) == metaclass_version + && let Some(attr) = self.try_read_cached_descriptor(cache_base, type_version) { - let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); - let attr = unsafe { &*(descr_ptr as *const PyObject) }.to_owned(); self.pop_value(); if oparg.is_method() { self.push_value(attr); @@ -3683,19 +3689,16 @@ impl ExecutingFrame<'_> { let owner = self.top_value(); let type_version = self.code.instructions.read_cache_u32(cache_base + 1); - if type_version != 0 && owner.class().tp_version_tag.load(Acquire) == type_version { - let descr_ptr = self.code.instructions.read_cache_u64(cache_base + 5); - if descr_ptr != 0 { - let descr = unsafe { &*(descr_ptr as *const PyObject) }; - if let Some(prop) = descr.downcast_ref::() { - let owner = self.pop_value(); - if let Some(getter) = prop.get_fget() { - let result = getter.call((owner,), vm)?; - self.push_value(result); - return 
Ok(None); - } - } - } + if type_version != 0 + && owner.class().tp_version_tag.load(Acquire) == type_version + && let Some(descr) = self.try_read_cached_descriptor(cache_base, type_version) + && let Some(prop) = descr.downcast_ref::() + && let Some(getter) = prop.get_fget() + { + let owner = self.pop_value(); + let result = getter.call((owner,), vm)?; + self.push_value(result); + return Ok(None); } unsafe { self.code.instructions.replace_op( @@ -6883,6 +6886,71 @@ impl ExecutingFrame<'_> { Ok(None) } + #[inline] + fn try_read_cached_descriptor( + &self, + cache_base: usize, + expected_type_version: u32, + ) -> Option { + let descr_ptr = self.code.instructions.read_cache_ptr(cache_base + 5); + if descr_ptr == 0 { + return None; + } + let descr = unsafe { &*(descr_ptr as *const PyObject) }; + let cloned = descr.try_to_owned()?; + if self.code.instructions.read_cache_u32(cache_base + 1) == expected_type_version + && self.code.instructions.read_cache_ptr(cache_base + 5) == descr_ptr + { + Some(cloned) + } else { + drop(cloned); + None + } + } + + #[inline] + unsafe fn write_cached_descriptor( + &self, + cache_base: usize, + type_version: u32, + descr_ptr: usize, + ) { + // Publish descriptor cache atomically as a tuple: + // invalidate version first, then write payload, then publish version. + unsafe { + self.code.instructions.write_cache_u32(cache_base + 1, 0); + self.code + .instructions + .write_cache_ptr(cache_base + 5, descr_ptr); + self.code + .instructions + .write_cache_u32(cache_base + 1, type_version); + } + } + + #[inline] + unsafe fn write_cached_descriptor_with_metaclass( + &self, + cache_base: usize, + type_version: u32, + metaclass_version: u32, + descr_ptr: usize, + ) { + // Same publish protocol as write_cached_descriptor(), plus metaclass guard. 
+ unsafe { + self.code.instructions.write_cache_u32(cache_base + 1, 0); + self.code + .instructions + .write_cache_u32(cache_base + 3, metaclass_version); + self.code + .instructions + .write_cache_ptr(cache_base + 5, descr_ptr); + self.code + .instructions + .write_cache_u32(cache_base + 1, type_version); + } + } + fn load_attr(&mut self, vm: &VirtualMachine, oparg: LoadAttr) -> FrameResult { self.adaptive(|s, ii, cb| s.specialize_load_attr(vm, oparg, ii, cb)); self.load_attr_slow(vm, oparg) @@ -6973,14 +7041,9 @@ impl ExecutingFrame<'_> { .flags .has_feature(PyTypeFlags::METHOD_DESCRIPTOR) { - let descr_ptr = &**descr as *const PyObject as u64; + let descr_ptr = &**descr as *const PyObject as usize; unsafe { - self.code - .instructions - .write_cache_u32(cache_base + 1, type_version); - self.code - .instructions - .write_cache_u64(cache_base + 5, descr_ptr); + self.write_cached_descriptor(cache_base, type_version, descr_ptr); } let new_op = if !class_has_dict { @@ -7032,14 +7095,9 @@ impl ExecutingFrame<'_> { && descr.downcast_ref::().is_some() { // Property descriptor — cache the property object pointer - let descr_ptr = &**descr as *const PyObject as u64; + let descr_ptr = &**descr as *const PyObject as usize; unsafe { - self.code - .instructions - .write_cache_u32(cache_base + 1, type_version); - self.code - .instructions - .write_cache_u64(cache_base + 5, descr_ptr); + self.write_cached_descriptor(cache_base, type_version, descr_ptr); } self.specialize_at(instr_idx, cache_base, Instruction::LoadAttrProperty); } else { @@ -7065,14 +7123,9 @@ impl ExecutingFrame<'_> { } else if class_has_dict { if let Some(ref descr) = cls_attr { // Plain class attr + class supports dict — check dict first, fallback - let descr_ptr = &**descr as *const PyObject as u64; + let descr_ptr = &**descr as *const PyObject as usize; unsafe { - self.code - .instructions - .write_cache_u32(cache_base + 1, type_version); - self.code - .instructions - .write_cache_u64(cache_base + 5, 
descr_ptr); + self.write_cached_descriptor(cache_base, type_version, descr_ptr); } self.specialize_at( instr_idx, @@ -7119,14 +7172,9 @@ impl ExecutingFrame<'_> { } } else if let Some(ref descr) = cls_attr { // No dict support, plain class attr — cache directly - let descr_ptr = &**descr as *const PyObject as u64; + let descr_ptr = &**descr as *const PyObject as usize; unsafe { - self.code - .instructions - .write_cache_u32(cache_base + 1, type_version); - self.code - .instructions - .write_cache_u64(cache_base + 5, descr_ptr); + self.write_cached_descriptor(cache_base, type_version, descr_ptr); } self.specialize_at( instr_idx, @@ -7220,22 +7268,23 @@ impl ExecutingFrame<'_> { let has_descr_get = descr_class.slots.descr_get.load().is_some(); if !has_descr_get { // METHOD or NON_DESCRIPTOR — can cache directly - let descr_ptr = &**descr as *const PyObject as u64; + let descr_ptr = &**descr as *const PyObject as usize; let new_op = if metaclass_version == 0 { Instruction::LoadAttrClass } else { Instruction::LoadAttrClassWithMetaclassCheck }; unsafe { - self.code - .instructions - .write_cache_u32(cache_base + 1, type_version); - self.code - .instructions - .write_cache_u32(cache_base + 3, metaclass_version); - self.code - .instructions - .write_cache_u64(cache_base + 5, descr_ptr); + if metaclass_version == 0 { + self.write_cached_descriptor(cache_base, type_version, descr_ptr); + } else { + self.write_cached_descriptor_with_metaclass( + cache_base, + type_version, + metaclass_version, + descr_ptr, + ); + } } self.specialize_at(instr_idx, cache_base, new_op); return; diff --git a/crates/vm/src/gc_state.rs b/crates/vm/src/gc_state.rs index 2b3b3ccd3a2..e8e83bba49c 100644 --- a/crates/vm/src/gc_state.rs +++ b/crates/vm/src/gc_state.rs @@ -299,13 +299,7 @@ impl GcState { fn collect_from_list( list: &LinkedList, ) -> impl Iterator + '_ { - list.iter().filter_map(|obj| { - if obj.strong_count() > 0 { - Some(obj.to_owned()) - } else { - None - } - }) + 
list.iter().filter_map(|obj| obj.try_to_owned()) } match generation { @@ -468,15 +462,16 @@ impl GcState { // After dropping gen_locks, other threads can untrack+free objects, // making the raw pointers in `reachable`/`unreachable` dangling. // Strong refs keep objects alive for later phases. + // + // Use try_to_owned() (CAS-based) instead of strong_count()+to_owned() + // to prevent a TOCTOU race: another thread can dec() the count to 0 + // between the check and the increment, causing a use-after-free when + // the destroying thread eventually frees the memory. let survivor_refs: Vec = reachable .iter() .filter_map(|ptr| { let obj = unsafe { ptr.0.as_ref() }; - if obj.strong_count() > 0 { - Some(obj.to_owned()) - } else { - None - } + obj.try_to_owned() }) .collect(); @@ -484,11 +479,7 @@ impl GcState { .iter() .filter_map(|ptr| { let obj = unsafe { ptr.0.as_ref() }; - if obj.strong_count() > 0 { - Some(obj.to_owned()) - } else { - None - } + obj.try_to_owned() }) .collect(); diff --git a/crates/vm/src/object/core.rs b/crates/vm/src/object/core.rs index fce1eaf7e35..fc2d679a96f 100644 --- a/crates/vm/src/object/core.rs +++ b/crates/vm/src/object/core.rs @@ -576,11 +576,12 @@ impl WeakRefList { ptrs.as_mut().set_next(None); } - // Collect callback if present and weakref is still alive - if wr.0.ref_count.get() > 0 { + // Collect callback only if we can still acquire a strong ref. + if wr.0.ref_count.safe_inc() { + let wr_ref = unsafe { PyRef::from_raw(wr as *const Py) }; let cb = unsafe { wr.0.payload.callback.get().replace(None) }; if let Some(cb) = cb { - callbacks.push((wr.to_owned(), cb)); + callbacks.push((wr_ref, cb)); } } @@ -626,11 +627,12 @@ impl WeakRefList { ptrs.as_mut().set_next(None); } - // Collect callback without invoking - if wr.0.ref_count.get() > 0 { + // Collect callback without invoking only if we can keep weakref alive. 
+ if wr.0.ref_count.safe_inc() { + let wr_ref = unsafe { PyRef::from_raw(wr as *const Py) }; let cb = unsafe { wr.0.payload.callback.get().replace(None) }; if let Some(cb) = cb { - callbacks.push((wr.to_owned(), cb)); + callbacks.push((wr_ref, cb)); } } @@ -660,8 +662,8 @@ impl WeakRefList { let mut current = NonNull::new(self.head.load(Ordering::Relaxed)); while let Some(node) = current { let wr = unsafe { node.as_ref() }; - if wr.0.ref_count.get() > 0 { - v.push(wr.to_owned()); + if wr.0.ref_count.safe_inc() { + v.push(unsafe { PyRef::from_raw(wr as *const Py) }); } current = unsafe { WeakLink::pointers(node).as_ref().get_next() }; } @@ -952,6 +954,23 @@ impl ToOwned for PyObject { } } +impl PyObject { + /// Atomically try to create a strong reference. + /// Returns `None` if the strong count is already 0 (object being destroyed). + /// Uses CAS to prevent the TOCTOU race between checking strong_count and + /// incrementing it. + #[inline] + pub fn try_to_owned(&self) -> Option { + if self.0.ref_count.safe_inc() { + Some(PyObjectRef { + ptr: NonNull::from(self), + }) + } else { + None + } + } +} + impl PyObjectRef { #[inline(always)] pub const fn into_raw(self) -> NonNull { diff --git a/crates/vm/src/stdlib/thread.rs b/crates/vm/src/stdlib/thread.rs index a3ff23cf234..12a6a340d7a 100644 --- a/crates/vm/src/stdlib/thread.rs +++ b/crates/vm/src/stdlib/thread.rs @@ -231,8 +231,8 @@ pub(crate) mod _thread { #[pymethod] #[pymethod(name = "release_lock")] fn release(&self, vm: &VirtualMachine) -> PyResult<()> { - if !self.mu.is_locked() { - return Err(vm.new_runtime_error("release unlocked lock")); + if !self.mu.is_owned_by_current_thread() { + return Err(vm.new_runtime_error("cannot release un-acquired lock")); } debug_assert!( self.count.load(core::sync::atomic::Ordering::Relaxed) > 0, @@ -278,6 +278,32 @@ pub(crate) mod _thread { } } + #[pymethod] + fn _release_save(&self, vm: &VirtualMachine) -> PyResult<(usize, u64)> { + if !self.mu.is_owned_by_current_thread() { 
+ return Err(vm.new_runtime_error("cannot release un-acquired lock")); + } + let count = self.count.swap(0, core::sync::atomic::Ordering::Relaxed); + debug_assert!(count > 0, "RLock count underflow"); + for _ in 0..count { + unsafe { self.mu.unlock() }; + } + Ok((count, current_thread_id())) + } + + #[pymethod] + fn _acquire_restore(&self, state: (usize, u64), _vm: &VirtualMachine) { + let (count, _owner) = state; + if count == 0 { + return; + } + for _ in 0..count { + self.mu.lock(); + } + self.count + .store(count, core::sync::atomic::Ordering::Relaxed); + } + #[pymethod] fn __exit__(&self, _args: FuncArgs, vm: &VirtualMachine) -> PyResult<()> { self.release(vm) diff --git a/crates/vm/src/vm/mod.rs b/crates/vm/src/vm/mod.rs index eb1c37ed2f7..6461502a582 100644 --- a/crates/vm/src/vm/mod.rs +++ b/crates/vm/src/vm/mod.rs @@ -22,7 +22,7 @@ use crate::{ self, PyBaseExceptionRef, PyDict, PyDictRef, PyInt, PyList, PyModule, PyStr, PyStrInterned, PyStrRef, PyTypeRef, PyUtf8Str, PyUtf8StrInterned, PyWeak, code::PyCode, - dict::{PyDictItems, PyDictKeys, PyDictValues}, + dict::{PyDictItems, PyDictValues}, pystr::AsPyStr, tuple::PyTuple, }, @@ -1319,10 +1319,6 @@ impl VirtualMachine { } else if cls.is(self.ctx.types.list_type) { list_borrow = value.downcast_ref::().unwrap().borrow_vec(); &list_borrow - } else if cls.is(self.ctx.types.dict_keys_type) { - // Atomic snapshot of dict keys - prevents race condition during iteration - let keys = value.downcast_ref::().unwrap().dict.keys_vec(); - return keys.into_iter().map(func).collect(); } else if cls.is(self.ctx.types.dict_values_type) { // Atomic snapshot of dict values - prevents race condition during iteration let values = value From b0ad91788104f6489804f0d8625b85b258d9de8d Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Thu, 5 Mar 2026 17:14:50 +0900 Subject: [PATCH 2/6] Fix RLock _acquire_restore tuple handling and unxfail threading test --- Lib/test/test_threading.py | 1 - crates/vm/src/stdlib/thread.rs | 13 
++++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_threading.py b/Lib/test/test_threading.py index b97ec0101ca..5ab69a7fa4f 100644 --- a/Lib/test/test_threading.py +++ b/Lib/test/test_threading.py @@ -2141,7 +2141,6 @@ def __init__(self, a, *, b) -> None: CustomRLock(1, b=2) self.assertEqual(warnings_log, []) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_release_save_unacquired(self): return super().test_release_save_unacquired() diff --git a/crates/vm/src/stdlib/thread.rs b/crates/vm/src/stdlib/thread.rs index 12a6a340d7a..80ac30a549e 100644 --- a/crates/vm/src/stdlib/thread.rs +++ b/crates/vm/src/stdlib/thread.rs @@ -292,16 +292,23 @@ pub(crate) mod _thread { } #[pymethod] - fn _acquire_restore(&self, state: (usize, u64), _vm: &VirtualMachine) { - let (count, _owner) = state; + fn _acquire_restore(&self, state: PyTupleRef, vm: &VirtualMachine) -> PyResult<()> { + let [count_obj, owner_obj] = state.as_slice() else { + return Err( + vm.new_type_error("_acquire_restore() argument 1 must be a 2-item tuple") + ); + }; + let count: usize = count_obj.clone().try_into_value(vm)?; + let _owner: u64 = owner_obj.clone().try_into_value(vm)?; if count == 0 { - return; + return Ok(()); } for _ in 0..count { self.mu.lock(); } self.count .store(count, core::sync::atomic::Ordering::Relaxed); + Ok(()) } #[pymethod] From de010994143681b51d739e7dacad70c352ff2490 Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Thu, 5 Mar 2026 17:30:42 +0900 Subject: [PATCH 3/6] Align type cache seqlock writer protocol with CPython --- crates/compiler-core/src/bytecode.rs | 4 ++++ crates/vm/src/builtins/type.rs | 23 ++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/crates/compiler-core/src/bytecode.rs b/crates/compiler-core/src/bytecode.rs index 3c919dd0ca4..46182962654 100644 --- a/crates/compiler-core/src/bytecode.rs +++ b/crates/compiler-core/src/bytecode.rs @@ -623,6 +623,10 @@ impl CodeUnits { /// /// # 
Safety /// - `index` must be in bounds. + /// - `value` must be `0` or a valid `*const PyObject` encoded as `usize`. + /// - Callers must follow the cache invalidation/upgrade protocol: + /// invalidate the version guard before writing and publish the new + /// version after writing. pub unsafe fn write_cache_ptr(&self, index: usize, value: usize) { self.pointer_cache[index].store(value, Ordering::Relaxed); } diff --git a/crates/vm/src/builtins/type.rs b/crates/vm/src/builtins/type.rs index 9e6b15b2323..7947ab1958f 100644 --- a/crates/vm/src/builtins/type.rs +++ b/crates/vm/src/builtins/type.rs @@ -104,7 +104,28 @@ impl TypeCacheEntry { #[inline] fn begin_write(&self) { - self.sequence.fetch_add(1, Ordering::AcqRel); + let mut seq = self.sequence.load(Ordering::Acquire); + loop { + while (seq & 1) != 0 { + core::hint::spin_loop(); + seq = self.sequence.load(Ordering::Acquire); + } + match self.sequence.compare_exchange_weak( + seq, + seq.wrapping_add(1), + Ordering::AcqRel, + Ordering::Acquire, + ) { + Ok(_) => { + core::sync::atomic::fence(Ordering::Release); + break; + } + Err(observed) => { + core::hint::spin_loop(); + seq = observed; + } + } + } } #[inline] From 8683977378340ee67501bb85d997b8407e440aaa Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Thu, 5 Mar 2026 17:41:40 +0900 Subject: [PATCH 4/6] RLock: use single parking_lot level, track recursion manually Instead of calling lock()/unlock() N times for recursion depth N, keep parking_lot at 1 level and manage the count ourselves. This makes acquire/release O(1) and matches CPython's _PyRecursiveMutex approach (lock once + set level directly). 
---
 Lib/test/test_threading.py     |  3 ---
 crates/vm/src/stdlib/thread.rs | 30 ++++++++++++++++--------------
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/Lib/test/test_threading.py b/Lib/test/test_threading.py
index 5ab69a7fa4f..8db0bbdb949 100644
--- a/Lib/test/test_threading.py
+++ b/Lib/test/test_threading.py
@@ -2141,9 +2141,6 @@ def __init__(self, a, *, b) -> None:
         CustomRLock(1, b=2)
         self.assertEqual(warnings_log, [])
 
-    def test_release_save_unacquired(self):
-        return super().test_release_save_unacquired()
-
     @unittest.skip('TODO: RUSTPYTHON; flaky test')
     def test_different_thread(self):
         return super().test_different_thread()
diff --git a/crates/vm/src/stdlib/thread.rs b/crates/vm/src/stdlib/thread.rs
index 80ac30a549e..45be328dc1e 100644
--- a/crates/vm/src/stdlib/thread.rs
+++ b/crates/vm/src/stdlib/thread.rs
@@ -221,10 +221,16 @@ pub(crate) mod _thread {
         #[pymethod(name = "acquire_lock")]
         #[pymethod(name = "__enter__")]
         fn acquire(&self, args: AcquireArgs, vm: &VirtualMachine) -> PyResult<bool> {
-            let result = acquire_lock_impl!(&self.mu, args, vm)?;
-            if result {
+            if self.mu.is_owned_by_current_thread() {
+                // Re-entrant acquisition: just increment our count.
+                // parking_lot stays at 1 level; we track recursion ourselves.
self.count .fetch_add(1, core::sync::atomic::Ordering::Relaxed); + return Ok(true); + } + let result = acquire_lock_impl!(&self.mu, args, vm)?; + if result { + self.count.store(1, core::sync::atomic::Ordering::Relaxed); } Ok(result) } @@ -234,13 +240,13 @@ pub(crate) mod _thread { if !self.mu.is_owned_by_current_thread() { return Err(vm.new_runtime_error("cannot release un-acquired lock")); } - debug_assert!( - self.count.load(core::sync::atomic::Ordering::Relaxed) > 0, - "RLock count underflow" - ); - self.count + let prev = self + .count .fetch_sub(1, core::sync::atomic::Ordering::Relaxed); - unsafe { self.mu.unlock() }; + debug_assert!(prev > 0, "RLock count underflow"); + if prev == 1 { + unsafe { self.mu.unlock() }; + } Ok(()) } @@ -285,9 +291,7 @@ pub(crate) mod _thread { } let count = self.count.swap(0, core::sync::atomic::Ordering::Relaxed); debug_assert!(count > 0, "RLock count underflow"); - for _ in 0..count { - unsafe { self.mu.unlock() }; - } + unsafe { self.mu.unlock() }; Ok((count, current_thread_id())) } @@ -303,9 +307,7 @@ pub(crate) mod _thread { if count == 0 { return Ok(()); } - for _ in 0..count { - self.mu.lock(); - } + self.mu.lock(); self.count .store(count, core::sync::atomic::Ordering::Relaxed); Ok(()) From 3e31d2230fbee85d8222c65b6587f3e91aba1de5 Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Thu, 5 Mar 2026 18:01:35 +0900 Subject: [PATCH 5/6] Add try_to_owned_from_ptr to avoid &PyObject on stale ptrs Use addr_of! to access ref_count directly from a raw pointer without forming &PyObject first. Applied in type cache and instruction cache hit paths where the pointer may be stale. 
--- crates/vm/src/builtins/type.rs | 5 ++--- crates/vm/src/frame.rs | 3 +-- crates/vm/src/object/core.rs | 23 +++++++++++++++++++++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/crates/vm/src/builtins/type.rs b/crates/vm/src/builtins/type.rs index 7947ab1958f..cca8c4692e6 100644 --- a/crates/vm/src/builtins/type.rs +++ b/crates/vm/src/builtins/type.rs @@ -803,9 +803,8 @@ impl PyType { continue; } // _Py_TryIncrefCompare-style validation: - // safe_inc, then ensure the source pointer is unchanged. - let obj: &PyObject = unsafe { &*ptr }; - if let Some(cloned) = obj.try_to_owned() { + // safe_inc via raw pointer, then ensure source is unchanged. + if let Some(cloned) = unsafe { PyObject::try_to_owned_from_ptr(ptr) } { let same_ptr = core::ptr::eq(entry.value.load(Ordering::Relaxed), ptr); if same_ptr && entry.end_read(seq1) { return Some(cloned); diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs index 8d18e64e46b..376c9ed6bd1 100644 --- a/crates/vm/src/frame.rs +++ b/crates/vm/src/frame.rs @@ -6896,8 +6896,7 @@ impl ExecutingFrame<'_> { if descr_ptr == 0 { return None; } - let descr = unsafe { &*(descr_ptr as *const PyObject) }; - let cloned = descr.try_to_owned()?; + let cloned = unsafe { PyObject::try_to_owned_from_ptr(descr_ptr as *mut PyObject) }?; if self.code.instructions.read_cache_u32(cache_base + 1) == expected_type_version && self.code.instructions.read_cache_ptr(cache_base + 5) == descr_ptr { diff --git a/crates/vm/src/object/core.rs b/crates/vm/src/object/core.rs index fc2d679a96f..70c3569eaa4 100644 --- a/crates/vm/src/object/core.rs +++ b/crates/vm/src/object/core.rs @@ -969,6 +969,29 @@ impl PyObject { None } } + + /// Like [`try_to_owned`](Self::try_to_owned), but from a raw pointer. + /// + /// Uses `addr_of!` to access `ref_count` without forming `&PyObject`, + /// minimising the borrow scope when the pointer may be stale + /// (e.g. cache-hit paths protected by version guards). 
+    ///
+    /// # Safety
+    /// `ptr` must point to a live (not yet deallocated) `PyObject`, or to
+    /// memory whose `ref_count` field is still atomically readable
+    /// (same guarantee as `_Py_TryIncRefShared`).
+    #[inline]
+    pub unsafe fn try_to_owned_from_ptr(ptr: *mut Self) -> Option<PyObjectRef> {
+        let inner = ptr.cast::<PyInner<Erased>>();
+        let ref_count = unsafe { &*core::ptr::addr_of!((*inner).ref_count) };
+        if ref_count.safe_inc() {
+            Some(PyObjectRef {
+                ptr: unsafe { NonNull::new_unchecked(ptr) },
+            })
+        } else {
+            None
+        }
+    }
 }
 
 impl PyObjectRef {

From b3de227e0a47007e8676382306d18d2c09e457f Mon Sep 17 00:00:00 2001
From: "Jeong, YunWon"
Date: Thu, 5 Mar 2026 19:38:38 +0900
Subject: [PATCH 6/6] Fix CI: spelling typo and xfail flaky test_thread_safety

- Fix "minimising" -> "minimizing" for cspell
- xfail test_thread_safety: dict iteration races with concurrent GC mutations in _finalizer_registry
---
 Lib/test/_test_multiprocessing.py | 2 ++
 crates/vm/src/object/core.rs      | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/Lib/test/_test_multiprocessing.py b/Lib/test/_test_multiprocessing.py
index 311cdebf44d..894cebda57b 100644
--- a/Lib/test/_test_multiprocessing.py
+++ b/Lib/test/_test_multiprocessing.py
@@ -4816,6 +4816,8 @@ def test_finalize(self):
         self.assertEqual(result, ['a', 'b', 'd10', 'd03', 'd02', 'd01', 'e'])
 
     @support.requires_resource('cpu')
+    # TODO: RUSTPYTHON; dict iteration races with concurrent GC mutations
+    @unittest.expectedFailure
     def test_thread_safety(self):
         # bpo-24484: _run_finalizers() should be thread-safe
         def cb():
diff --git a/crates/vm/src/object/core.rs b/crates/vm/src/object/core.rs
index 70c3569eaa4..a78b906ceea 100644
--- a/crates/vm/src/object/core.rs
+++ b/crates/vm/src/object/core.rs
@@ -973,7 +973,7 @@ impl PyObject {
     /// Like [`try_to_owned`](Self::try_to_owned), but from a raw pointer.
/// /// Uses `addr_of!` to access `ref_count` without forming `&PyObject`, - /// minimising the borrow scope when the pointer may be stale + /// minimizing the borrow scope when the pointer may be stale /// (e.g. cache-hit paths protected by version guards). /// /// # Safety