From fc9fafd76992dd2f406243455293380d14044707 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Wed, 17 Jun 2026 22:21:21 +0100 Subject: [PATCH] gh-151613: Fix remote debugging frame cache ABA Fixes #151613. The remote debugging frame cache previously used only the last_profiled_frame address as its cache anchor. If a frame returned and a later frame reused the same _PyInterpreterFrame address, the profiler could accept a stale cache entry and splice parent frames from a different call chain into the current stack. This adds a last_profiled_frame_seq counter next to last_profiled_frame, increments it when the anchor advances, stores it in frame cache entries, and validates cache hits against both the frame address and the sequence. Cache miss walks now copy stack chunks before storing new cache entries so stored continuations come from a stable snapshot. The new regression test exercises alternating call chains and checks that cached stacks never contain frames from both branches. --- Include/cpython/pystate.h | 1 + Include/internal/pycore_debug_offsets.h | 2 + Include/internal/pycore_interpframe.h | 3 + InternalDocs/frames.md | 30 +++--- Lib/test/test_external_inspection.py | 84 ++++++++++++++++ ...-06-13-11-57-48.gh-issue-151436.UEDowO.rst | 2 +- ...-06-17-22-31-57.gh-issue-151613.n0nua1.rst | 3 + Modules/_remote_debugging/_remote_debugging.h | 17 +++- .../debug_offsets_validation.h | 5 +- Modules/_remote_debugging/frame_cache.c | 96 ++++++++++++++++--- Modules/_remote_debugging/frames.c | 59 ++++++++---- Modules/_remote_debugging/module.c | 4 +- Modules/_remote_debugging/threads.c | 14 +-- Python/pystate.c | 2 + 14 files changed, 266 insertions(+), 56 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-06-17-22-31-57.gh-issue-151613.n0nua1.rst diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index a9d97e47e005df..f367146e262bfe 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -143,6 +143,7 @@ struct _ts { struct _PyInterpreterFrame *base_frame; struct _PyInterpreterFrame *last_profiled_frame; + uintptr_t last_profiled_frame_seq; Py_tracefunc c_profilefunc; Py_tracefunc c_tracefunc; diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h index 18490f98a918a7..6e1eb573c8c2c8 100644 --- a/Include/internal/pycore_debug_offsets.h +++ b/Include/internal/pycore_debug_offsets.h @@ -104,6 +104,7 @@ typedef struct _Py_DebugOffsets { uint64_t current_frame; uint64_t base_frame; uint64_t last_profiled_frame; + uint64_t last_profiled_frame_seq; uint64_t thread_id; uint64_t native_thread_id; uint64_t datastack_chunk; @@ -294,6 +295,7 @@ typedef struct _Py_DebugOffsets { .current_frame = offsetof(PyThreadState, current_frame), \ .base_frame = offsetof(PyThreadState, base_frame), \ .last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \ + .last_profiled_frame_seq = offsetof(PyThreadState, last_profiled_frame_seq), \ .thread_id = offsetof(PyThreadState, thread_id), \ .native_thread_id = offsetof(PyThreadState, native_thread_id), \ .datastack_chunk = offsetof(PyThreadState, datastack_chunk), \ diff --git a/Include/internal/pycore_interpframe.h b/Include/internal/pycore_interpframe.h index 0f4bf7d8a2f2f0..e10c0fe0a2fbf2 100644 --- a/Include/internal/pycore_interpframe.h +++ b/Include/internal/pycore_interpframe.h @@ -292,12 +292,15 @@ _PyThreadState_GetFrame(PyThreadState *tstate) // This avoids corrupting the cache when transient frames (called and returned // between profiler samples) update last_profiled_frame to addresses the // profiler never saw. +// The sequence distinguishes this anchor from a later frame that reuses the +// same _PyInterpreterFrame address. #define _PyThreadState_UpdateLastProfiledFrame(tstate, frame, previous) \ do { \ PyThreadState *tstate_ = (tstate); \ _PyInterpreterFrame *frame_ = (frame); \ if (tstate_->last_profiled_frame == frame_) { \ tstate_->last_profiled_frame = (previous); \ + tstate_->last_profiled_frame_seq++; \ } \ } while (0) diff --git a/InternalDocs/frames.md b/InternalDocs/frames.md index 60ab2055afa7b1..475ca75e28e1a5 100644 --- a/InternalDocs/frames.md +++ b/InternalDocs/frames.md @@ -142,22 +142,26 @@ since the frame chain may have been in an inconsistent state due to concurrent u ### Remote Profiling Frame Cache -The `last_profiled_frame` field in `PyThreadState` supports an optimization for -remote profilers that sample call stacks from external processes. When a remote -profiler reads the call stack, it writes the current frame address to this field. -The eval loop then keeps this pointer valid by updating it to the parent frame -whenever a frame returns (in `_PyEval_FrameClearAndPop`). +The `last_profiled_frame` and `last_profiled_frame_seq` fields in +`PyThreadState` support an optimization for remote profilers that sample call +stacks from external processes. When a remote profiler reads the call stack, it +writes the current frame address to `last_profiled_frame`. The eval loop then +keeps this pointer valid by updating it to the parent frame whenever a frame +returns (in `_PyEval_FrameClearAndPop`) and increments the sequence. This creates a "high-water mark" that always points to a frame still on the stack. On subsequent samples, the profiler can walk from `current_frame` until it reaches -`last_profiled_frame`, knowing that frames from that point downward are unchanged -and can be retrieved from a cache. This significantly reduces the amount of remote -memory reads needed when call stacks are deep and stable at their base. - -The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when -`last_profiled_frame` is non-NULL AND matches the frame being popped. This -prevents transient frames (called and returned between profiler samples) from -corrupting the cache pointer, while avoiding any overhead when profiling is inactive. +`last_profiled_frame`, then validate the pointer and sequence before using cached +callers. This prevents a later frame that reuses the same `_PyInterpreterFrame` +address from being mistaken for the sampled frame. The cache significantly +reduces the amount of remote memory reads needed when call stacks are deep and +stable at their base. + +The update in `_PyEval_FrameClearAndPop` is guarded: it only advances the +pointer and sequence when `last_profiled_frame` is non-NULL AND matches the +frame being popped. This prevents transient frames (called and returned between +profiler samples) from corrupting the cache anchor, while avoiding any overhead +when profiling is inactive. ### The Instruction Pointer diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index 6b1529aa173f01..d588475fdcf848 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -3451,6 +3451,90 @@ def level1(): # Parent frames: must match exactly self.assertEqual(loc_cached, loc_no_cache) + @skip_if_not_supported + @unittest.skipIf( + sys.platform != "linux" or not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_rejects_reused_frame_address_aba(self): + """Test that frame address reuse cannot splice cached parent frames.""" + script_body = """\ + sock.sendall(b"ready") + + def burn_a(): + total = 0 + for i in range(20000): + total += i + return total + + def burn_b(): + total = 0 + for i in range(20000): + total += i + return total + + def a_leaf(): + return burn_a() + + def b_leaf(): + return burn_b() + + def a_parent(): + return a_leaf() + + def b_parent(): + return b_leaf() + + while True: + a_parent() + b_parent() + """ + + with self._target_process(script_body) as (p, client_socket, _): + _wait_for_signal(client_socket, b"ready") + unwinder = RemoteUnwinder( + p.pid, cache_frames=True, stats=True + ) + bad_stacks = [] + seen = 0 + branch_a = {"a_parent", "a_leaf", "burn_a"} + branch_b = {"b_parent", "b_leaf", "burn_b"} + + for _ in range(8000): + try: + traces = unwinder.get_stack_trace() + except TRANSIENT_ERRORS: + continue + for interp in traces: + for thread in interp.threads: + in_a = False + in_b = False + for frame in thread.frame_info: + name = frame.funcname + if name in branch_a: + in_a = True + elif name in branch_b: + in_b = True + if in_a and in_b: + break + if not (in_a or in_b): + continue + seen += 1 + if in_a and in_b: + funcs = [f.funcname for f in thread.frame_info] + bad_stacks.append(funcs) + if len(bad_stacks) >= 3: + break + + stats = unwinder.get_stats() + + self.assertGreater(seen, 0) + self.assertGreater( + stats["frame_cache_hits"] + stats["frame_cache_partial_hits"], 0 + ) + self.assertGreater(stats["frames_read_from_cache"], 0) + self.assertEqual(bad_stacks, []) + @skip_if_not_supported @unittest.skipIf( sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, diff --git a/Misc/NEWS.d/next/Library/2026-06-13-11-57-48.gh-issue-151436.UEDowO.rst b/Misc/NEWS.d/next/Library/2026-06-13-11-57-48.gh-issue-151436.UEDowO.rst index 1d1aadbf57be48..6b10b03ba02fdf 100644 --- a/Misc/NEWS.d/next/Library/2026-06-13-11-57-48.gh-issue-151436.UEDowO.rst +++ b/Misc/NEWS.d/next/Library/2026-06-13-11-57-48.gh-issue-151436.UEDowO.rst @@ -1,4 +1,4 @@ -Fix skewed stack trackes in the Tachyon profiler when caching is enabled and +Fix skewed stack traces in the Tachyon profiler when caching is enabled and when generators and coroutines are profiled, by updating ``tstate->last_profiled_frame`` at every frame-removal site. The issue resulted in total erasure of some callers. Patch by Maurycy Pawłowski-Wieroński. diff --git a/Misc/NEWS.d/next/Library/2026-06-17-22-31-57.gh-issue-151613.n0nua1.rst b/Misc/NEWS.d/next/Library/2026-06-17-22-31-57.gh-issue-151613.n0nua1.rst new file mode 100644 index 00000000000000..fbf3ce47ff2739 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-17-22-31-57.gh-issue-151613.n0nua1.rst @@ -0,0 +1,3 @@ +Fix another way the Tachyon profiler frame cache could produce impossible +mixed stack traces when ``_PyInterpreterFrame`` addresses are reused, by +validating cached frame anchors with a sequence counter. diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 635e6e208902af..fa37fb7b2167ec 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -225,9 +225,15 @@ typedef struct { #define FRAME_CACHE_MAX_THREADS 32 #define FRAME_CACHE_MAX_FRAMES 1024 +typedef struct { + uintptr_t frame; + uintptr_t seq; +} FrameCacheAnchor; + typedef struct { uint64_t thread_id; // 0 = empty slot uintptr_t thread_state_addr; + uintptr_t last_profiled_frame_seq; // sequence paired with addrs[0] uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; Py_ssize_t num_addrs; PyObject *thread_id_obj; // owned reference, NULL if empty @@ -434,7 +440,7 @@ typedef struct { uintptr_t thread_state_addr; // Owning thread state address uintptr_t base_frame_addr; // Sentinel at bottom (for validation) uintptr_t gc_frame; // GC frame address (0 if not tracking) - uintptr_t last_profiled_frame; // Last cached frame (0 if no cache) + FrameCacheAnchor last_profiled; // Last cached frame anchor StackChunkList *chunks; // Pre-copied stack chunks int skip_first_frame; // Skip frame_addr itself (continue from its caller) RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers @@ -622,15 +628,21 @@ extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder); extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id); extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr); extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder); +extern int set_last_profiled_frame(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr, uintptr_t frame_addr); extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result); extern int frame_cache_lookup_and_extend( RemoteUnwinderObject *unwinder, uint64_t thread_id, - uintptr_t last_profiled_frame, + uintptr_t thread_state_addr, + FrameCacheAnchor anchor, PyObject *frame_info, uintptr_t *frame_addrs, Py_ssize_t *num_addrs, Py_ssize_t max_addrs); +extern int frame_cache_anchor_matches( + RemoteUnwinderObject *unwinder, + uintptr_t thread_state_addr, + FrameCacheAnchor anchor); // Returns: 1 = stored, 0 = not stored (graceful), -1 = error // Only stores complete stacks that reach base_frame_addr extern int frame_cache_store( @@ -640,6 +652,7 @@ extern int frame_cache_store( const uintptr_t *addrs, Py_ssize_t num_addrs, uintptr_t thread_state_addr, + uintptr_t last_profiled_frame_seq, uintptr_t base_frame_addr, uintptr_t last_frame_visited); diff --git a/Modules/_remote_debugging/debug_offsets_validation.h b/Modules/_remote_debugging/debug_offsets_validation.h index f070f03ac459dc..c0c01a0a639e19 100644 --- a/Modules/_remote_debugging/debug_offsets_validation.h +++ b/Modules/_remote_debugging/debug_offsets_validation.h @@ -31,7 +31,7 @@ #define FIELD_SIZE(type, member) sizeof(((type *)0)->member) enum { - PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 880, + PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 888, PY_REMOTE_ASYNC_DEBUG_OFFSETS_TOTAL_SIZE = 104, }; @@ -261,7 +261,8 @@ validate_fixed_field( APPLY(thread_state, next, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ APPLY(thread_state, current_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ APPLY(thread_state, base_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ - APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size) + APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ + APPLY(thread_state, last_profiled_frame_seq, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size) #define PY_REMOTE_DEBUG_INTERPRETER_STATE_FIELDS(APPLY, buffer_size) \ APPLY(interpreter_state, id, sizeof(int64_t), _Alignof(int64_t), buffer_size); \ diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c index 19fc406bca9ac9..1b2d9a60f408b9 100644 --- a/Modules/_remote_debugging/frame_cache.c +++ b/Modules/_remote_debugging/frame_cache.c @@ -147,25 +147,87 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result) Py_CLEAR(unwinder->frame_cache[i].frame_list); unwinder->frame_cache[i].thread_id = 0; unwinder->frame_cache[i].thread_state_addr = 0; + unwinder->frame_cache[i].last_profiled_frame_seq = 0; unwinder->frame_cache[i].num_addrs = 0; STATS_INC(unwinder, stale_cache_invalidations); } } } +static int +read_last_profiled_anchor(RemoteUnwinderObject *unwinder, + uintptr_t thread_state_addr, + FrameCacheAnchor *anchor) +{ + uintptr_t frame_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame; + uintptr_t seq_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame_seq; + + // These fields are adjacent in PyThreadState. Read them together when the + // layout allows it so validation uses a pointer and sequence from the same + // remote-memory read. + if (seq_offset == frame_offset + sizeof(uintptr_t)) { + uintptr_t live_anchor[2]; + if (_Py_RemoteDebug_ReadRemoteMemory(&unwinder->handle, + thread_state_addr + frame_offset, + sizeof(live_anchor), + live_anchor) < 0) { + return -1; + } + anchor->frame = live_anchor[0]; + anchor->seq = live_anchor[1]; + return 0; + } + + if (read_ptr(unwinder, thread_state_addr + frame_offset, &anchor->frame) < 0) { + return -1; + } + return read_ptr(unwinder, thread_state_addr + seq_offset, &anchor->seq); +} + +static Py_ssize_t +find_cached_frame_addr(const FrameCacheEntry *entry, uintptr_t frame_addr, + uintptr_t *real_pops) +{ + *real_pops = 0; + for (Py_ssize_t i = 0; i < entry->num_addrs; i++) { + if (entry->addrs[i] == frame_addr) { + return i; + } + if (entry->addrs[i] != 0) { + (*real_pops)++; + } + } + return -1; +} + +int +frame_cache_anchor_matches( + RemoteUnwinderObject *unwinder, + uintptr_t thread_state_addr, + FrameCacheAnchor anchor) +{ + FrameCacheAnchor live_anchor = {0, 0}; + if (read_last_profiled_anchor(unwinder, thread_state_addr, &live_anchor) < 0) { + PyErr_Clear(); + return 0; + } + return live_anchor.frame == anchor.frame && live_anchor.seq == anchor.seq; +} + // Find last_profiled_frame in cache and extend frame_info with cached continuation // If frame_addrs is provided (not NULL), also extends it with cached addresses int frame_cache_lookup_and_extend( RemoteUnwinderObject *unwinder, uint64_t thread_id, - uintptr_t last_profiled_frame, + uintptr_t thread_state_addr, + FrameCacheAnchor anchor, PyObject *frame_info, uintptr_t *frame_addrs, Py_ssize_t *num_addrs, Py_ssize_t max_addrs) { - if (!unwinder->frame_cache || last_profiled_frame == 0) { + if (!unwinder->frame_cache || anchor.frame == 0) { return 0; } @@ -173,24 +235,31 @@ frame_cache_lookup_and_extend( if (!entry || !entry->frame_list) { return 0; } + if (entry->thread_state_addr != thread_state_addr) { + return 0; + } assert(entry->num_addrs >= 0 && entry->num_addrs <= FRAME_CACHE_MAX_FRAMES); - // Find the index where last_profiled_frame matches - Py_ssize_t start_idx = -1; - for (Py_ssize_t i = 0; i < entry->num_addrs; i++) { - if (entry->addrs[i] == last_profiled_frame) { - start_idx = i; - break; - } - } - + uintptr_t real_pops = 0; + Py_ssize_t start_idx = find_cached_frame_addr(entry, anchor.frame, &real_pops); if (start_idx < 0) { return 0; // Not found } assert(start_idx < entry->num_addrs); + // Synthetic marker frames (/) are stored as addr-0 entries but + // never increment last_profiled_frame_seq in the target (only real frame + // pops do). Count the real frames before start_idx so the sequence check is + // not thrown off by markers sitting between the leaf and the anchor. + if (entry->last_profiled_frame_seq + real_pops != anchor.seq) { + return 0; + } + Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list); + if (start_idx >= num_frames) { + return 0; + } // Extend frame_info with frames ABOVE start_idx (not including it). // The frame at start_idx (last_profiled_frame) was the executing frame @@ -200,6 +269,9 @@ frame_cache_lookup_and_extend( if (cache_start >= num_frames) { return 0; // Nothing above last_profiled_frame to extend with } + if (!frame_cache_anchor_matches(unwinder, thread_state_addr, anchor)) { + return 0; + } PyObject *slice = PyList_GetSlice(entry->frame_list, cache_start, num_frames); if (!slice) { @@ -235,6 +307,7 @@ frame_cache_store( const uintptr_t *addrs, Py_ssize_t num_addrs, uintptr_t thread_state_addr, + uintptr_t last_profiled_frame_seq, uintptr_t base_frame_addr, uintptr_t last_frame_visited) { @@ -277,6 +350,7 @@ frame_cache_store( } entry->thread_id = thread_id; entry->thread_state_addr = thread_state_addr; + entry->last_profiled_frame_seq = last_profiled_frame_seq; if (entry->thread_id_obj == NULL) { entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id); if (entry->thread_id_obj == NULL) { diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index e7d2a276439026..46968acc6ff1fe 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -416,7 +416,7 @@ process_frame_chain( Py_DECREF(frame); } - if (ctx->last_profiled_frame != 0 && frame_addr == ctx->last_profiled_frame) { + if (ctx->last_profiled.frame != 0 && frame_addr == ctx->last_profiled.frame) { ctx->stopped_at_cached_frame = 1; break; } @@ -437,14 +437,23 @@ process_frame_chain( return 0; } -// Clear last_profiled_frame for all threads in the target process. -// This must be called at the start of profiling to avoid stale values -// from previous profilers causing us to stop frame walking early. +int +set_last_profiled_frame(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr, + uintptr_t frame_addr) +{ + uintptr_t lpf_addr = tstate_addr + + (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame; + return _Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, + sizeof(uintptr_t), &frame_addr); +} + +// Clear the profiler anchor frame for all threads in the target process. The +// sequence is intentionally preserved: a zero frame disables cache lookup, and +// the next profiler-owned anchor should use the target's current generation. int clear_last_profiled_frames(RemoteUnwinderObject *unwinder) { uintptr_t current_interp = unwinder->interpreter_addr; - uintptr_t zero = 0; const size_t MAX_INTERPRETERS = 256; size_t interp_count = 0; @@ -467,11 +476,8 @@ clear_last_profiled_frames(RemoteUnwinderObject *unwinder) size_t thread_count = 0; while (tstate_addr != 0 && thread_count < MAX_THREADS_PER_INTERP) { thread_count++; - // Clear last_profiled_frame - uintptr_t lpf_addr = tstate_addr + unwinder->debug_offsets.thread_state.last_profiled_frame; - if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, - sizeof(uintptr_t), &zero) < 0) { - // Non-fatal: just continue + uintptr_t no_frame = 0; + if (set_last_profiled_frame(unwinder, tstate_addr, no_frame) < 0) { PyErr_Clear(); } @@ -512,10 +518,10 @@ try_full_cache_hit( const FrameWalkContext *ctx, uint64_t thread_id) { - if (!unwinder->frame_cache || ctx->last_profiled_frame == 0) { + if (!unwinder->frame_cache || ctx->last_profiled.frame == 0) { return 0; } - if (ctx->frame_addr != ctx->last_profiled_frame) { + if (ctx->frame_addr != ctx->last_profiled.frame) { return 0; } @@ -523,10 +529,16 @@ try_full_cache_hit( if (!entry || !entry->frame_list) { return 0; } + if (entry->thread_state_addr != ctx->thread_state_addr) { + return 0; + } if (entry->num_addrs == 0 || entry->addrs[0] != ctx->frame_addr) { return 0; } + if (entry->last_profiled_frame_seq != ctx->last_profiled.seq) { + return 0; + } PyObject *current_frame = NULL; uintptr_t code_object_addr = 0; @@ -544,6 +556,11 @@ try_full_cache_hit( if (parse_result < 0) { return -1; } + if (!frame_cache_anchor_matches(unwinder, ctx->thread_state_addr, + ctx->last_profiled)) { + Py_XDECREF(current_frame); + return 0; + } if (current_frame != NULL) { if (PyList_Append(ctx->frame_info, current_frame) < 0) { @@ -582,9 +599,12 @@ collect_frames_with_cache( assert(ctx->chunks != NULL); + // Cache misses copy stack chunks before walking. Frames found there are + // parsed from a stable snapshot, which keeps moving stacks from seeding the + // cache with an impossible parent chain. if (ctx->chunks->count == 0) { if (copy_stack_chunks(unwinder, ctx->thread_state_addr, ctx->chunks) < 0) { - PyErr_Clear(); + return -1; } } @@ -598,7 +618,9 @@ collect_frames_with_cache( if (ctx->stopped_at_cached_frame) { Py_ssize_t frames_before_cache = PyList_GET_SIZE(ctx->frame_info); - int cache_result = frame_cache_lookup_and_extend(unwinder, thread_id, ctx->last_profiled_frame, + int cache_result = frame_cache_lookup_and_extend(unwinder, thread_id, + ctx->thread_state_addr, + ctx->last_profiled, ctx->frame_info, ctx->frame_addrs, &ctx->num_addrs, ctx->max_addrs); if (cache_result < 0) { @@ -610,7 +632,7 @@ collect_frames_with_cache( // Continue walking from last_profiled_frame, skipping it (already processed) Py_ssize_t frames_before_walk = PyList_GET_SIZE(ctx->frame_info); FrameWalkContext continue_ctx = { - .frame_addr = ctx->last_profiled_frame, + .frame_addr = ctx->last_profiled.frame, .base_frame_addr = ctx->base_frame_addr, .gc_frame = ctx->gc_frame, .chunks = ctx->chunks, @@ -634,13 +656,14 @@ collect_frames_with_cache( STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(ctx->frame_info) - frames_before_cache); } } else { - if (ctx->last_profiled_frame == 0) { + if (ctx->last_profiled.frame == 0) { STATS_INC(unwinder, frame_cache_misses); } } - if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs, - ctx->thread_state_addr, ctx->base_frame_addr, + if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, + ctx->num_addrs, ctx->thread_state_addr, + ctx->last_profiled.seq, ctx->base_frame_addr, ctx->last_frame_visited) < 0) { return -1; } diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 36115f20d9d4cc..7979cd43c6a127 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -475,8 +475,8 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, return -1; } - // Clear stale last_profiled_frame values from previous profilers - // This prevents us from stopping frame walking early due to stale values + // Clear stale profiler anchors from previous profilers. This prevents us + // from stopping frame walking early due to stale frame pointers. if (cache_frames) { clear_last_profiled_frames(self); } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 81735e85395ac9..29f22f14c9b29f 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -503,7 +503,8 @@ unwind_stack_for_thread( goto error; } - // In cache mode, copying stack chunks is more expensive than direct memory reads + // Cache mode skips this for full hits, but cache misses copy chunks before + // walking so newly stored cache entries come from a stable stack snapshot. if (!unwinder->cache_frames) { if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks"); @@ -528,18 +529,17 @@ unwind_stack_for_thread( if (unwinder->cache_frames) { // Use cache to avoid re-reading unchanged parent frames - ctx.last_profiled_frame = GET_MEMBER(uintptr_t, ts, + ctx.last_profiled.frame = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.last_profiled_frame); + ctx.last_profiled.seq = GET_MEMBER(uintptr_t, ts, + unwinder->debug_offsets.thread_state.last_profiled_frame_seq); if (collect_frames_with_cache(unwinder, &ctx, tid) < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames"); goto error; } // Update last_profiled_frame for next sample if it changed - if (frame_addr != ctx.last_profiled_frame) { - uintptr_t lpf_addr = - *current_tstate + (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame; - if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, - sizeof(uintptr_t), &frame_addr) < 0) { + if (frame_addr != ctx.last_profiled.frame) { + if (set_last_profiled_frame(unwinder, *current_tstate, frame_addr) < 0) { PyErr_Clear(); // Non-fatal } } diff --git a/Python/pystate.c b/Python/pystate.c index fed1df0173bacf..29f13e92e0dd1f 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1605,6 +1605,8 @@ init_threadstate(_PyThreadStateImpl *_tstate, tstate->current_frame = &_tstate->base_frame; // base_frame pointer for profilers to validate stack unwinding tstate->base_frame = &_tstate->base_frame; + tstate->last_profiled_frame = NULL; + tstate->last_profiled_frame_seq = 0; tstate->datastack_chunk = NULL; tstate->datastack_top = NULL; tstate->datastack_limit = NULL;