diff --git a/Include/cpython/pystate.h b/Include/cpython/pystate.h index a9d97e47e005df..f367146e262bfe 100644 --- a/Include/cpython/pystate.h +++ b/Include/cpython/pystate.h @@ -143,6 +143,7 @@ struct _ts { struct _PyInterpreterFrame *base_frame; struct _PyInterpreterFrame *last_profiled_frame; + uintptr_t last_profiled_frame_seq; Py_tracefunc c_profilefunc; Py_tracefunc c_tracefunc; diff --git a/Include/internal/pycore_debug_offsets.h b/Include/internal/pycore_debug_offsets.h index 18490f98a918a7..6e1eb573c8c2c8 100644 --- a/Include/internal/pycore_debug_offsets.h +++ b/Include/internal/pycore_debug_offsets.h @@ -104,6 +104,7 @@ typedef struct _Py_DebugOffsets { uint64_t current_frame; uint64_t base_frame; uint64_t last_profiled_frame; + uint64_t last_profiled_frame_seq; uint64_t thread_id; uint64_t native_thread_id; uint64_t datastack_chunk; @@ -294,6 +295,7 @@ typedef struct _Py_DebugOffsets { .current_frame = offsetof(PyThreadState, current_frame), \ .base_frame = offsetof(PyThreadState, base_frame), \ .last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \ + .last_profiled_frame_seq = offsetof(PyThreadState, last_profiled_frame_seq), \ .thread_id = offsetof(PyThreadState, thread_id), \ .native_thread_id = offsetof(PyThreadState, native_thread_id), \ .datastack_chunk = offsetof(PyThreadState, datastack_chunk), \ diff --git a/Include/internal/pycore_interpframe.h b/Include/internal/pycore_interpframe.h index 0f4bf7d8a2f2f0..e10c0fe0a2fbf2 100644 --- a/Include/internal/pycore_interpframe.h +++ b/Include/internal/pycore_interpframe.h @@ -292,12 +292,15 @@ _PyThreadState_GetFrame(PyThreadState *tstate) // This avoids corrupting the cache when transient frames (called and returned // between profiler samples) update last_profiled_frame to addresses the // profiler never saw. +// The sequence distinguishes this anchor from a later frame that reuses the +// same _PyInterpreterFrame address. #define _PyThreadState_UpdateLastProfiledFrame(tstate, frame, previous) \ do { \ PyThreadState *tstate_ = (tstate); \ _PyInterpreterFrame *frame_ = (frame); \ if (tstate_->last_profiled_frame == frame_) { \ tstate_->last_profiled_frame = (previous); \ + tstate_->last_profiled_frame_seq++; \ } \ } while (0) diff --git a/InternalDocs/frames.md b/InternalDocs/frames.md index 60ab2055afa7b1..475ca75e28e1a5 100644 --- a/InternalDocs/frames.md +++ b/InternalDocs/frames.md @@ -142,22 +142,26 @@ since the frame chain may have been in an inconsistent state due to concurrent u ### Remote Profiling Frame Cache -The `last_profiled_frame` field in `PyThreadState` supports an optimization for -remote profilers that sample call stacks from external processes. When a remote -profiler reads the call stack, it writes the current frame address to this field. -The eval loop then keeps this pointer valid by updating it to the parent frame -whenever a frame returns (in `_PyEval_FrameClearAndPop`). +The `last_profiled_frame` and `last_profiled_frame_seq` fields in +`PyThreadState` support an optimization for remote profilers that sample call +stacks from external processes. When a remote profiler reads the call stack, it +writes the current frame address to `last_profiled_frame`. The eval loop then +keeps this pointer valid by updating it to the parent frame whenever a frame +returns (in `_PyEval_FrameClearAndPop`) and increments the sequence. This creates a "high-water mark" that always points to a frame still on the stack. On subsequent samples, the profiler can walk from `current_frame` until it reaches -`last_profiled_frame`, knowing that frames from that point downward are unchanged -and can be retrieved from a cache. This significantly reduces the amount of remote -memory reads needed when call stacks are deep and stable at their base. - -The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when -`last_profiled_frame` is non-NULL AND matches the frame being popped. This -prevents transient frames (called and returned between profiler samples) from -corrupting the cache pointer, while avoiding any overhead when profiling is inactive. +`last_profiled_frame`, then validate the pointer and sequence before using cached +callers. This prevents a later frame that reuses the same `_PyInterpreterFrame` +address from being mistaken for the sampled frame. The cache significantly +reduces the amount of remote memory reads needed when call stacks are deep and +stable at their base. + +The update in `_PyEval_FrameClearAndPop` is guarded: it only advances the +pointer and sequence when `last_profiled_frame` is non-NULL AND matches the +frame being popped. This prevents transient frames (called and returned between +profiler samples) from corrupting the cache anchor, while avoiding any overhead +when profiling is inactive. ### The Instruction Pointer diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index 6b1529aa173f01..d588475fdcf848 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -3451,6 +3451,90 @@ def level1(): # Parent frames: must match exactly self.assertEqual(loc_cached, loc_no_cache) + @skip_if_not_supported + @unittest.skipIf( + sys.platform != "linux" or not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_cache_rejects_reused_frame_address_aba(self): + """Test that frame address reuse cannot splice cached parent frames.""" + script_body = """\ + sock.sendall(b"ready") + + def burn_a(): + total = 0 + for i in range(20000): + total += i + return total + + def burn_b(): + total = 0 + for i in range(20000): + total += i + return total + + def a_leaf(): + return burn_a() + + def b_leaf(): + return burn_b() + + def a_parent(): + return a_leaf() + + def b_parent(): + return b_leaf() + + while True: + a_parent() + b_parent() + """ + + with self._target_process(script_body) as (p, client_socket, _): + _wait_for_signal(client_socket, b"ready") + unwinder = RemoteUnwinder( + p.pid, cache_frames=True, stats=True + ) + bad_stacks = [] + seen = 0 + branch_a = {"a_parent", "a_leaf", "burn_a"} + branch_b = {"b_parent", "b_leaf", "burn_b"} + + for _ in range(8000): + try: + traces = unwinder.get_stack_trace() + except TRANSIENT_ERRORS: + continue + for interp in traces: + for thread in interp.threads: + in_a = False + in_b = False + for frame in thread.frame_info: + name = frame.funcname + if name in branch_a: + in_a = True + elif name in branch_b: + in_b = True + if in_a and in_b: + break + if not (in_a or in_b): + continue + seen += 1 + if in_a and in_b: + funcs = [f.funcname for f in thread.frame_info] + bad_stacks.append(funcs) + if len(bad_stacks) >= 3: + break + + stats = unwinder.get_stats() + + self.assertGreater(seen, 0) + self.assertGreater( + stats["frame_cache_hits"] + stats["frame_cache_partial_hits"], 0 + ) + self.assertGreater(stats["frames_read_from_cache"], 0) + self.assertEqual(bad_stacks, []) + @skip_if_not_supported @unittest.skipIf( sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, diff --git a/Misc/NEWS.d/next/Library/2026-06-13-11-57-48.gh-issue-151436.UEDowO.rst b/Misc/NEWS.d/next/Library/2026-06-13-11-57-48.gh-issue-151436.UEDowO.rst index 1d1aadbf57be48..6b10b03ba02fdf 100644 --- a/Misc/NEWS.d/next/Library/2026-06-13-11-57-48.gh-issue-151436.UEDowO.rst +++ b/Misc/NEWS.d/next/Library/2026-06-13-11-57-48.gh-issue-151436.UEDowO.rst @@ -1,4 +1,4 @@ -Fix skewed stack trackes in the Tachyon profiler when caching is enabled and +Fix skewed stack traces in the Tachyon profiler when caching is enabled and when generators and coroutines are profiled, by updating ``tstate->last_profiled_frame`` at every frame-removal site. The issue resulted in total erasure of some callers. Patch by Maurycy Pawłowski-Wieroński. diff --git a/Misc/NEWS.d/next/Library/2026-06-17-22-31-57.gh-issue-151613.n0nua1.rst b/Misc/NEWS.d/next/Library/2026-06-17-22-31-57.gh-issue-151613.n0nua1.rst new file mode 100644 index 00000000000000..fbf3ce47ff2739 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-17-22-31-57.gh-issue-151613.n0nua1.rst @@ -0,0 +1,3 @@ +Fix another way the Tachyon profiler frame cache could produce impossible +mixed stack traces when ``_PyInterpreterFrame`` addresses are reused, by +validating cached frame anchors with a sequence counter. diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 635e6e208902af..fa37fb7b2167ec 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -225,9 +225,15 @@ typedef struct { #define FRAME_CACHE_MAX_THREADS 32 #define FRAME_CACHE_MAX_FRAMES 1024 +typedef struct { + uintptr_t frame; + uintptr_t seq; +} FrameCacheAnchor; + typedef struct { uint64_t thread_id; // 0 = empty slot uintptr_t thread_state_addr; + uintptr_t last_profiled_frame_seq; // sequence paired with addrs[0] uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; Py_ssize_t num_addrs; PyObject *thread_id_obj; // owned reference, NULL if empty @@ -434,7 +440,7 @@ typedef struct { uintptr_t thread_state_addr; // Owning thread state address uintptr_t base_frame_addr; // Sentinel at bottom (for validation) uintptr_t gc_frame; // GC frame address (0 if not tracking) - uintptr_t last_profiled_frame; // Last cached frame (0 if no cache) + FrameCacheAnchor last_profiled; // Last cached frame anchor StackChunkList *chunks; // Pre-copied stack chunks int skip_first_frame; // Skip frame_addr itself (continue from its caller) RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers @@ -622,15 +628,21 @@ extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder); extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id); extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr); extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder); +extern int set_last_profiled_frame(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr, uintptr_t frame_addr); extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result); extern int frame_cache_lookup_and_extend( RemoteUnwinderObject *unwinder, uint64_t thread_id, - uintptr_t last_profiled_frame, + uintptr_t thread_state_addr, + FrameCacheAnchor anchor, PyObject *frame_info, uintptr_t *frame_addrs, Py_ssize_t *num_addrs, Py_ssize_t max_addrs); +extern int frame_cache_anchor_matches( + RemoteUnwinderObject *unwinder, + uintptr_t thread_state_addr, + FrameCacheAnchor anchor); // Returns: 1 = stored, 0 = not stored (graceful), -1 = error // Only stores complete stacks that reach base_frame_addr extern int frame_cache_store( @@ -640,6 +652,7 @@ extern int frame_cache_store( const uintptr_t *addrs, Py_ssize_t num_addrs, uintptr_t thread_state_addr, + uintptr_t last_profiled_frame_seq, uintptr_t base_frame_addr, uintptr_t last_frame_visited); diff --git a/Modules/_remote_debugging/debug_offsets_validation.h b/Modules/_remote_debugging/debug_offsets_validation.h index f070f03ac459dc..c0c01a0a639e19 100644 --- a/Modules/_remote_debugging/debug_offsets_validation.h +++ b/Modules/_remote_debugging/debug_offsets_validation.h @@ -31,7 +31,7 @@ #define FIELD_SIZE(type, member) sizeof(((type *)0)->member) enum { - PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 880, + PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 888, PY_REMOTE_ASYNC_DEBUG_OFFSETS_TOTAL_SIZE = 104, }; @@ -261,7 +261,8 @@ validate_fixed_field( APPLY(thread_state, next, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ APPLY(thread_state, current_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ APPLY(thread_state, base_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ - APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size) + APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \ + APPLY(thread_state, last_profiled_frame_seq, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size) #define PY_REMOTE_DEBUG_INTERPRETER_STATE_FIELDS(APPLY, buffer_size) \ APPLY(interpreter_state, id, sizeof(int64_t), _Alignof(int64_t), buffer_size); \ diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c index 19fc406bca9ac9..1b2d9a60f408b9 100644 --- a/Modules/_remote_debugging/frame_cache.c +++ b/Modules/_remote_debugging/frame_cache.c @@ -147,25 +147,87 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result) Py_CLEAR(unwinder->frame_cache[i].frame_list); unwinder->frame_cache[i].thread_id = 0; unwinder->frame_cache[i].thread_state_addr = 0; + unwinder->frame_cache[i].last_profiled_frame_seq = 0; unwinder->frame_cache[i].num_addrs = 0; STATS_INC(unwinder, stale_cache_invalidations); } } } +static int +read_last_profiled_anchor(RemoteUnwinderObject *unwinder, + uintptr_t thread_state_addr, + FrameCacheAnchor *anchor) +{ + uintptr_t frame_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame; + uintptr_t seq_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame_seq; + + // These fields are adjacent in PyThreadState. Read them together when the + // layout allows it so validation uses a pointer and sequence from the same + // remote-memory read. + if (seq_offset == frame_offset + sizeof(uintptr_t)) { + uintptr_t live_anchor[2]; + if (_Py_RemoteDebug_ReadRemoteMemory(&unwinder->handle, + thread_state_addr + frame_offset, + sizeof(live_anchor), + live_anchor) < 0) { + return -1; + } + anchor->frame = live_anchor[0]; + anchor->seq = live_anchor[1]; + return 0; + } + + if (read_ptr(unwinder, thread_state_addr + frame_offset, &anchor->frame) < 0) { + return -1; + } + return read_ptr(unwinder, thread_state_addr + seq_offset, &anchor->seq); +} + +static Py_ssize_t +find_cached_frame_addr(const FrameCacheEntry *entry, uintptr_t frame_addr, + uintptr_t *real_pops) +{ + *real_pops = 0; + for (Py_ssize_t i = 0; i < entry->num_addrs; i++) { + if (entry->addrs[i] == frame_addr) { + return i; + } + if (entry->addrs[i] != 0) { + (*real_pops)++; + } + } + return -1; +} + +int +frame_cache_anchor_matches( + RemoteUnwinderObject *unwinder, + uintptr_t thread_state_addr, + FrameCacheAnchor anchor) +{ + FrameCacheAnchor live_anchor = {0, 0}; + if (read_last_profiled_anchor(unwinder, thread_state_addr, &live_anchor) < 0) { + PyErr_Clear(); + return 0; + } + return live_anchor.frame == anchor.frame && live_anchor.seq == anchor.seq; +} + // Find last_profiled_frame in cache and extend frame_info with cached continuation // If frame_addrs is provided (not NULL), also extends it with cached addresses int frame_cache_lookup_and_extend( RemoteUnwinderObject *unwinder, uint64_t thread_id, - uintptr_t last_profiled_frame, + uintptr_t thread_state_addr, + FrameCacheAnchor anchor, PyObject *frame_info, uintptr_t *frame_addrs, Py_ssize_t *num_addrs, Py_ssize_t max_addrs) { - if (!unwinder->frame_cache || last_profiled_frame == 0) { + if (!unwinder->frame_cache || anchor.frame == 0) { return 0; } @@ -173,24 +235,31 @@ frame_cache_lookup_and_extend( if (!entry || !entry->frame_list) { return 0; } + if (entry->thread_state_addr != thread_state_addr) { + return 0; + } assert(entry->num_addrs >= 0 && entry->num_addrs <= FRAME_CACHE_MAX_FRAMES); - // Find the index where last_profiled_frame matches - Py_ssize_t start_idx = -1; - for (Py_ssize_t i = 0; i < entry->num_addrs; i++) { - if (entry->addrs[i] == last_profiled_frame) { - start_idx = i; - break; - } - } - + uintptr_t real_pops = 0; + Py_ssize_t start_idx = find_cached_frame_addr(entry, anchor.frame, &real_pops); if (start_idx < 0) { return 0; // Not found } assert(start_idx < entry->num_addrs); + // Synthetic marker frames (/) are stored as addr-0 entries but + // never increment last_profiled_frame_seq in the target (only real frame + // pops do). Count the real frames before start_idx so the sequence check is + // not thrown off by markers sitting between the leaf and the anchor. + if (entry->last_profiled_frame_seq + real_pops != anchor.seq) { + return 0; + } + Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list); + if (start_idx >= num_frames) { + return 0; + } // Extend frame_info with frames ABOVE start_idx (not including it). // The frame at start_idx (last_profiled_frame) was the executing frame @@ -200,6 +269,9 @@ frame_cache_lookup_and_extend( if (cache_start >= num_frames) { return 0; // Nothing above last_profiled_frame to extend with } + if (!frame_cache_anchor_matches(unwinder, thread_state_addr, anchor)) { + return 0; + } PyObject *slice = PyList_GetSlice(entry->frame_list, cache_start, num_frames); if (!slice) { @@ -235,6 +307,7 @@ frame_cache_store( const uintptr_t *addrs, Py_ssize_t num_addrs, uintptr_t thread_state_addr, + uintptr_t last_profiled_frame_seq, uintptr_t base_frame_addr, uintptr_t last_frame_visited) { @@ -277,6 +350,7 @@ frame_cache_store( } entry->thread_id = thread_id; entry->thread_state_addr = thread_state_addr; + entry->last_profiled_frame_seq = last_profiled_frame_seq; if (entry->thread_id_obj == NULL) { entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id); if (entry->thread_id_obj == NULL) { diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index e7d2a276439026..46968acc6ff1fe 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -416,7 +416,7 @@ process_frame_chain( Py_DECREF(frame); } - if (ctx->last_profiled_frame != 0 && frame_addr == ctx->last_profiled_frame) { + if (ctx->last_profiled.frame != 0 && frame_addr == ctx->last_profiled.frame) { ctx->stopped_at_cached_frame = 1; break; } @@ -437,14 +437,23 @@ process_frame_chain( return 0; } -// Clear last_profiled_frame for all threads in the target process. -// This must be called at the start of profiling to avoid stale values -// from previous profilers causing us to stop frame walking early. +int +set_last_profiled_frame(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr, + uintptr_t frame_addr) +{ + uintptr_t lpf_addr = tstate_addr + + (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame; + return _Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, + sizeof(uintptr_t), &frame_addr); +} + +// Clear the profiler anchor frame for all threads in the target process. The +// sequence is intentionally preserved: a zero frame disables cache lookup, and +// the next profiler-owned anchor should use the target's current generation. int clear_last_profiled_frames(RemoteUnwinderObject *unwinder) { uintptr_t current_interp = unwinder->interpreter_addr; - uintptr_t zero = 0; const size_t MAX_INTERPRETERS = 256; size_t interp_count = 0; @@ -467,11 +476,8 @@ clear_last_profiled_frames(RemoteUnwinderObject *unwinder) size_t thread_count = 0; while (tstate_addr != 0 && thread_count < MAX_THREADS_PER_INTERP) { thread_count++; - // Clear last_profiled_frame - uintptr_t lpf_addr = tstate_addr + unwinder->debug_offsets.thread_state.last_profiled_frame; - if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, - sizeof(uintptr_t), &zero) < 0) { - // Non-fatal: just continue + uintptr_t no_frame = 0; + if (set_last_profiled_frame(unwinder, tstate_addr, no_frame) < 0) { PyErr_Clear(); } @@ -512,10 +518,10 @@ try_full_cache_hit( const FrameWalkContext *ctx, uint64_t thread_id) { - if (!unwinder->frame_cache || ctx->last_profiled_frame == 0) { + if (!unwinder->frame_cache || ctx->last_profiled.frame == 0) { return 0; } - if (ctx->frame_addr != ctx->last_profiled_frame) { + if (ctx->frame_addr != ctx->last_profiled.frame) { return 0; } @@ -523,10 +529,16 @@ try_full_cache_hit( if (!entry || !entry->frame_list) { return 0; } + if (entry->thread_state_addr != ctx->thread_state_addr) { + return 0; + } if (entry->num_addrs == 0 || entry->addrs[0] != ctx->frame_addr) { return 0; } + if (entry->last_profiled_frame_seq != ctx->last_profiled.seq) { + return 0; + } PyObject *current_frame = NULL; uintptr_t code_object_addr = 0; @@ -544,6 +556,11 @@ try_full_cache_hit( if (parse_result < 0) { return -1; } + if (!frame_cache_anchor_matches(unwinder, ctx->thread_state_addr, + ctx->last_profiled)) { + Py_XDECREF(current_frame); + return 0; + } if (current_frame != NULL) { if (PyList_Append(ctx->frame_info, current_frame) < 0) { @@ -582,9 +599,12 @@ collect_frames_with_cache( assert(ctx->chunks != NULL); + // Cache misses copy stack chunks before walking. Frames found there are + // parsed from a stable snapshot, which keeps moving stacks from seeding the + // cache with an impossible parent chain. if (ctx->chunks->count == 0) { if (copy_stack_chunks(unwinder, ctx->thread_state_addr, ctx->chunks) < 0) { - PyErr_Clear(); + return -1; } } @@ -598,7 +618,9 @@ collect_frames_with_cache( if (ctx->stopped_at_cached_frame) { Py_ssize_t frames_before_cache = PyList_GET_SIZE(ctx->frame_info); - int cache_result = frame_cache_lookup_and_extend(unwinder, thread_id, ctx->last_profiled_frame, + int cache_result = frame_cache_lookup_and_extend(unwinder, thread_id, + ctx->thread_state_addr, + ctx->last_profiled, ctx->frame_info, ctx->frame_addrs, &ctx->num_addrs, ctx->max_addrs); if (cache_result < 0) { @@ -610,7 +632,7 @@ collect_frames_with_cache( // Continue walking from last_profiled_frame, skipping it (already processed) Py_ssize_t frames_before_walk = PyList_GET_SIZE(ctx->frame_info); FrameWalkContext continue_ctx = { - .frame_addr = ctx->last_profiled_frame, + .frame_addr = ctx->last_profiled.frame, .base_frame_addr = ctx->base_frame_addr, .gc_frame = ctx->gc_frame, .chunks = ctx->chunks, @@ -634,13 +656,14 @@ collect_frames_with_cache( STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(ctx->frame_info) - frames_before_cache); } } else { - if (ctx->last_profiled_frame == 0) { + if (ctx->last_profiled.frame == 0) { STATS_INC(unwinder, frame_cache_misses); } } - if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs, - ctx->thread_state_addr, ctx->base_frame_addr, + if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, + ctx->num_addrs, ctx->thread_state_addr, + ctx->last_profiled.seq, ctx->base_frame_addr, ctx->last_frame_visited) < 0) { return -1; } diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 36115f20d9d4cc..7979cd43c6a127 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -475,8 +475,8 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, return -1; } - // Clear stale last_profiled_frame values from previous profilers - // This prevents us from stopping frame walking early due to stale values + // Clear stale profiler anchors from previous profilers. This prevents us + // from stopping frame walking early due to stale frame pointers. if (cache_frames) { clear_last_profiled_frames(self); } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 81735e85395ac9..29f22f14c9b29f 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -503,7 +503,8 @@ unwind_stack_for_thread( goto error; } - // In cache mode, copying stack chunks is more expensive than direct memory reads + // Cache mode skips this for full hits, but cache misses copy chunks before + // walking so newly stored cache entries come from a stable stack snapshot. if (!unwinder->cache_frames) { if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks"); @@ -528,18 +529,17 @@ unwind_stack_for_thread( if (unwinder->cache_frames) { // Use cache to avoid re-reading unchanged parent frames - ctx.last_profiled_frame = GET_MEMBER(uintptr_t, ts, + ctx.last_profiled.frame = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.last_profiled_frame); + ctx.last_profiled.seq = GET_MEMBER(uintptr_t, ts, + unwinder->debug_offsets.thread_state.last_profiled_frame_seq); if (collect_frames_with_cache(unwinder, &ctx, tid) < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames"); goto error; } // Update last_profiled_frame for next sample if it changed - if (frame_addr != ctx.last_profiled_frame) { - uintptr_t lpf_addr = - *current_tstate + (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame; - if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr, - sizeof(uintptr_t), &frame_addr) < 0) { + if (frame_addr != ctx.last_profiled.frame) { + if (set_last_profiled_frame(unwinder, *current_tstate, frame_addr) < 0) { PyErr_Clear(); // Non-fatal } } diff --git a/Python/pystate.c b/Python/pystate.c index fed1df0173bacf..29f13e92e0dd1f 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1605,6 +1605,8 @@ init_threadstate(_PyThreadStateImpl *_tstate, tstate->current_frame = &_tstate->base_frame; // base_frame pointer for profilers to validate stack unwinding tstate->base_frame = &_tstate->base_frame; + tstate->last_profiled_frame = NULL; + tstate->last_profiled_frame_seq = 0; tstate->datastack_chunk = NULL; tstate->datastack_top = NULL; tstate->datastack_limit = NULL;