Skip to content

Commit fc9fafd

Browse files
committed
gh-151613: Fix remote debugging frame cache ABA
Fixes #151613. The remote debugging frame cache previously used only the last_profiled_frame address as its cache anchor. If a frame returned and a later frame reused the same _PyInterpreterFrame address, the profiler could accept a stale cache entry and splice parent frames from a different call chain into the current stack. This adds a last_profiled_frame_seq counter next to last_profiled_frame, increments it when the anchor advances, stores it in frame cache entries, and validates cache hits against both the frame address and the sequence. Cache miss walks now copy stack chunks before storing new cache entries so stored continuations come from a stable snapshot. The new regression test exercises alternating call chains and checks that cached stacks never contain frames from both branches.
1 parent a8d74c0 commit fc9fafd

14 files changed

Lines changed: 266 additions & 56 deletions

Include/cpython/pystate.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ struct _ts {
143143
struct _PyInterpreterFrame *base_frame;
144144

145145
struct _PyInterpreterFrame *last_profiled_frame;
146+
uintptr_t last_profiled_frame_seq;
146147

147148
Py_tracefunc c_profilefunc;
148149
Py_tracefunc c_tracefunc;

Include/internal/pycore_debug_offsets.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ typedef struct _Py_DebugOffsets {
104104
uint64_t current_frame;
105105
uint64_t base_frame;
106106
uint64_t last_profiled_frame;
107+
uint64_t last_profiled_frame_seq;
107108
uint64_t thread_id;
108109
uint64_t native_thread_id;
109110
uint64_t datastack_chunk;
@@ -294,6 +295,7 @@ typedef struct _Py_DebugOffsets {
294295
.current_frame = offsetof(PyThreadState, current_frame), \
295296
.base_frame = offsetof(PyThreadState, base_frame), \
296297
.last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
298+
.last_profiled_frame_seq = offsetof(PyThreadState, last_profiled_frame_seq), \
297299
.thread_id = offsetof(PyThreadState, thread_id), \
298300
.native_thread_id = offsetof(PyThreadState, native_thread_id), \
299301
.datastack_chunk = offsetof(PyThreadState, datastack_chunk), \

Include/internal/pycore_interpframe.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,12 +292,15 @@ _PyThreadState_GetFrame(PyThreadState *tstate)
292292
// This avoids corrupting the cache when transient frames (called and returned
293293
// between profiler samples) update last_profiled_frame to addresses the
294294
// profiler never saw.
295+
// The sequence distinguishes this anchor from a later frame that reuses the
296+
// same _PyInterpreterFrame address.
295297
#define _PyThreadState_UpdateLastProfiledFrame(tstate, frame, previous) \
296298
do { \
297299
PyThreadState *tstate_ = (tstate); \
298300
_PyInterpreterFrame *frame_ = (frame); \
299301
if (tstate_->last_profiled_frame == frame_) { \
300302
tstate_->last_profiled_frame = (previous); \
303+
tstate_->last_profiled_frame_seq++; \
301304
} \
302305
} while (0)
303306

InternalDocs/frames.md

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -142,22 +142,26 @@ since the frame chain may have been in an inconsistent state due to concurrent u
142142

143143
### Remote Profiling Frame Cache
144144

145-
The `last_profiled_frame` field in `PyThreadState` supports an optimization for
146-
remote profilers that sample call stacks from external processes. When a remote
147-
profiler reads the call stack, it writes the current frame address to this field.
148-
The eval loop then keeps this pointer valid by updating it to the parent frame
149-
whenever a frame returns (in `_PyEval_FrameClearAndPop`).
145+
The `last_profiled_frame` and `last_profiled_frame_seq` fields in
146+
`PyThreadState` support an optimization for remote profilers that sample call
147+
stacks from external processes. When a remote profiler reads the call stack, it
148+
writes the current frame address to `last_profiled_frame`. The eval loop then
149+
keeps this pointer valid by updating it to the parent frame whenever a frame
150+
returns (in `_PyEval_FrameClearAndPop`) and increments the sequence.
150151

151152
This creates a "high-water mark" that always points to a frame still on the stack.
152153
On subsequent samples, the profiler can walk from `current_frame` until it reaches
153-
`last_profiled_frame`, knowing that frames from that point downward are unchanged
154-
and can be retrieved from a cache. This significantly reduces the amount of remote
155-
memory reads needed when call stacks are deep and stable at their base.
156-
157-
The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
158-
`last_profiled_frame` is non-NULL AND matches the frame being popped. This
159-
prevents transient frames (called and returned between profiler samples) from
160-
corrupting the cache pointer, while avoiding any overhead when profiling is inactive.
154+
`last_profiled_frame`, then validate the pointer and sequence before using cached
155+
callers. This prevents a later frame that reuses the same `_PyInterpreterFrame`
156+
address from being mistaken for the sampled frame. The cache significantly
157+
reduces the amount of remote memory reads needed when call stacks are deep and
158+
stable at their base.
159+
160+
The update in `_PyEval_FrameClearAndPop` is guarded: it only advances the
161+
pointer and sequence when `last_profiled_frame` is non-NULL AND matches the
162+
frame being popped. This prevents transient frames (called and returned between
163+
profiler samples) from corrupting the cache anchor, while avoiding any overhead
164+
when profiling is inactive.
161165

162166

163167
### The Instruction Pointer

Lib/test/test_external_inspection.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3451,6 +3451,90 @@ def level1():
34513451
# Parent frames: must match exactly
34523452
self.assertEqual(loc_cached, loc_no_cache)
34533453

3454+
@skip_if_not_supported
3455+
@unittest.skipIf(
3456+
sys.platform != "linux" or not PROCESS_VM_READV_SUPPORTED,
3457+
"Test only runs on Linux with process_vm_readv support",
3458+
)
3459+
def test_cache_rejects_reused_frame_address_aba(self):
3460+
"""Test that frame address reuse cannot splice cached parent frames."""
3461+
script_body = """\
3462+
sock.sendall(b"ready")
3463+
3464+
def burn_a():
3465+
total = 0
3466+
for i in range(20000):
3467+
total += i
3468+
return total
3469+
3470+
def burn_b():
3471+
total = 0
3472+
for i in range(20000):
3473+
total += i
3474+
return total
3475+
3476+
def a_leaf():
3477+
return burn_a()
3478+
3479+
def b_leaf():
3480+
return burn_b()
3481+
3482+
def a_parent():
3483+
return a_leaf()
3484+
3485+
def b_parent():
3486+
return b_leaf()
3487+
3488+
while True:
3489+
a_parent()
3490+
b_parent()
3491+
"""
3492+
3493+
with self._target_process(script_body) as (p, client_socket, _):
3494+
_wait_for_signal(client_socket, b"ready")
3495+
unwinder = RemoteUnwinder(
3496+
p.pid, cache_frames=True, stats=True
3497+
)
3498+
bad_stacks = []
3499+
seen = 0
3500+
branch_a = {"a_parent", "a_leaf", "burn_a"}
3501+
branch_b = {"b_parent", "b_leaf", "burn_b"}
3502+
3503+
for _ in range(8000):
3504+
try:
3505+
traces = unwinder.get_stack_trace()
3506+
except TRANSIENT_ERRORS:
3507+
continue
3508+
for interp in traces:
3509+
for thread in interp.threads:
3510+
in_a = False
3511+
in_b = False
3512+
for frame in thread.frame_info:
3513+
name = frame.funcname
3514+
if name in branch_a:
3515+
in_a = True
3516+
elif name in branch_b:
3517+
in_b = True
3518+
if in_a and in_b:
3519+
break
3520+
if not (in_a or in_b):
3521+
continue
3522+
seen += 1
3523+
if in_a and in_b:
3524+
funcs = [f.funcname for f in thread.frame_info]
3525+
bad_stacks.append(funcs)
3526+
if len(bad_stacks) >= 3:
3527+
break
3528+
3529+
stats = unwinder.get_stats()
3530+
3531+
self.assertGreater(seen, 0)
3532+
self.assertGreater(
3533+
stats["frame_cache_hits"] + stats["frame_cache_partial_hits"], 0
3534+
)
3535+
self.assertGreater(stats["frames_read_from_cache"], 0)
3536+
self.assertEqual(bad_stacks, [])
3537+
34543538
@skip_if_not_supported
34553539
@unittest.skipIf(
34563540
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Fix skewed stack trackes in the Tachyon profiler when caching is enabled and
1+
Fix skewed stack traces in the Tachyon profiler when caching is enabled and
22
when generators and coroutines are profiled, by updating
33
``tstate->last_profiled_frame`` at every frame-removal site. The issue resulted
44
in total erasure of some callers. Patch by Maurycy Pawłowski-Wieroński.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix another way the Tachyon profiler frame cache could produce impossible
2+
mixed stack traces when ``_PyInterpreterFrame`` addresses are reused, by
3+
validating cached frame anchors with a sequence counter.

Modules/_remote_debugging/_remote_debugging.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,9 +225,15 @@ typedef struct {
225225
#define FRAME_CACHE_MAX_THREADS 32
226226
#define FRAME_CACHE_MAX_FRAMES 1024
227227

228+
typedef struct {
229+
uintptr_t frame;
230+
uintptr_t seq;
231+
} FrameCacheAnchor;
232+
228233
typedef struct {
229234
uint64_t thread_id; // 0 = empty slot
230235
uintptr_t thread_state_addr;
236+
uintptr_t last_profiled_frame_seq; // sequence paired with addrs[0]
231237
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
232238
Py_ssize_t num_addrs;
233239
PyObject *thread_id_obj; // owned reference, NULL if empty
@@ -434,7 +440,7 @@ typedef struct {
434440
uintptr_t thread_state_addr; // Owning thread state address
435441
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
436442
uintptr_t gc_frame; // GC frame address (0 if not tracking)
437-
uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
443+
FrameCacheAnchor last_profiled; // Last cached frame anchor
438444
StackChunkList *chunks; // Pre-copied stack chunks
439445
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
440446
RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers
@@ -622,15 +628,21 @@ extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
622628
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
623629
extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
624630
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
631+
extern int set_last_profiled_frame(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr, uintptr_t frame_addr);
625632
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
626633
extern int frame_cache_lookup_and_extend(
627634
RemoteUnwinderObject *unwinder,
628635
uint64_t thread_id,
629-
uintptr_t last_profiled_frame,
636+
uintptr_t thread_state_addr,
637+
FrameCacheAnchor anchor,
630638
PyObject *frame_info,
631639
uintptr_t *frame_addrs,
632640
Py_ssize_t *num_addrs,
633641
Py_ssize_t max_addrs);
642+
extern int frame_cache_anchor_matches(
643+
RemoteUnwinderObject *unwinder,
644+
uintptr_t thread_state_addr,
645+
FrameCacheAnchor anchor);
634646
// Returns: 1 = stored, 0 = not stored (graceful), -1 = error
635647
// Only stores complete stacks that reach base_frame_addr
636648
extern int frame_cache_store(
@@ -640,6 +652,7 @@ extern int frame_cache_store(
640652
const uintptr_t *addrs,
641653
Py_ssize_t num_addrs,
642654
uintptr_t thread_state_addr,
655+
uintptr_t last_profiled_frame_seq,
643656
uintptr_t base_frame_addr,
644657
uintptr_t last_frame_visited);
645658

Modules/_remote_debugging/debug_offsets_validation.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
#define FIELD_SIZE(type, member) sizeof(((type *)0)->member)
3232

3333
enum {
34-
PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 880,
34+
PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 888,
3535
PY_REMOTE_ASYNC_DEBUG_OFFSETS_TOTAL_SIZE = 104,
3636
};
3737

@@ -261,7 +261,8 @@ validate_fixed_field(
261261
APPLY(thread_state, next, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
262262
APPLY(thread_state, current_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
263263
APPLY(thread_state, base_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
264-
APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size)
264+
APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
265+
APPLY(thread_state, last_profiled_frame_seq, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size)
265266

266267
#define PY_REMOTE_DEBUG_INTERPRETER_STATE_FIELDS(APPLY, buffer_size) \
267268
APPLY(interpreter_state, id, sizeof(int64_t), _Alignof(int64_t), buffer_size); \

0 commit comments

Comments
 (0)