Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Include/cpython/pystate.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ struct _ts {
struct _PyInterpreterFrame *base_frame;

struct _PyInterpreterFrame *last_profiled_frame;
uintptr_t last_profiled_frame_seq;

Py_tracefunc c_profilefunc;
Py_tracefunc c_tracefunc;
Expand Down
2 changes: 2 additions & 0 deletions Include/internal/pycore_debug_offsets.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ typedef struct _Py_DebugOffsets {
uint64_t current_frame;
uint64_t base_frame;
uint64_t last_profiled_frame;
uint64_t last_profiled_frame_seq;
uint64_t thread_id;
uint64_t native_thread_id;
uint64_t datastack_chunk;
Expand Down Expand Up @@ -294,6 +295,7 @@ typedef struct _Py_DebugOffsets {
.current_frame = offsetof(PyThreadState, current_frame), \
.base_frame = offsetof(PyThreadState, base_frame), \
.last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
.last_profiled_frame_seq = offsetof(PyThreadState, last_profiled_frame_seq), \
.thread_id = offsetof(PyThreadState, thread_id), \
.native_thread_id = offsetof(PyThreadState, native_thread_id), \
.datastack_chunk = offsetof(PyThreadState, datastack_chunk), \
Expand Down
3 changes: 3 additions & 0 deletions Include/internal/pycore_interpframe.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,12 +292,15 @@ _PyThreadState_GetFrame(PyThreadState *tstate)
// This avoids corrupting the cache when transient frames (called and returned
// between profiler samples) update last_profiled_frame to addresses the
// profiler never saw.
// The sequence distinguishes this anchor from a later frame that reuses the
// same _PyInterpreterFrame address.
#define _PyThreadState_UpdateLastProfiledFrame(tstate, frame, previous) \
do { \
PyThreadState *tstate_ = (tstate); \
_PyInterpreterFrame *frame_ = (frame); \
if (tstate_->last_profiled_frame == frame_) { \
tstate_->last_profiled_frame = (previous); \
tstate_->last_profiled_frame_seq++; \
} \
} while (0)

Expand Down
30 changes: 17 additions & 13 deletions InternalDocs/frames.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,22 +142,26 @@ since the frame chain may have been in an inconsistent state due to concurrent u

### Remote Profiling Frame Cache

The `last_profiled_frame` field in `PyThreadState` supports an optimization for
remote profilers that sample call stacks from external processes. When a remote
profiler reads the call stack, it writes the current frame address to this field.
The eval loop then keeps this pointer valid by updating it to the parent frame
whenever a frame returns (in `_PyEval_FrameClearAndPop`).
The `last_profiled_frame` and `last_profiled_frame_seq` fields in
`PyThreadState` support an optimization for remote profilers that sample call
stacks from external processes. When a remote profiler reads the call stack, it
writes the current frame address to `last_profiled_frame`. The eval loop then
keeps this pointer valid by updating it to the parent frame whenever a frame
returns (in `_PyEval_FrameClearAndPop`) and increments the sequence.

This creates a "high-water mark" that always points to a frame still on the stack.
On subsequent samples, the profiler can walk from `current_frame` until it reaches
`last_profiled_frame`, knowing that frames from that point downward are unchanged
and can be retrieved from a cache. This significantly reduces the amount of remote
memory reads needed when call stacks are deep and stable at their base.

The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
`last_profiled_frame` is non-NULL AND matches the frame being popped. This
prevents transient frames (called and returned between profiler samples) from
corrupting the cache pointer, while avoiding any overhead when profiling is inactive.
`last_profiled_frame`, then validate the pointer and sequence before using cached
callers. This prevents a later frame that reuses the same `_PyInterpreterFrame`
address from being mistaken for the sampled frame. The cache significantly
reduces the amount of remote memory reads needed when call stacks are deep and
stable at their base.

The update in `_PyEval_FrameClearAndPop` is guarded: it only advances the
pointer and sequence when `last_profiled_frame` is non-NULL AND matches the
frame being popped. This prevents transient frames (called and returned between
profiler samples) from corrupting the cache anchor, while avoiding any overhead
when profiling is inactive.


### The Instruction Pointer
Expand Down
84 changes: 84 additions & 0 deletions Lib/test/test_external_inspection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3451,6 +3451,90 @@ def level1():
# Parent frames: must match exactly
self.assertEqual(loc_cached, loc_no_cache)

@skip_if_not_supported
@unittest.skipIf(
sys.platform != "linux" or not PROCESS_VM_READV_SUPPORTED,
"Test only runs on Linux with process_vm_readv support",
)
def test_cache_rejects_reused_frame_address_aba(self):
"""Test that frame address reuse cannot splice cached parent frames."""
script_body = """\
sock.sendall(b"ready")

def burn_a():
total = 0
for i in range(20000):
total += i
return total

def burn_b():
total = 0
for i in range(20000):
total += i
return total

def a_leaf():
return burn_a()

def b_leaf():
return burn_b()

def a_parent():
return a_leaf()

def b_parent():
return b_leaf()

while True:
a_parent()
b_parent()
"""

with self._target_process(script_body) as (p, client_socket, _):
_wait_for_signal(client_socket, b"ready")
unwinder = RemoteUnwinder(
p.pid, cache_frames=True, stats=True
)
bad_stacks = []
seen = 0
branch_a = {"a_parent", "a_leaf", "burn_a"}
branch_b = {"b_parent", "b_leaf", "burn_b"}

for _ in range(8000):
try:
traces = unwinder.get_stack_trace()
except TRANSIENT_ERRORS:
continue
for interp in traces:
for thread in interp.threads:
in_a = False
in_b = False
for frame in thread.frame_info:
name = frame.funcname
if name in branch_a:
in_a = True
elif name in branch_b:
in_b = True
if in_a and in_b:
break
if not (in_a or in_b):
continue
seen += 1
if in_a and in_b:
funcs = [f.funcname for f in thread.frame_info]
bad_stacks.append(funcs)
if len(bad_stacks) >= 3:
break

stats = unwinder.get_stats()

self.assertGreater(seen, 0)
self.assertGreater(
stats["frame_cache_hits"] + stats["frame_cache_partial_hits"], 0
)
self.assertGreater(stats["frames_read_from_cache"], 0)
self.assertEqual(bad_stacks, [])

@skip_if_not_supported
@unittest.skipIf(
sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Fix skewed stack trackes in the Tachyon profiler when caching is enabled and
Fix skewed stack traces in the Tachyon profiler when caching is enabled and
when generators and coroutines are profiled, by updating
``tstate->last_profiled_frame`` at every frame-removal site. The issue resulted
in total erasure of some callers. Patch by Maurycy Pawłowski-Wieroński.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix another way the Tachyon profiler frame cache could produce impossible
mixed stack traces when ``_PyInterpreterFrame`` addresses are reused, by
validating cached frame anchors with a sequence counter.
17 changes: 15 additions & 2 deletions Modules/_remote_debugging/_remote_debugging.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,15 @@ typedef struct {
#define FRAME_CACHE_MAX_THREADS 32
#define FRAME_CACHE_MAX_FRAMES 1024

typedef struct {
uintptr_t frame;
uintptr_t seq;
} FrameCacheAnchor;

typedef struct {
uint64_t thread_id; // 0 = empty slot
uintptr_t thread_state_addr;
uintptr_t last_profiled_frame_seq; // sequence paired with addrs[0]
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
Py_ssize_t num_addrs;
PyObject *thread_id_obj; // owned reference, NULL if empty
Expand Down Expand Up @@ -434,7 +440,7 @@ typedef struct {
uintptr_t thread_state_addr; // Owning thread state address
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
uintptr_t gc_frame; // GC frame address (0 if not tracking)
uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
FrameCacheAnchor last_profiled; // Last cached frame anchor
StackChunkList *chunks; // Pre-copied stack chunks
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers
Expand Down Expand Up @@ -622,15 +628,21 @@ extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
extern int set_last_profiled_frame(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr, uintptr_t frame_addr);
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
extern int frame_cache_lookup_and_extend(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
uintptr_t last_profiled_frame,
uintptr_t thread_state_addr,
FrameCacheAnchor anchor,
PyObject *frame_info,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs);
extern int frame_cache_anchor_matches(
RemoteUnwinderObject *unwinder,
uintptr_t thread_state_addr,
FrameCacheAnchor anchor);
// Returns: 1 = stored, 0 = not stored (graceful), -1 = error
// Only stores complete stacks that reach base_frame_addr
extern int frame_cache_store(
Expand All @@ -640,6 +652,7 @@ extern int frame_cache_store(
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
uintptr_t last_profiled_frame_seq,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited);

Expand Down
5 changes: 3 additions & 2 deletions Modules/_remote_debugging/debug_offsets_validation.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
#define FIELD_SIZE(type, member) sizeof(((type *)0)->member)

enum {
PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 880,
PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 888,
PY_REMOTE_ASYNC_DEBUG_OFFSETS_TOTAL_SIZE = 104,
};

Expand Down Expand Up @@ -261,7 +261,8 @@ validate_fixed_field(
APPLY(thread_state, next, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
APPLY(thread_state, current_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
APPLY(thread_state, base_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size)
APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
APPLY(thread_state, last_profiled_frame_seq, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size)

#define PY_REMOTE_DEBUG_INTERPRETER_STATE_FIELDS(APPLY, buffer_size) \
APPLY(interpreter_state, id, sizeof(int64_t), _Alignof(int64_t), buffer_size); \
Expand Down
Loading
Loading