@@ -30,6 +30,7 @@ extern "C" {
3030#include "internal/pycore_llist.h" // struct llist_node
3131#include "internal/pycore_long.h" // _PyLong_GetZero
3232#include "internal/pycore_pyerrors.h" // _PyErr_FormatFromCause
33+ #include "internal/pycore_pyhash.h" // _Py_HashPointerRaw
3334#include "internal/pycore_stackref.h" // Py_TAG_BITS
3435#include "../../Python/remote_debug.h"
3536
@@ -215,6 +216,8 @@ typedef struct {
215216 PyObject * file_name ;
216217 int first_lineno ;
217218 PyObject * linetable ; // bytes
219+ PyObject * last_frame_info ;
220+ ptrdiff_t last_addrq ;
218221 uintptr_t addr_code_adaptive ;
219222} CachedCodeMetadata ;
220223
@@ -224,11 +227,41 @@ typedef struct {
224227
225228typedef struct {
226229 uint64_t thread_id ; // 0 = empty slot
230+ uintptr_t thread_state_addr ; // NOTE(review): presumably the key matched by frame_cache_find_by_tstate() -- confirm in the .c file
227231 uintptr_t addrs [FRAME_CACHE_MAX_FRAMES ];
228232 Py_ssize_t num_addrs ;
233+ PyObject * thread_id_obj ; // owned reference, NULL if empty
229234 PyObject * frame_list ; // owned reference, NULL if empty
230235} FrameCacheEntry ;
231236
237+ #define INTERPRETER_THREAD_CACHE_SIZE 32 // slot count of both cached_tstates[] and cached_generations[]
238+ #if (INTERPRETER_THREAD_CACHE_SIZE & (INTERPRETER_THREAD_CACHE_SIZE - 1 )) != 0
239+ # error "INTERPRETER_THREAD_CACHE_SIZE must be a power of two"
240+ #endif // NOTE(review): power of two presumably lets a hash be reduced with a mask -- confirm at the lookup site
241+
242+ // The two per-interpreter L2 caches below are split into per-field tables so
243+ // that a writer rebinding one slot cannot leave stale data in a field owned by
244+ // the other when the slot is reused across interpreters.
245+ typedef struct {
246+ uintptr_t interpreter_addr ; // interpreter this slot is currently bound to (slots are reused across interpreters)
247+ uintptr_t thread_state_addr ; // thread state address cached for that interpreter; which thread state is not visible from this header
248+ } InterpreterTstateCacheEntry ;
249+ typedef struct {
250+ uintptr_t interpreter_addr ; // interpreter this slot is currently bound to (slots are reused across interpreters)
251+ uint64_t code_object_generation ; // code object generation value cached for that interpreter
252+ } InterpreterGenerationCacheEntry ;
253+
254+ // Carries already-read thread state and/or frame buffers across helpers so the
255+ // downstream callee can skip a remote read. Address fields are caller-supplied
256+ // inputs; buffer pointers (tstate, frame) are NULL unless a prior batched read
257+ // successfully populated them.
258+ typedef struct {
259+ const char * tstate ; // already-read thread state buffer; NULL unless a prior batched read populated it
260+ uintptr_t tstate_addr ; // caller-supplied input address
261+ const char * frame ; // already-read frame buffer; NULL unless a prior batched read populated it
262+ uintptr_t frame_addr ; // caller-supplied input address
263+ } RemoteReadPrefetch ;
264+
232265/* Statistics for profiling performance analysis */
233266typedef struct {
234267 uint64_t total_samples ; // Total number of get_stack_trace calls
@@ -242,14 +275,44 @@ typedef struct {
242275 uint64_t code_object_cache_hits ; // Code object cache hits
243276 uint64_t code_object_cache_misses ; // Code object cache misses
244277 uint64_t stale_cache_invalidations ; // Times stale entries were cleared
278+ uint64_t batched_read_attempts ; // Batched remote-read attempts
279+ uint64_t batched_read_successes ; // Attempts that read all requested segments
280+ uint64_t batched_read_misses ; // Attempts that fell back or partially read
281+ uint64_t batched_read_segments_requested ; // Segments requested by batched reads
282+ uint64_t batched_read_segments_completed ; // Segments completed by batched reads
245283} UnwinderStats ;
246284
285+ #if defined(__GNUC__ ) || defined(__clang__ )
286+ # define REMOTE_DEBUG_UNLIKELY (value ) __builtin_expect(!!(value), 0) // hint that value is normally false; !! normalizes it to 0/1
287+ #else
288+ # define REMOTE_DEBUG_UNLIKELY (value ) (value) // no branch hint on other compilers: plain pass-through, not normalized
289+ #endif
290+
247291/* Stats tracking macros - no-op when stats collection is disabled */
248292#define STATS_INC (unwinder , field ) \
249- do { if (( unwinder)->collect_stats) (unwinder)->stats.field++; } while(0)
293+ do { if (REMOTE_DEBUG_UNLIKELY(( unwinder)->collect_stats) ) (unwinder)->stats.field++; } while(0) // collect_stats is hinted as usually false
250294
251295#define STATS_ADD (unwinder , field , val ) \
252- do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0)
296+ do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field += (val); } while(0) // collect_stats is hinted as usually false
297+
298+ #if HAVE_PROCESS_VM_READV
299+ # define STATS_BATCHED_READ (unwinder , requested , completed ) \
300+ do { \
301+ if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) { \
302+ (unwinder)->stats.batched_read_attempts++; \
303+ (unwinder)->stats.batched_read_segments_requested += (uint64_t)(requested); \
304+ (unwinder)->stats.batched_read_segments_completed += (uint64_t)(completed); \
305+ if ((completed) == (requested)) { \
306+ (unwinder)->stats.batched_read_successes++; \
307+ } \
308+ else { \
309+ (unwinder)->stats.batched_read_misses++; \
310+ } \
311+ } \
312+ } while(0) // NOTE: requested and completed are each expanded twice above; pass side-effect-free expressions
313+ #else
314+ # define STATS_BATCHED_READ (unwinder , requested , completed ) ((void)0) // arguments are not evaluated at all in this configuration
315+ #endif
253316
254317typedef struct {
255318 PyTypeObject * RemoteDebugging_Type ;
@@ -290,7 +353,6 @@ typedef struct {
290353 struct _Py_AsyncioModuleDebugOffsets async_debug_offsets ;
291354 uintptr_t interpreter_addr ;
292355 uintptr_t tstate_addr ;
293- uint64_t code_object_generation ;
294356 _Py_hashtable_t * code_object_cache ;
295357 int debug ;
296358 int only_active_thread ;
@@ -302,9 +364,17 @@ typedef struct {
302364 int cache_frames ;
303365 int collect_stats ; // whether to collect statistics
304366 uint32_t stale_invalidation_counter ; // counter for throttling frame_cache_invalidate_stale
367+ // L1 single-entry shortcuts over cached_tstates[] and cached_generations[]: most
368+ // workloads sample one interpreter, so check these pairs before hashing into the tables below.
369+ uintptr_t cached_tstate_interpreter_addr ;
370+ uintptr_t cached_tstate_addr ;
371+ uintptr_t cached_generation_interpreter_addr ;
372+ uint64_t cached_code_object_generation ;
305373 RemoteDebuggingState * cached_state ;
306374 FrameCacheEntry * frame_cache ; // preallocated array of FRAME_CACHE_MAX_THREADS entries
307375 UnwinderStats stats ; // statistics for performance analysis
376+ InterpreterTstateCacheEntry cached_tstates [INTERPRETER_THREAD_CACHE_SIZE ];
377+ InterpreterGenerationCacheEntry cached_generations [INTERPRETER_THREAD_CACHE_SIZE ];
308378#ifdef Py_GIL_DISABLED
309379 uint32_t tlbc_generation ;
310380 _Py_hashtable_t * tlbc_cache ;
@@ -361,11 +431,13 @@ typedef struct {
361431typedef struct {
362432 /* Inputs */
363433 uintptr_t frame_addr ; // Starting frame address
434+ uintptr_t thread_state_addr ; // Owning thread state address
364435 uintptr_t base_frame_addr ; // Sentinel at bottom (for validation)
365436 uintptr_t gc_frame ; // GC frame address (0 if not tracking)
366437 uintptr_t last_profiled_frame ; // Last cached frame (0 if no cache)
367438 StackChunkList * chunks ; // Pre-copied stack chunks
368439 int skip_first_frame ; // Skip frame_addr itself (continue from its caller)
440+ RemoteReadPrefetch prefetch ; // Optional already-read thread/frame buffers
369441
370442 /* Outputs */
371443 PyObject * frame_info ; // List to append FrameInfo objects
@@ -548,6 +620,7 @@ extern int process_frame_chain(
548620extern int frame_cache_init (RemoteUnwinderObject * unwinder );
549621extern void frame_cache_cleanup (RemoteUnwinderObject * unwinder );
550622extern FrameCacheEntry * frame_cache_find (RemoteUnwinderObject * unwinder , uint64_t thread_id );
623+ extern FrameCacheEntry * frame_cache_find_by_tstate (RemoteUnwinderObject * unwinder , uintptr_t tstate_addr );
551624extern int clear_last_profiled_frames (RemoteUnwinderObject * unwinder );
552625extern void frame_cache_invalidate_stale (RemoteUnwinderObject * unwinder , PyObject * result );
553626extern int frame_cache_lookup_and_extend (
@@ -566,6 +639,7 @@ extern int frame_cache_store(
566639 PyObject * frame_list ,
567640 const uintptr_t * addrs ,
568641 Py_ssize_t num_addrs ,
642+ uintptr_t thread_state_addr ,
569643 uintptr_t base_frame_addr ,
570644 uintptr_t last_frame_visited );
571645
@@ -605,7 +679,8 @@ extern PyObject* unwind_stack_for_thread(
605679 uintptr_t * current_tstate ,
606680 uintptr_t gil_holder_tstate ,
607681 uintptr_t gc_frame ,
608- uintptr_t main_thread_tstate
682+ uintptr_t main_thread_tstate ,
683+ const RemoteReadPrefetch * prefetch
609684);
610685
611686/* Thread stopping functions (for blocking mode) */
0 commit comments