From c5520d361c87a7993799e3e98db335d83aec7070 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 10 May 2026 18:56:27 +0100 Subject: [PATCH 1/9] gh-149584: Avoid page reads for hot profiler structs Use exact remote reads for interpreter state, thread state, and interpreter frame structs instead of pulling full remote pages into the profiler page cache. This matches the core change from python/cpython#149585. --- Modules/_remote_debugging/frames.c | 2 +- Modules/_remote_debugging/module.c | 2 +- Modules/_remote_debugging/threads.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index bbdfce3f7201d9..7e56576392737b 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -197,7 +197,7 @@ parse_frame_object( char frame[SIZEOF_INTERP_FRAME]; *address_of_code_object = 0; - Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory( &unwinder->handle, address, SIZEOF_INTERP_FRAME, diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index efdd2e1a2d7b7a..50332645b0197e 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -537,7 +537,7 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self while (current_interpreter != 0) { // Read interpreter state to get the interpreter ID char interp_state_buffer[INTERP_STATE_BUFFER_SIZE]; - if (_Py_RemoteDebug_PagedReadRemoteMemory( + if (_Py_RemoteDebug_ReadRemoteMemory( &self->handle, current_interpreter, INTERP_STATE_BUFFER_SIZE, diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 4daa5e5f92bcd9..31d83f561a8ddf 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -303,7 +303,7 @@ unwind_stack_for_thread( StackChunkList chunks = {0}; char 
ts[SIZEOF_THREAD_STATE]; - int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory( + int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( &unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts); if (bytes_read < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); From 8be8d7d6a825d5ab1aaa6e25b81a31c02f90eee5 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 10 May 2026 18:56:57 +0100 Subject: [PATCH 2/9] gh-149584: Track live remote page cache entries The profiler clears the page cache between samples, so live entries are always packed at the front. Track the live count and only clear/search that prefix instead of scanning all 1024 slots on the hot path. --- Python/remote_debug.h | 55 +++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/Python/remote_debug.h b/Python/remote_debug.h index 6c089a834dcd40..be7fdb7cfa9428 100644 --- a/Python/remote_debug.h +++ b/Python/remote_debug.h @@ -147,6 +147,7 @@ typedef struct { int memfd; #endif page_cache_entry_t pages[MAX_PAGES]; + int page_cache_count; Py_ssize_t page_size; } proc_handle_t; @@ -185,14 +186,16 @@ _Py_RemoteDebug_FreePageCache(proc_handle_t *handle) handle->pages[i].data = NULL; handle->pages[i].valid = 0; } + handle->page_cache_count = 0; } UNUSED static void _Py_RemoteDebug_ClearCache(proc_handle_t *handle) { - for (int i = 0; i < MAX_PAGES; i++) { + for (int i = 0; i < handle->page_cache_count; i++) { handle->pages[i].valid = 0; } + handle->page_cache_count = 0; } #if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX @@ -222,6 +225,7 @@ _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) { handle->memfd = -1; #endif handle->page_size = get_page_size(); + handle->page_cache_count = 0; for (int i = 0; i < MAX_PAGES; i++) { handle->pages[i].data = NULL; handle->pages[i].valid = 0; @@ -1287,8 +1291,9 @@ 
_Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); } - // Search for valid cached page - for (int i = 0; i < MAX_PAGES; i++) { + // Search only the pages used since the last clear. The cache is cleared + // between profiler samples, so entries are packed at the front. + for (int i = 0; i < handle->page_cache_count; i++) { page_cache_entry_t *entry = &handle->pages[i]; if (entry->valid && entry->page_addr == page_base) { memcpy(out, entry->data + offset_in_page, size); @@ -1296,33 +1301,31 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, } } - // Find reusable slot - for (int i = 0; i < MAX_PAGES; i++) { - page_cache_entry_t *entry = &handle->pages[i]; - if (!entry->valid) { + if (handle->page_cache_count < MAX_PAGES) { + page_cache_entry_t *entry = &handle->pages[handle->page_cache_count]; + if (entry->data == NULL) { + entry->data = PyMem_RawMalloc(page_size); if (entry->data == NULL) { - entry->data = PyMem_RawMalloc(page_size); - if (entry->data == NULL) { - PyErr_NoMemory(); - _set_debug_exception_cause(PyExc_MemoryError, - "Cannot allocate %zu bytes for page cache entry " - "during read from PID %d at address 0x%lx", - page_size, handle->pid, addr); - return -1; - } - } - - if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) { - // Try to just copy the exact amount as a fallback - PyErr_Clear(); - goto fallback; + PyErr_NoMemory(); + _set_debug_exception_cause(PyExc_MemoryError, + "Cannot allocate %zu bytes for page cache entry " + "during read from PID %d at address 0x%lx", + page_size, handle->pid, addr); + return -1; } + } - entry->page_addr = page_base; - entry->valid = 1; - memcpy(out, entry->data + offset_in_page, size); - return 0; + if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) { + // Try to just copy the exact amount as a fallback + PyErr_Clear(); + goto fallback; } + + entry->page_addr = 
page_base; + entry->valid = 1; + handle->page_cache_count++; + memcpy(out, entry->data + offset_in_page, size); + return 0; } fallback: From 5dc0309fb79fa09d4f4961d589982a6cea96247c Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 10 May 2026 18:59:37 +0100 Subject: [PATCH 3/9] gh-149584: Batch predicted profiler reads Use the frame cache to predict the next thread state and top frame address, then batch interpreter/thread/frame reads with process_vm_readv when profiling a Linux target. Reuse prefetched frame buffers in the frame walker when the prediction is valid. --- Modules/_remote_debugging/_remote_debugging.h | 13 ++- Modules/_remote_debugging/frame_cache.c | 18 ++++ Modules/_remote_debugging/frames.c | 79 +++++++++++----- Modules/_remote_debugging/module.c | 76 ++++++++++++++-- Modules/_remote_debugging/threads.c | 89 +++++++++++++++++-- Python/remote_debug.h | 43 +++++++++ 6 files changed, 285 insertions(+), 33 deletions(-) diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 7369cd1514c296..df1f8cd1a12292 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -224,6 +224,7 @@ typedef struct { typedef struct { uint64_t thread_id; // 0 = empty slot + uintptr_t thread_state_addr; uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; Py_ssize_t num_addrs; PyObject *frame_list; // owned reference, NULL if empty @@ -302,6 +303,7 @@ typedef struct { int cache_frames; int collect_stats; // whether to collect statistics uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale + uintptr_t cached_tstate_addr; // predicted first thread for batched reads RemoteDebuggingState *cached_state; FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries UnwinderStats stats; // statistics for performance analysis @@ -361,11 +363,14 @@ typedef struct { typedef struct { /* Inputs */ uintptr_t 
frame_addr; // Starting frame address + uintptr_t thread_state_addr; // Owning thread state address uintptr_t base_frame_addr; // Sentinel at bottom (for validation) uintptr_t gc_frame; // GC frame address (0 if not tracking) uintptr_t last_profiled_frame; // Last cached frame (0 if no cache) StackChunkList *chunks; // Pre-copied stack chunks int skip_first_frame; // Skip frame_addr itself (continue from its caller) + const char *prefetched_frame; // Optional already-read frame buffer + uintptr_t prefetched_frame_addr; // Remote address for prefetched_frame /* Outputs */ PyObject *frame_info; // List to append FrameInfo objects @@ -548,6 +553,7 @@ extern int process_frame_chain( extern int frame_cache_init(RemoteUnwinderObject *unwinder); extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder); extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id); +extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr); extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder); extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result); extern int frame_cache_lookup_and_extend( @@ -566,6 +572,7 @@ extern int frame_cache_store( PyObject *frame_list, const uintptr_t *addrs, Py_ssize_t num_addrs, + uintptr_t thread_state_addr, uintptr_t base_frame_addr, uintptr_t last_frame_visited); @@ -605,7 +612,11 @@ extern PyObject* unwind_stack_for_thread( uintptr_t *current_tstate, uintptr_t gil_holder_tstate, uintptr_t gc_frame, - uintptr_t main_thread_tstate + uintptr_t main_thread_tstate, + const char *prefetched_tstate, + uintptr_t prefetched_tstate_addr, + const char *prefetched_frame, + uintptr_t prefetched_frame_addr ); /* Thread stopping functions (for blocking mode) */ diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c index b6566d7cff7b54..d2ec63925680c6 100644 --- 
a/Modules/_remote_debugging/frame_cache.c +++ b/Modules/_remote_debugging/frame_cache.c @@ -53,6 +53,21 @@ frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id) return NULL; } +FrameCacheEntry * +frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr) +{ + if (!unwinder->frame_cache || tstate_addr == 0) { + return NULL; + } + for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + if (unwinder->frame_cache[i].thread_state_addr == tstate_addr) { + assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES); + return &unwinder->frame_cache[i]; + } + } + return NULL; +} + // Allocate a cache slot for a thread // Returns NULL if cache is full (graceful degradation) static FrameCacheEntry * @@ -129,6 +144,7 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result) // Clear this entry Py_CLEAR(unwinder->frame_cache[i].frame_list); unwinder->frame_cache[i].thread_id = 0; + unwinder->frame_cache[i].thread_state_addr = 0; unwinder->frame_cache[i].num_addrs = 0; STATS_INC(unwinder, stale_cache_invalidations); } @@ -216,6 +232,7 @@ frame_cache_store( PyObject *frame_list, const uintptr_t *addrs, Py_ssize_t num_addrs, + uintptr_t thread_state_addr, uintptr_t base_frame_addr, uintptr_t last_frame_visited) { @@ -257,6 +274,7 @@ frame_cache_store( return -1; } entry->thread_id = thread_id; + entry->thread_state_addr = thread_state_addr; memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t)); entry->num_addrs = num_addrs; assert(entry->num_addrs == num_addrs); diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index 7e56576392737b..3a7e44f8075acc 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -186,30 +186,16 @@ is_frame_valid( return 1; } -int -parse_frame_object( +static int +parse_frame_buffer( RemoteUnwinderObject *unwinder, PyObject** result, - uintptr_t address, + const char *frame, uintptr_t* address_of_code_object, 
uintptr_t* previous_frame ) { - char frame[SIZEOF_INTERP_FRAME]; *address_of_code_object = 0; - Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - &unwinder->handle, - address, - SIZEOF_INTERP_FRAME, - frame - ); - if (bytes_read < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame"); - return -1; - } - STATS_INC(unwinder, memory_reads); - STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); - *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous); uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable); int frame_valid = is_frame_valid(unwinder, (uintptr_t)frame, code_object); @@ -237,6 +223,31 @@ parse_frame_object( return parse_code_object(unwinder, result, &code_ctx); } +int +parse_frame_object( + RemoteUnwinderObject *unwinder, + PyObject** result, + uintptr_t address, + uintptr_t* address_of_code_object, + uintptr_t* previous_frame +) { + char frame[SIZEOF_INTERP_FRAME]; + Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, + address, + SIZEOF_INTERP_FRAME, + frame + ); + if (bytes_read < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame"); + return -1; + } + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); + + return parse_frame_buffer(unwinder, result, frame, address_of_code_object, previous_frame); +} + int parse_frame_from_chunks( RemoteUnwinderObject *unwinder, @@ -312,15 +323,32 @@ process_frame_chain( } assert(frame_count <= MAX_FRAMES); - if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) < 0) { + if (ctx->chunks && ctx->chunks->count > 0) { + if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) == 0) { + goto parsed_frame; + } PyErr_Clear(); + } + { uintptr_t 
address_of_code_object = 0; - if (parse_frame_object(unwinder, &frame, frame_addr, &address_of_code_object, &next_frame_addr) < 0) { + int parse_result; + if (ctx->prefetched_frame && ctx->prefetched_frame_addr == frame_addr) { + parse_result = parse_frame_buffer( + unwinder, &frame, ctx->prefetched_frame, + &address_of_code_object, &next_frame_addr); + } + else { + parse_result = parse_frame_object( + unwinder, &frame, frame_addr, + &address_of_code_object, &next_frame_addr); + } + if (parse_result < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to parse frame object in chain"); return -1; } } +parsed_frame: // Skip first frame if requested (used for cache miss continuation) if (ctx->skip_first_frame && frame_count == 1) { Py_XDECREF(frame); @@ -501,8 +529,16 @@ try_full_cache_hit( PyObject *current_frame = NULL; uintptr_t code_object_addr = 0; uintptr_t previous_frame = 0; - int parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr, + int parse_result; + if (ctx->prefetched_frame && ctx->prefetched_frame_addr == ctx->frame_addr) { + parse_result = parse_frame_buffer(unwinder, &current_frame, + ctx->prefetched_frame, &code_object_addr, &previous_frame); + } + else { + parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr, + &code_object_addr, &previous_frame); + } if (parse_result < 0) { return -1; } @@ -606,7 +642,8 @@ collect_frames_with_cache( } if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs, - ctx->base_frame_addr, ctx->last_frame_visited) < 0) { + ctx->thread_state_addr, ctx->base_frame_addr, + ctx->last_frame_visited) < 0) { return -1; } diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 50332645b0197e..170fa8aa069c2b 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -360,6 +360,7 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, self->cache_frames = 
cache_frames; self->collect_stats = stats; self->stale_invalidation_counter = 0; + self->cached_tstate_addr = 0; self->debug = debug; self->only_active_thread = only_active_thread; self->mode = mode; @@ -473,6 +474,46 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, return 0; } +static int +read_interp_state_and_maybe_thread_frame( + RemoteUnwinderObject *unwinder, + uintptr_t interpreter_addr, + char *interp_state_buffer, + uintptr_t predicted_tstate_addr, + char *tstate_buffer, + int *tstate_read, + uintptr_t predicted_frame_addr, + char *frame_buffer, + int *frame_read) +{ + *tstate_read = 0; + *frame_read = 0; + if (predicted_tstate_addr != 0) { + size_t tstate_size = (size_t)unwinder->debug_offsets.thread_state.size; + _Py_RemoteReadSegment segments[3] = { + {interpreter_addr, interp_state_buffer, INTERP_STATE_BUFFER_SIZE}, + {predicted_tstate_addr, tstate_buffer, tstate_size}, + {predicted_frame_addr, frame_buffer, SIZEOF_INTERP_FRAME}, + }; + int nsegs = predicted_frame_addr != 0 ? 
3 : 2; + Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory( + &unwinder->handle, segments, nsegs); + if (nread >= (Py_ssize_t)INTERP_STATE_BUFFER_SIZE) { + Py_ssize_t with_tstate = (Py_ssize_t)INTERP_STATE_BUFFER_SIZE + + (Py_ssize_t)tstate_size; + *tstate_read = nread >= with_tstate; + *frame_read = nsegs == 3 + && nread == with_tstate + (Py_ssize_t)SIZEOF_INTERP_FRAME; + return 0; + } + } + return _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, + interpreter_addr, + INTERP_STATE_BUFFER_SIZE, + interp_state_buffer); +} + /*[clinic input] @permit_long_docstring_body @critical_section @@ -537,11 +578,29 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self while (current_interpreter != 0) { // Read interpreter state to get the interpreter ID char interp_state_buffer[INTERP_STATE_BUFFER_SIZE]; - if (_Py_RemoteDebug_ReadRemoteMemory( - &self->handle, + char prefetched_tstate[SIZEOF_THREAD_STATE]; + char prefetched_frame[SIZEOF_INTERP_FRAME]; + int have_prefetched_tstate = 0; + int have_prefetched_frame = 0; + uintptr_t predicted_tstate_addr = self->cache_frames ? 
self->cached_tstate_addr : 0; + uintptr_t predicted_frame_addr = 0; + if (predicted_tstate_addr != 0) { + FrameCacheEntry *entry = frame_cache_find_by_tstate(self, predicted_tstate_addr); + if (entry && entry->num_addrs > 0) { + predicted_frame_addr = entry->addrs[0]; + } + } + + if (read_interp_state_and_maybe_thread_frame( + self, current_interpreter, - INTERP_STATE_BUFFER_SIZE, - interp_state_buffer) < 0) { + interp_state_buffer, + predicted_tstate_addr, + prefetched_tstate, + &have_prefetched_tstate, + predicted_frame_addr, + prefetched_frame, + &have_prefetched_frame) < 0) { set_exception_cause(self, PyExc_RuntimeError, "Failed to read interpreter state buffer"); Py_CLEAR(result); goto exit; @@ -611,6 +670,9 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self // Target specific thread (only process first interpreter) current_tstate = self->tstate_addr; } + if (current_tstate != 0) { + self->cached_tstate_addr = current_tstate; + } // Acquire main thread state information uintptr_t main_thread_tstate = GET_MEMBER(uintptr_t, interp_state_buffer, @@ -621,7 +683,11 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self PyObject* frame_info = unwind_stack_for_thread(self, &current_tstate, gil_holder_tstate, gc_frame, - main_thread_tstate); + main_thread_tstate, + have_prefetched_tstate ? prefetched_tstate : NULL, + predicted_tstate_addr, + have_prefetched_frame ? 
prefetched_frame : NULL, + predicted_frame_addr); if (!frame_info) { // Check if this was an intentional skip due to mode-based filtering if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL || diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 31d83f561a8ddf..fa89cf6406736d 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -289,13 +289,44 @@ typedef struct { unsigned int :24; } _thread_status; +static int +read_thread_state_and_maybe_frame( + RemoteUnwinderObject *unwinder, + uintptr_t tstate_addr, + size_t tstate_size, + char *tstate_buffer, + uintptr_t predicted_frame_addr, + char *frame_buffer, + int *frame_read) +{ + *frame_read = 0; + if (predicted_frame_addr != 0) { + _Py_RemoteReadSegment segments[2] = { + {tstate_addr, tstate_buffer, tstate_size}, + {predicted_frame_addr, frame_buffer, SIZEOF_INTERP_FRAME}, + }; + Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory( + &unwinder->handle, segments, 2); + if (nread >= (Py_ssize_t)tstate_size) { + *frame_read = nread == (Py_ssize_t)(tstate_size + SIZEOF_INTERP_FRAME); + return 0; + } + } + return _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, tstate_addr, tstate_size, tstate_buffer); +} + PyObject* unwind_stack_for_thread( RemoteUnwinderObject *unwinder, uintptr_t *current_tstate, uintptr_t gil_holder_tstate, uintptr_t gc_frame, - uintptr_t main_thread_tstate + uintptr_t main_thread_tstate, + const char *prefetched_tstate, + uintptr_t prefetched_tstate_addr, + const char *prefetched_frame, + uintptr_t prefetched_frame_addr ) { PyObject *frame_info = NULL; PyObject *thread_id = NULL; @@ -303,14 +334,57 @@ unwind_stack_for_thread( StackChunkList chunks = {0}; char ts[SIZEOF_THREAD_STATE]; - int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( - &unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts); - if (bytes_read < 0) { - set_exception_cause(unwinder, 
PyExc_RuntimeError, "Failed to read thread state"); - goto error; + char local_prefetched_frame[SIZEOF_INTERP_FRAME]; + const char *prefetched_frame_for_ctx = NULL; + int have_prefetched_frame = 0; + uintptr_t predicted_frame_addr = 0; + if (prefetched_tstate && prefetched_tstate_addr == *current_tstate) { + memcpy(ts, prefetched_tstate, (size_t)unwinder->debug_offsets.thread_state.size); + if (prefetched_frame && prefetched_frame_addr != 0) { + have_prefetched_frame = 1; + prefetched_frame_for_ctx = prefetched_frame; + predicted_frame_addr = prefetched_frame_addr; + } + } + else if (unwinder->cache_frames) { + FrameCacheEntry *entry = frame_cache_find_by_tstate(unwinder, *current_tstate); + if (entry && entry->num_addrs > 0) { + predicted_frame_addr = entry->addrs[0]; + } + + int bytes_read = read_thread_state_and_maybe_frame( + unwinder, + *current_tstate, + (size_t)unwinder->debug_offsets.thread_state.size, + ts, + predicted_frame_addr, + local_prefetched_frame, + &have_prefetched_frame); + if (bytes_read < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); + goto error; + } + if (have_prefetched_frame) { + prefetched_frame_for_ctx = local_prefetched_frame; + } + } + else { + int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( + &unwinder->handle, + *current_tstate, + (size_t)unwinder->debug_offsets.thread_state.size, + ts); + if (bytes_read < 0) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); + goto error; + } } STATS_INC(unwinder, memory_reads); STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size); + if (have_prefetched_frame) { + STATS_INC(unwinder, memory_reads); + STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); + } long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id); @@ -432,9 +506,12 @@ unwind_stack_for_thread( uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; FrameWalkContext ctx = { .frame_addr = frame_addr, + 
.thread_state_addr = *current_tstate, .base_frame_addr = base_frame_addr, .gc_frame = gc_frame, .chunks = &chunks, + .prefetched_frame = have_prefetched_frame ? prefetched_frame_for_ctx : NULL, + .prefetched_frame_addr = predicted_frame_addr, .frame_info = frame_info, .frame_addrs = addrs, .num_addrs = 0, diff --git a/Python/remote_debug.h b/Python/remote_debug.h index be7fdb7cfa9428..7b2c4f3bcb8077 100644 --- a/Python/remote_debug.h +++ b/Python/remote_debug.h @@ -1333,6 +1333,49 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle, return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out); } +typedef struct { + uintptr_t remote_addr; + void *local_buf; + size_t size; +} _Py_RemoteReadSegment; + +#define _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS 4 + +// Batched read of multiple remote regions in a single syscall when supported. +// Returns total bytes read (>= 0) on success, -1 if batched reads are +// unavailable or the syscall failed. Callers compare the return value against +// cumulative segment sizes to determine which segments were fully populated. 
+UNUSED static Py_ssize_t +_Py_RemoteDebug_BatchedReadRemoteMemory( + proc_handle_t *handle, + const _Py_RemoteReadSegment *segments, + int nsegs) +{ +#if defined(__linux__) && HAVE_PROCESS_VM_READV + if (handle->memfd == -1 + && nsegs > 0 + && nsegs <= _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS) { + struct iovec local[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS]; + struct iovec remote[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS]; + for (int i = 0; i < nsegs; i++) { + local[i].iov_base = segments[i].local_buf; + local[i].iov_len = segments[i].size; + remote[i].iov_base = (void *)segments[i].remote_addr; + remote[i].iov_len = segments[i].size; + } + ssize_t nread = process_vm_readv(handle->pid, local, nsegs, remote, nsegs, 0); + if (nread >= 0) { + return (Py_ssize_t)nread; + } + } +#else + (void)handle; + (void)segments; + (void)nsegs; +#endif + return -1; +} + UNUSED static int _Py_RemoteDebug_ReadDebugOffsets( proc_handle_t *handle, From c69a0f361700db334d41615c9c6b647f5ef353c1 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 10 May 2026 19:00:33 +0100 Subject: [PATCH 4/9] gh-149584: Reuse profiler result objects Cache the last FrameInfo tuple per code object/instruction offset, reuse cached thread id objects, and append cached parent frames directly on full frame-cache hits. This cuts Python allocation churn in the steady-state profiler path. 
--- Modules/_remote_debugging/_remote_debugging.h | 3 +++ Modules/_remote_debugging/code_objects.c | 13 +++++++++++ Modules/_remote_debugging/frame_cache.c | 8 +++++++ Modules/_remote_debugging/frames.c | 22 +++++-------------- Modules/_remote_debugging/module.c | 1 + Modules/_remote_debugging/threads.c | 14 +++++++++--- 6 files changed, 41 insertions(+), 20 deletions(-) diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index df1f8cd1a12292..422fe9e09873e8 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -215,6 +215,8 @@ typedef struct { PyObject *file_name; int first_lineno; PyObject *linetable; // bytes + PyObject *last_frame_info; + ptrdiff_t last_addrq; uintptr_t addr_code_adaptive; } CachedCodeMetadata; @@ -227,6 +229,7 @@ typedef struct { uintptr_t thread_state_addr; uintptr_t addrs[FRAME_CACHE_MAX_FRAMES]; Py_ssize_t num_addrs; + PyObject *thread_id_obj; // owned reference, NULL if empty PyObject *frame_list; // owned reference, NULL if empty } FrameCacheEntry; diff --git a/Modules/_remote_debugging/code_objects.c b/Modules/_remote_debugging/code_objects.c index 7b95c0f2d4fa8d..2ac6edb3f662f6 100644 --- a/Modules/_remote_debugging/code_objects.c +++ b/Modules/_remote_debugging/code_objects.c @@ -405,6 +405,8 @@ parse_code_object(RemoteUnwinderObject *unwinder, meta->func_name = func; meta->file_name = file; meta->linetable = linetable; + meta->last_frame_info = NULL; + meta->last_addrq = -1; meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno); meta->addr_code_adaptive = real_address + (uintptr_t)unwinder->debug_offsets.code_object.co_code_adaptive; @@ -482,6 +484,12 @@ parse_code_object(RemoteUnwinderObject *unwinder, addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive; #endif ; // Empty statement to avoid C23 extension warning + + if (!unwinder->opcodes && meta->last_frame_info != 
NULL && meta->last_addrq == addrq) { + *result = Py_NewRef(meta->last_frame_info); + return 0; + } + LocationInfo info = {0}; bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable), PyBytes_GET_SIZE(meta->linetable), @@ -529,6 +537,11 @@ parse_code_object(RemoteUnwinderObject *unwinder, goto error; } + if (!unwinder->opcodes) { + Py_XSETREF(meta->last_frame_info, Py_NewRef(tuple)); + meta->last_addrq = addrq; + } + *result = tuple; return 0; diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c index d2ec63925680c6..19fc406bca9ac9 100644 --- a/Modules/_remote_debugging/frame_cache.c +++ b/Modules/_remote_debugging/frame_cache.c @@ -30,6 +30,7 @@ frame_cache_cleanup(RemoteUnwinderObject *unwinder) return; } for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) { + Py_CLEAR(unwinder->frame_cache[i].thread_id_obj); Py_CLEAR(unwinder->frame_cache[i].frame_list); } PyMem_Free(unwinder->frame_cache); @@ -142,6 +143,7 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result) } if (!found) { // Clear this entry + Py_CLEAR(unwinder->frame_cache[i].thread_id_obj); Py_CLEAR(unwinder->frame_cache[i].frame_list); unwinder->frame_cache[i].thread_id = 0; unwinder->frame_cache[i].thread_state_addr = 0; @@ -275,6 +277,12 @@ frame_cache_store( } entry->thread_id = thread_id; entry->thread_state_addr = thread_state_addr; + if (entry->thread_id_obj == NULL) { + entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id); + if (entry->thread_id_obj == NULL) { + return -1; + } + } memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t)); entry->num_addrs = num_addrs; assert(entry->num_addrs == num_addrs); diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index 3a7e44f8075acc..1eafd3588db50b 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -543,35 +543,23 @@ try_full_cache_hit( return -1; } - Py_ssize_t cached_size = 
PyList_GET_SIZE(entry->frame_list); - PyObject *parent_slice = NULL; - if (cached_size > 1) { - parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size); - if (!parent_slice) { - Py_XDECREF(current_frame); - return -1; - } - } - if (current_frame != NULL) { if (PyList_Append(ctx->frame_info, current_frame) < 0) { Py_DECREF(current_frame); - Py_XDECREF(parent_slice); return -1; } Py_DECREF(current_frame); STATS_ADD(unwinder, frames_read_from_memory, 1); } - if (parent_slice) { - Py_ssize_t cur_size = PyList_GET_SIZE(ctx->frame_info); - int result = PyList_SetSlice(ctx->frame_info, cur_size, cur_size, parent_slice); - Py_DECREF(parent_slice); - if (result < 0) { + Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list); + for (Py_ssize_t i = 1; i < cached_size; i++) { + PyObject *cached_frame = PyList_GET_ITEM(entry->frame_list, i); + if (PyList_Append(ctx->frame_info, cached_frame) < 0) { return -1; } - STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1); } + STATS_ADD(unwinder, frames_read_from_cache, cached_size > 1 ? 
cached_size - 1 : 0); STATS_INC(unwinder, frame_cache_hits); return 1; diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 170fa8aa069c2b..25928b658fd147 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -166,6 +166,7 @@ cached_code_metadata_destroy(void *ptr) Py_DECREF(meta->func_name); Py_DECREF(meta->file_name); Py_DECREF(meta->linetable); + Py_XDECREF(meta->last_frame_info); PyMem_RawFree(meta); } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index fa89cf6406736d..3e3164094480ea 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -546,10 +546,18 @@ unwind_stack_for_thread( *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next); - thread_id = PyLong_FromLongLong(tid); + if (unwinder->cache_frames) { + FrameCacheEntry *entry = frame_cache_find(unwinder, (uint64_t)tid); + if (entry && entry->thread_id_obj) { + thread_id = Py_NewRef(entry->thread_id_obj); + } + } if (thread_id == NULL) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID"); - goto error; + thread_id = PyLong_FromLongLong(tid); + if (thread_id == NULL) { + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID"); + goto error; + } } RemoteDebuggingState *state = RemoteDebugging_GetStateFromObject((PyObject*)unwinder); From 7a85c9a720d4687cf97ccd9f4c36bb6d5cd95a53 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 10 May 2026 19:27:03 +0100 Subject: [PATCH 5/9] gh-149584: Add NEWS for Tachyon profiler overhead fix --- .../Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst diff --git a/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst 
b/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst new file mode 100644 index 00000000000000..6734250fdd6af3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst @@ -0,0 +1,4 @@ +Fix excessive overhead in the Tachyon profiler when inspecting a remote +process by avoiding repeated remote page-cache scans, batching predicted +remote reads, and reusing cached profiler result objects. Patch by Pablo +Galindo and Maurycy Pawłowski-Wieroński. From 46a0b2cd48a690492e2fca0e94367113ed8de69a Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Wed, 13 May 2026 00:53:12 +0100 Subject: [PATCH 6/9] Address review feedback --- Lib/profiling/sampling/sample.py | 31 +++ Lib/test/test_external_inspection.py | 7 + Modules/_remote_debugging/_remote_debugging.h | 72 +++++- Modules/_remote_debugging/clinic/module.c.h | 9 +- Modules/_remote_debugging/frames.c | 8 +- Modules/_remote_debugging/module.c | 233 +++++++++++++----- Modules/_remote_debugging/threads.c | 36 ++- 7 files changed, 299 insertions(+), 97 deletions(-) diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py index 5bbe2483581333..980d057cf49dab 100644 --- a/Lib/profiling/sampling/sample.py +++ b/Lib/profiling/sampling/sample.py @@ -296,6 +296,37 @@ def _print_unwinder_stats(self): print(f" Hits: {code_hits:n} ({ANSIColors.GREEN}{fmt(code_hits_pct)}%{ANSIColors.RESET})") print(f" Misses: {code_misses:n} ({ANSIColors.RED}{fmt(code_misses_pct)}%{ANSIColors.RESET})") + # Batched remote read stats + batched_attempts = stats.get('batched_read_attempts', 0) + batched_successes = stats.get('batched_read_successes', 0) + batched_misses = stats.get('batched_read_misses', 0) + segments_requested = stats.get('batched_read_segments_requested', 0) + segments_completed = stats.get('batched_read_segments_completed', 0) + if batched_attempts > 0 or segments_requested > 0: + batched_success_rate = stats.get('batched_read_success_rate', 0.0) + 
batched_miss_rate = ( + (batched_misses / batched_attempts * 100) + if batched_attempts > 0 else 0 + ) + segment_completion_rate = stats.get( + 'batched_read_segment_completion_rate', 0.0 + ) + + print(f" {ANSIColors.CYAN}Batched Reads:{ANSIColors.RESET}") + print(f" Attempts: {batched_attempts:n}") + print( + f" Successes: {batched_successes:n} " + f"({ANSIColors.GREEN}{fmt(batched_success_rate)}%{ANSIColors.RESET})" + ) + print( + f" Misses: {batched_misses:n} " + f"({ANSIColors.RED}{fmt(batched_miss_rate)}%{ANSIColors.RESET})" + ) + print( + f" Segments read: {segments_completed:n}/{segments_requested:n} " + f"({ANSIColors.GREEN}{fmt(segment_completion_rate)}%{ANSIColors.RESET})" + ) + # Memory operations memory_reads = stats.get('memory_reads', 0) memory_bytes = stats.get('memory_bytes_read', 0) diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index a29e6cdbbf6c78..6b1529aa173f01 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -3767,6 +3767,13 @@ def test_get_stats(self): "frames_read_from_cache", "frames_read_from_memory", "frame_cache_hit_rate", + "batched_read_attempts", + "batched_read_successes", + "batched_read_misses", + "batched_read_segments_requested", + "batched_read_segments_completed", + "batched_read_success_rate", + "batched_read_segment_completion_rate", ] for key in expected_keys: self.assertIn(key, stats) diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 422fe9e09873e8..2140765ce288d8 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -233,6 +233,23 @@ typedef struct { PyObject *frame_list; // owned reference, NULL if empty } FrameCacheEntry; +#define INTERPRETER_THREAD_CACHE_SIZE 32 +#if (INTERPRETER_THREAD_CACHE_SIZE & (INTERPRETER_THREAD_CACHE_SIZE - 1)) != 0 +# error "INTERPRETER_THREAD_CACHE_SIZE must be a power of two" +#endif 
+ +typedef struct { + uintptr_t interpreter_addr; + uintptr_t thread_state_addr; +} InterpreterThreadCacheEntry; + +typedef struct { + const char *tstate; + uintptr_t tstate_addr; + const char *frame; + uintptr_t frame_addr; +} RemoteReadPrefetch; + /* Statistics for profiling performance analysis */ typedef struct { uint64_t total_samples; // Total number of get_stack_trace calls @@ -246,6 +263,11 @@ typedef struct { uint64_t code_object_cache_hits; // Code object cache hits uint64_t code_object_cache_misses; // Code object cache misses uint64_t stale_cache_invalidations; // Times stale entries were cleared + uint64_t batched_read_attempts; // Batched remote-read attempts + uint64_t batched_read_successes; // Attempts that read all requested segments + uint64_t batched_read_misses; // Attempts that fell back or partially read + uint64_t batched_read_segments_requested; // Segments requested by batched reads + uint64_t batched_read_segments_completed; // Segments completed by batched reads } UnwinderStats; /* Stats tracking macros - no-op when stats collection is disabled */ @@ -255,6 +277,46 @@ typedef struct { #define STATS_ADD(unwinder, field, val) \ do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0) +#define STATS_BATCHED_READ(unwinder, requested, completed) \ + do { \ + if ((unwinder)->collect_stats) { \ + (unwinder)->stats.batched_read_attempts++; \ + (unwinder)->stats.batched_read_segments_requested += (uint64_t)(requested); \ + (unwinder)->stats.batched_read_segments_completed += (uint64_t)(completed); \ + if ((completed) == (requested)) { \ + (unwinder)->stats.batched_read_successes++; \ + } \ + else { \ + (unwinder)->stats.batched_read_misses++; \ + } \ + } \ + } while(0) + +static inline int +_Py_RemoteDebug_CountCompletedSegments( + const _Py_RemoteReadSegment *segments, + int nsegs, + Py_ssize_t nread) +{ + if (nread < 0) { + return 0; + } + + int completed = 0; + Py_ssize_t bytes_needed = 0; + for (int i = 0; i < nsegs; 
i++) { + if (segments[i].size > (size_t)(PY_SSIZE_T_MAX - bytes_needed)) { + break; + } + bytes_needed += (Py_ssize_t)segments[i].size; + if (nread < bytes_needed) { + break; + } + completed++; + } + return completed; +} + typedef struct { PyTypeObject *RemoteDebugging_Type; PyTypeObject *TaskInfo_Type; @@ -306,7 +368,7 @@ typedef struct { int cache_frames; int collect_stats; // whether to collect statistics uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale - uintptr_t cached_tstate_addr; // predicted first thread for batched reads + InterpreterThreadCacheEntry cached_tstates[INTERPRETER_THREAD_CACHE_SIZE]; RemoteDebuggingState *cached_state; FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries UnwinderStats stats; // statistics for performance analysis @@ -372,8 +434,7 @@ typedef struct { uintptr_t last_profiled_frame; // Last cached frame (0 if no cache) StackChunkList *chunks; // Pre-copied stack chunks int skip_first_frame; // Skip frame_addr itself (continue from its caller) - const char *prefetched_frame; // Optional already-read frame buffer - uintptr_t prefetched_frame_addr; // Remote address for prefetched_frame + RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers /* Outputs */ PyObject *frame_info; // List to append FrameInfo objects @@ -616,10 +677,7 @@ extern PyObject* unwind_stack_for_thread( uintptr_t gil_holder_tstate, uintptr_t gc_frame, uintptr_t main_thread_tstate, - const char *prefetched_tstate, - uintptr_t prefetched_tstate_addr, - const char *prefetched_frame, - uintptr_t prefetched_frame_addr + const RemoteReadPrefetch *prefetch ); /* Thread stopping functions (for blocking mode) */ diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h index d56622fb82ab56..78b1d3e8d80962 100644 --- a/Modules/_remote_debugging/clinic/module.c.h +++ b/Modules/_remote_debugging/clinic/module.c.h @@ -411,8 +411,15 
@@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__, " - code_object_cache_hits: Code object cache hits\n" " - code_object_cache_misses: Code object cache misses\n" " - stale_cache_invalidations: Times stale cache entries were cleared\n" +" - batched_read_attempts: Batched remote-read attempts\n" +" - batched_read_successes: Attempts that read all requested segments\n" +" - batched_read_misses: Attempts that fell back or partially read\n" +" - batched_read_segments_requested: Segments requested by batched reads\n" +" - batched_read_segments_completed: Segments completed by batched reads\n" " - frame_cache_hit_rate: Percentage of samples that hit the cache\n" " - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n" +" - batched_read_success_rate: Percentage of batched reads that completed all segments\n" +" - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads\n" "\n" "Raises:\n" " RuntimeError: If stats collection was not enabled (stats=False)"); @@ -1540,4 +1547,4 @@ _remote_debugging_get_gc_stats(PyObject *module, PyObject *const *args, Py_ssize exit: return return_value; } -/*[clinic end generated code: output=5e2a29746a0c5d65 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=884914b100e9c90c input=a9049054013a1b77]*/ diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c index 1eafd3588db50b..8d8019396b3e31 100644 --- a/Modules/_remote_debugging/frames.c +++ b/Modules/_remote_debugging/frames.c @@ -332,9 +332,9 @@ process_frame_chain( { uintptr_t address_of_code_object = 0; int parse_result; - if (ctx->prefetched_frame && ctx->prefetched_frame_addr == frame_addr) { + if (ctx->prefetch.frame && ctx->prefetch.frame_addr == frame_addr) { parse_result = parse_frame_buffer( - unwinder, &frame, ctx->prefetched_frame, + unwinder, &frame, ctx->prefetch.frame, &address_of_code_object, &next_frame_addr); } else { @@ -530,9 +530,9 @@ 
try_full_cache_hit( uintptr_t code_object_addr = 0; uintptr_t previous_frame = 0; int parse_result; - if (ctx->prefetched_frame && ctx->prefetched_frame_addr == ctx->frame_addr) { + if (ctx->prefetch.frame && ctx->prefetch.frame_addr == ctx->frame_addr) { parse_result = parse_frame_buffer(unwinder, &current_frame, - ctx->prefetched_frame, + ctx->prefetch.frame, &code_object_addr, &previous_frame); } else { diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 25928b658fd147..a7b94defa9890e 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -361,7 +361,7 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, self->cache_frames = cache_frames; self->collect_stats = stats; self->stale_invalidation_counter = 0; - self->cached_tstate_addr = 0; + memset(self->cached_tstates, 0, sizeof(self->cached_tstates)); self->debug = debug; self->only_active_thread = only_active_thread; self->mode = mode; @@ -475,36 +475,125 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, return 0; } +static size_t +interpreter_thread_cache_index(uintptr_t interpreter_addr) +{ + // The interpreter ID lives in PyInterpreterState, which is the state we are + // trying to prefetch. At this point the cheap stable key is the remote + // interpreter address, so use it for a small direct-mapped prediction cache. + // The full address is stored in each entry and checked on lookup, so hash + // collisions are detected as misses; storing a colliding entry only replaces + // the previous prediction and cannot return the wrong thread state.
+ return ((interpreter_addr >> 4) ^ (interpreter_addr >> 12)) + & (INTERPRETER_THREAD_CACHE_SIZE - 1); +} + +static uintptr_t +get_cached_tstate_for_interpreter( + RemoteUnwinderObject *self, + uintptr_t interpreter_addr) +{ + if (interpreter_addr == 0) { + return 0; + } + + InterpreterThreadCacheEntry *entry = + &self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)]; + if (entry->interpreter_addr == interpreter_addr) { + return entry->thread_state_addr; + } + return 0; +} + +static void +set_cached_tstate_for_interpreter( + RemoteUnwinderObject *self, + uintptr_t interpreter_addr, + uintptr_t thread_state_addr) +{ + if (interpreter_addr == 0 || thread_state_addr == 0) { + return; + } + + InterpreterThreadCacheEntry *entry = + &self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)]; + entry->interpreter_addr = interpreter_addr; + entry->thread_state_addr = thread_state_addr; +} + +static void +refresh_generation_caches_from_interp_state( + RemoteUnwinderObject *self, + const char *interp_state_buffer) +{ + uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer, + self->debug_offsets.interpreter_state.code_object_generation); + + if (code_object_generation != self->code_object_generation) { + self->code_object_generation = code_object_generation; + _Py_hashtable_clear(self->code_object_cache); + } + +#ifdef Py_GIL_DISABLED + uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer, + self->debug_offsets.interpreter_state.tlbc_generation); + if (current_tlbc_generation != self->tlbc_generation) { + self->tlbc_generation = current_tlbc_generation; + _Py_hashtable_clear(self->tlbc_cache); + } +#endif +} + +static int +refresh_generation_caches_for_interpreter( + RemoteUnwinderObject *self, + uintptr_t interpreter_addr) +{ + char interp_state_buffer[INTERP_STATE_BUFFER_SIZE]; + if (_Py_RemoteDebug_ReadRemoteMemory( + &self->handle, + interpreter_addr, + INTERP_STATE_BUFFER_SIZE, + 
interp_state_buffer) < 0) { + set_exception_cause(self, PyExc_RuntimeError, + "Failed to read interpreter state buffer"); + return -1; + } + refresh_generation_caches_from_interp_state(self, interp_state_buffer); + return 0; +} + static int read_interp_state_and_maybe_thread_frame( RemoteUnwinderObject *unwinder, uintptr_t interpreter_addr, char *interp_state_buffer, - uintptr_t predicted_tstate_addr, char *tstate_buffer, - int *tstate_read, - uintptr_t predicted_frame_addr, char *frame_buffer, - int *frame_read) + RemoteReadPrefetch *prefetch) { - *tstate_read = 0; - *frame_read = 0; - if (predicted_tstate_addr != 0) { + prefetch->tstate = NULL; + prefetch->frame = NULL; + if (prefetch->tstate_addr != 0) { size_t tstate_size = (size_t)unwinder->debug_offsets.thread_state.size; _Py_RemoteReadSegment segments[3] = { {interpreter_addr, interp_state_buffer, INTERP_STATE_BUFFER_SIZE}, - {predicted_tstate_addr, tstate_buffer, tstate_size}, - {predicted_frame_addr, frame_buffer, SIZEOF_INTERP_FRAME}, + {prefetch->tstate_addr, tstate_buffer, tstate_size}, + {prefetch->frame_addr, frame_buffer, SIZEOF_INTERP_FRAME}, }; - int nsegs = predicted_frame_addr != 0 ? 3 : 2; + int nsegs = prefetch->frame_addr != 0 ? 
3 : 2; Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory( &unwinder->handle, segments, nsegs); - if (nread >= (Py_ssize_t)INTERP_STATE_BUFFER_SIZE) { - Py_ssize_t with_tstate = (Py_ssize_t)INTERP_STATE_BUFFER_SIZE - + (Py_ssize_t)tstate_size; - *tstate_read = nread >= with_tstate; - *frame_read = nsegs == 3 - && nread == with_tstate + (Py_ssize_t)SIZEOF_INTERP_FRAME; + int completed = _Py_RemoteDebug_CountCompletedSegments( + segments, nsegs, nread); + STATS_BATCHED_READ(unwinder, nsegs, completed); + if (completed >= 1) { + if (completed >= 2) { + prefetch->tstate = tstate_buffer; + } + if (completed >= 3) { + prefetch->frame = frame_buffer; + } return 0; } } @@ -581,14 +670,15 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self char interp_state_buffer[INTERP_STATE_BUFFER_SIZE]; char prefetched_tstate[SIZEOF_THREAD_STATE]; char prefetched_frame[SIZEOF_INTERP_FRAME]; - int have_prefetched_tstate = 0; - int have_prefetched_frame = 0; - uintptr_t predicted_tstate_addr = self->cache_frames ? 
self->cached_tstate_addr : 0; - uintptr_t predicted_frame_addr = 0; - if (predicted_tstate_addr != 0) { - FrameCacheEntry *entry = frame_cache_find_by_tstate(self, predicted_tstate_addr); + RemoteReadPrefetch prefetch = {0}; + if (self->cache_frames) { + prefetch.tstate_addr = get_cached_tstate_for_interpreter( + self, current_interpreter); + } + if (prefetch.tstate_addr != 0) { + FrameCacheEntry *entry = frame_cache_find_by_tstate(self, prefetch.tstate_addr); if (entry && entry->num_addrs > 0) { - predicted_frame_addr = entry->addrs[0]; + prefetch.frame_addr = entry->addrs[0]; } } @@ -596,16 +686,14 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self self, current_interpreter, interp_state_buffer, - predicted_tstate_addr, prefetched_tstate, - &have_prefetched_tstate, - predicted_frame_addr, prefetched_frame, - &have_prefetched_frame) < 0) { + &prefetch) < 0) { set_exception_cause(self, PyExc_RuntimeError, "Failed to read interpreter state buffer"); Py_CLEAR(result); goto exit; } + refresh_generation_caches_from_interp_state(self, interp_state_buffer); uintptr_t gc_frame = 0; if (self->gc) { @@ -617,25 +705,6 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self int64_t interpreter_id = GET_MEMBER(int64_t, interp_state_buffer, self->debug_offsets.interpreter_state.id); - // Get code object generation from buffer - uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer, - self->debug_offsets.interpreter_state.code_object_generation); - - if (code_object_generation != self->code_object_generation) { - self->code_object_generation = code_object_generation; - _Py_hashtable_clear(self->code_object_cache); - } - -#ifdef Py_GIL_DISABLED - // Check TLBC generation and invalidate cache if needed - uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer, - self->debug_offsets.interpreter_state.tlbc_generation); - if (current_tlbc_generation != self->tlbc_generation) { - 
self->tlbc_generation = current_tlbc_generation; - _Py_hashtable_clear(self->tlbc_cache); - } -#endif - // Create a list to hold threads for this interpreter PyObject *interpreter_threads = PyList_New(0); if (!interpreter_threads) { @@ -672,7 +741,7 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self current_tstate = self->tstate_addr; } if (current_tstate != 0) { - self->cached_tstate_addr = current_tstate; + set_cached_tstate_for_interpreter(self, current_interpreter, current_tstate); } // Acquire main thread state information @@ -685,10 +754,7 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self gil_holder_tstate, gc_frame, main_thread_tstate, - have_prefetched_tstate ? prefetched_tstate : NULL, - predicted_tstate_addr, - have_prefetched_frame ? prefetched_frame : NULL, - predicted_frame_addr); + &prefetch); if (!frame_info) { // Check if this was an intentional skip due to mode-based filtering if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL || @@ -838,6 +904,9 @@ _remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *s if (ensure_async_debug_offsets(self) < 0) { return NULL; } + if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) { + return NULL; + } PyObject *result = PyList_New(0); if (result == NULL) { @@ -927,6 +996,9 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject if (ensure_async_debug_offsets(self) < 0) { return NULL; } + if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) { + return NULL; + } PyObject *result = PyList_New(0); if (result == NULL) { @@ -971,8 +1043,15 @@ RemoteUnwinder was created with stats=True. 
- code_object_cache_hits: Code object cache hits - code_object_cache_misses: Code object cache misses - stale_cache_invalidations: Times stale cache entries were cleared + - batched_read_attempts: Batched remote-read attempts + - batched_read_successes: Attempts that read all requested segments + - batched_read_misses: Attempts that fell back or partially read + - batched_read_segments_requested: Segments requested by batched reads + - batched_read_segments_completed: Segments completed by batched reads - frame_cache_hit_rate: Percentage of samples that hit the cache - code_object_cache_hit_rate: Percentage of code object lookups that hit cache + - batched_read_success_rate: Percentage of batched reads that completed all segments + - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads Raises: RuntimeError: If stats collection was not enabled (stats=False) @@ -980,7 +1059,7 @@ RemoteUnwinder was created with stats=True. static PyObject * _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self) -/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/ +/*[clinic end generated code: output=21e36477122be2a0 input=0392d62b278e9c35]*/ { if (!self->collect_stats) { PyErr_SetString(PyExc_RuntimeError, @@ -1015,9 +1094,24 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self) ADD_STAT(code_object_cache_hits); ADD_STAT(code_object_cache_misses); ADD_STAT(stale_cache_invalidations); + ADD_STAT(batched_read_attempts); + ADD_STAT(batched_read_successes); + ADD_STAT(batched_read_misses); + ADD_STAT(batched_read_segments_requested); + ADD_STAT(batched_read_segments_completed); #undef ADD_STAT +#define ADD_DERIVED_STAT(name, value) do { \ + PyObject *val = PyFloat_FromDouble(value); \ + if (!val || PyDict_SetItemString(result, name, val) < 0) { \ + Py_XDECREF(val); \ + Py_DECREF(result); \ + return NULL; \ + } \ + Py_DECREF(val); \ +} while(0) + // Calculate and add derived 
statistics // Hit rate is calculated as (hits + partial_hits) / total_cache_lookups double frame_cache_hit_rate = 0.0; @@ -1026,26 +1120,33 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self) frame_cache_hit_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits) / (double)total_cache_lookups; } - PyObject *hit_rate = PyFloat_FromDouble(frame_cache_hit_rate); - if (!hit_rate || PyDict_SetItemString(result, "frame_cache_hit_rate", hit_rate) < 0) { - Py_XDECREF(hit_rate); - Py_DECREF(result); - return NULL; - } - Py_DECREF(hit_rate); + ADD_DERIVED_STAT("frame_cache_hit_rate", frame_cache_hit_rate); double code_object_hit_rate = 0.0; uint64_t total_code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses; if (total_code_lookups > 0) { code_object_hit_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)total_code_lookups; } - PyObject *code_hit_rate = PyFloat_FromDouble(code_object_hit_rate); - if (!code_hit_rate || PyDict_SetItemString(result, "code_object_cache_hit_rate", code_hit_rate) < 0) { - Py_XDECREF(code_hit_rate); - Py_DECREF(result); - return NULL; + ADD_DERIVED_STAT("code_object_cache_hit_rate", code_object_hit_rate); + + double batched_read_success_rate = 0.0; + if (self->stats.batched_read_attempts > 0) { + batched_read_success_rate = + 100.0 * (double)self->stats.batched_read_successes + / (double)self->stats.batched_read_attempts; } - Py_DECREF(code_hit_rate); + ADD_DERIVED_STAT("batched_read_success_rate", batched_read_success_rate); + + double batched_read_segment_completion_rate = 0.0; + if (self->stats.batched_read_segments_requested > 0) { + batched_read_segment_completion_rate = + 100.0 * (double)self->stats.batched_read_segments_completed + / (double)self->stats.batched_read_segments_requested; + } + ADD_DERIVED_STAT("batched_read_segment_completion_rate", + batched_read_segment_completion_rate); + +#undef ADD_DERIVED_STAT return 
result; } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 3e3164094480ea..24a9fd2918a0b2 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -307,8 +307,10 @@ read_thread_state_and_maybe_frame( }; Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory( &unwinder->handle, segments, 2); - if (nread >= (Py_ssize_t)tstate_size) { - *frame_read = nread == (Py_ssize_t)(tstate_size + SIZEOF_INTERP_FRAME); + int completed = _Py_RemoteDebug_CountCompletedSegments(segments, 2, nread); + STATS_BATCHED_READ(unwinder, 2, completed); + if (completed >= 1) { + *frame_read = completed == 2; return 0; } } @@ -323,10 +325,7 @@ unwind_stack_for_thread( uintptr_t gil_holder_tstate, uintptr_t gc_frame, uintptr_t main_thread_tstate, - const char *prefetched_tstate, - uintptr_t prefetched_tstate_addr, - const char *prefetched_frame, - uintptr_t prefetched_frame_addr + const RemoteReadPrefetch *prefetch ) { PyObject *frame_info = NULL; PyObject *thread_id = NULL; @@ -335,18 +334,17 @@ unwind_stack_for_thread( char ts[SIZEOF_THREAD_STATE]; char local_prefetched_frame[SIZEOF_INTERP_FRAME]; - const char *prefetched_frame_for_ctx = NULL; - int have_prefetched_frame = 0; - uintptr_t predicted_frame_addr = 0; - if (prefetched_tstate && prefetched_tstate_addr == *current_tstate) { - memcpy(ts, prefetched_tstate, (size_t)unwinder->debug_offsets.thread_state.size); - if (prefetched_frame && prefetched_frame_addr != 0) { - have_prefetched_frame = 1; - prefetched_frame_for_ctx = prefetched_frame; - predicted_frame_addr = prefetched_frame_addr; + RemoteReadPrefetch ctx_prefetch = {0}; + if (prefetch && prefetch->tstate && prefetch->tstate_addr == *current_tstate) { + memcpy(ts, prefetch->tstate, (size_t)unwinder->debug_offsets.thread_state.size); + if (prefetch->frame && prefetch->frame_addr != 0) { + ctx_prefetch.frame = prefetch->frame; + ctx_prefetch.frame_addr = prefetch->frame_addr; } } else if 
(unwinder->cache_frames) { + uintptr_t predicted_frame_addr = 0; + int have_prefetched_frame = 0; FrameCacheEntry *entry = frame_cache_find_by_tstate(unwinder, *current_tstate); if (entry && entry->num_addrs > 0) { predicted_frame_addr = entry->addrs[0]; @@ -365,7 +363,8 @@ unwind_stack_for_thread( goto error; } if (have_prefetched_frame) { - prefetched_frame_for_ctx = local_prefetched_frame; + ctx_prefetch.frame = local_prefetched_frame; + ctx_prefetch.frame_addr = predicted_frame_addr; } } else { @@ -381,7 +380,7 @@ unwind_stack_for_thread( } STATS_INC(unwinder, memory_reads); STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size); - if (have_prefetched_frame) { + if (ctx_prefetch.frame) { STATS_INC(unwinder, memory_reads); STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME); } @@ -510,8 +509,7 @@ unwind_stack_for_thread( .base_frame_addr = base_frame_addr, .gc_frame = gc_frame, .chunks = &chunks, - .prefetched_frame = have_prefetched_frame ? prefetched_frame_for_ctx : NULL, - .prefetched_frame_addr = predicted_frame_addr, + .prefetch = ctx_prefetch, .frame_info = frame_info, .frame_addrs = addrs, .num_addrs = 0, From f7fe3beda1d911455fe36ec50901a7e745219548 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Wed, 13 May 2026 00:53:13 +0100 Subject: [PATCH 7/9] Add async benchmark modes --- .../benchmark_external_inspection.py | 100 ++++++++++++++++-- 1 file changed, 91 insertions(+), 9 deletions(-) diff --git a/Tools/inspection/benchmark_external_inspection.py b/Tools/inspection/benchmark_external_inspection.py index fee3435496da0b..8e367422a961da 100644 --- a/Tools/inspection/benchmark_external_inspection.py +++ b/Tools/inspection/benchmark_external_inspection.py @@ -151,6 +151,45 @@ def create_threads(n): time.sleep(0.05) ''' +ASYNC_CODE = '''\ +import asyncio +import contextlib +import math + +def compute_slice(seed): + result = 0.0 + for i in range(2000): + result += math.sin(seed + i) * math.sqrt(i + 1) + return 
result + +async def leaf_task(seed): + total = 0.0 + while True: + total += compute_slice(seed) + await asyncio.sleep(0) + +async def parent_task(seed): + child = asyncio.create_task(leaf_task(seed + 1000), name=f"leaf-{seed}") + try: + while True: + compute_slice(seed) + await asyncio.sleep(0.001) + finally: + child.cancel() + with contextlib.suppress(asyncio.CancelledError): + await child + +async def main(): + tasks = [ + asyncio.create_task(parent_task(i), name=f"parent-{i}") + for i in range(8) + ] + await asyncio.gather(*tasks) + +if __name__ == "__main__": + asyncio.run(main()) +''' + CODE_EXAMPLES = { "basic": { "code": CODE, @@ -164,10 +203,29 @@ def create_threads(n): "code": CODE_WITH_TONS_OF_THREADS, "description": "Tons of threads doing mixed CPU/IO work", }, + "asyncio": { + "code": ASYNC_CODE, + "description": "Asyncio tasks with active and awaited coroutine chains", + }, +} + +OPERATIONS = { + "stack_trace": { + "method": "get_stack_trace", + "label": "get_stack_trace()", + }, + "async_stack_trace": { + "method": "get_async_stack_trace", + "label": "get_async_stack_trace()", + }, + "all_awaited_by": { + "method": "get_all_awaited_by", + "label": "get_all_awaited_by()", + }, } -def benchmark(unwinder, duration_seconds=10, blocking=False): +def benchmark(unwinder, duration_seconds=10, blocking=False, operation="stack_trace"): """Benchmark mode - measure raw sampling speed for specified duration""" sample_count = 0 fail_count = 0 @@ -175,11 +233,14 @@ def benchmark(unwinder, duration_seconds=10, blocking=False): start_time = time.perf_counter() end_time = start_time + duration_seconds total_attempts = 0 + operation_info = OPERATIONS[operation] + operation_method = getattr(unwinder, operation_info["method"]) colors = get_colors(can_colorize()) print( - f"{colors.BOLD_BLUE}Benchmarking sampling speed for {duration_seconds} seconds...{colors.RESET}" + f"{colors.BOLD_BLUE}Benchmarking {operation_info['label']} speed " + f"for {duration_seconds} 
seconds...{colors.RESET}" ) try: @@ -190,8 +251,8 @@ def benchmark(unwinder, duration_seconds=10, blocking=False): if blocking: unwinder.pause_threads() try: - stack_trace = unwinder.get_stack_trace() - if stack_trace: + sample = operation_method() + if sample: sample_count += 1 finally: if blocking: @@ -239,6 +300,7 @@ def benchmark(unwinder, duration_seconds=10, blocking=False): (sample_count / total_attempts) * 100 if total_attempts > 0 else 0 ), "total_work_time": total_work_time, + "operation": operation_info["label"], "avg_work_time_us": ( (total_work_time / total_attempts) * 1e6 if total_attempts > 0 else 0 ), @@ -252,7 +314,7 @@ def print_benchmark_results(results): colors = get_colors(can_colorize()) print(f"\n{colors.BOLD_GREEN}{'='*60}{colors.RESET}") - print(f"{colors.BOLD_GREEN}get_stack_trace() Benchmark Results{colors.RESET}") + print(f"{colors.BOLD_GREEN}{results['operation']} Benchmark Results{colors.RESET}") print(f"{colors.BOLD_GREEN}{'='*60}{colors.RESET}") # Basic statistics @@ -329,6 +391,8 @@ def parse_arguments(): %(prog)s -d 60 # Run basic benchmark for 60 seconds %(prog)s --code deep_static # Run deep static call stack benchmark %(prog)s --code deep_static -d 30 # Run deep static benchmark for 30 seconds + %(prog)s --operation async_stack_trace + %(prog)s --operation all_awaited_by Available code examples: {examples_desc} @@ -348,8 +412,15 @@ def parse_arguments(): "--code", "-c", choices=list(CODE_EXAMPLES.keys()), - default="basic", - help="Code example to benchmark (default: basic)", + default=None, + help="Code example to benchmark (default: basic, or asyncio for async operations)", + ) + + parser.add_argument( + "--operation", + choices=list(OPERATIONS.keys()), + default="stack_trace", + help="Remote unwinder operation to benchmark (default: stack_trace)", ) parser.add_argument( @@ -365,7 +436,10 @@ def parse_arguments(): help="Stop all threads before sampling for consistent snapshots", ) - return parser.parse_args() + args = 
parser.parse_args() + if args.code is None: + args.code = "asyncio" if args.operation != "stack_trace" else "basic" + return args def create_target_process(temp_file, code_example="basic"): @@ -420,6 +494,9 @@ def main(): print( f"{colors.CYAN}Benchmark Duration:{colors.RESET} {colors.YELLOW}{args.duration}{colors.RESET} seconds" ) + print( + f"{colors.CYAN}Operation:{colors.RESET} {colors.GREEN}{OPERATIONS[args.operation]['label']}{colors.RESET}" + ) print( f"{colors.CYAN}Blocking Mode:{colors.RESET} {colors.GREEN if args.blocking else colors.YELLOW}{'enabled' if args.blocking else 'disabled'}{colors.RESET}" ) @@ -451,7 +528,12 @@ def main(): unwinder = _remote_debugging.RemoteUnwinder( process.pid, cache_frames=True, **kwargs ) - results = benchmark(unwinder, duration_seconds=args.duration, blocking=args.blocking) + results = benchmark( + unwinder, + duration_seconds=args.duration, + blocking=args.blocking, + operation=args.operation, + ) finally: cleanup_process(process, temp_file_path) From 299518714a0eb33bf9621a785ba402a09464ec66 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Wed, 13 May 2026 00:59:07 +0100 Subject: [PATCH 8/9] cleanp --- Modules/_remote_debugging/_remote_debugging.h | 42 ++++++------------- Modules/_remote_debugging/module.c | 37 ++++++++++++---- Modules/_remote_debugging/threads.c | 12 ++++-- 3 files changed, 50 insertions(+), 41 deletions(-) diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 2140765ce288d8..4d33d4f976edc1 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -30,6 +30,7 @@ extern "C" { #include "internal/pycore_llist.h" // struct llist_node #include "internal/pycore_long.h" // _PyLong_GetZero #include "internal/pycore_pyerrors.h" // _PyErr_FormatFromCause +#include "internal/pycore_pyhash.h" // _Py_HashPointerRaw #include "internal/pycore_stackref.h" // Py_TAG_BITS #include 
"../../Python/remote_debug.h" @@ -270,16 +271,22 @@ typedef struct { uint64_t batched_read_segments_completed; // Segments completed by batched reads } UnwinderStats; +#if defined(__GNUC__) || defined(__clang__) +# define REMOTE_DEBUG_UNLIKELY(value) __builtin_expect(!!(value), 0) +#else +# define REMOTE_DEBUG_UNLIKELY(value) (value) +#endif + /* Stats tracking macros - no-op when stats collection is disabled */ #define STATS_INC(unwinder, field) \ - do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0) + do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field++; } while(0) #define STATS_ADD(unwinder, field, val) \ - do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0) + do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field += (val); } while(0) #define STATS_BATCHED_READ(unwinder, requested, completed) \ do { \ - if ((unwinder)->collect_stats) { \ + if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) { \ (unwinder)->stats.batched_read_attempts++; \ (unwinder)->stats.batched_read_segments_requested += (uint64_t)(requested); \ (unwinder)->stats.batched_read_segments_completed += (uint64_t)(completed); \ @@ -292,31 +299,6 @@ typedef struct { } \ } while(0) -static inline int -_Py_RemoteDebug_CountCompletedSegments( - const _Py_RemoteReadSegment *segments, - int nsegs, - Py_ssize_t nread) -{ - if (nread < 0) { - return 0; - } - - int completed = 0; - Py_ssize_t bytes_needed = 0; - for (int i = 0; i < nsegs; i++) { - if (segments[i].size > (size_t)(PY_SSIZE_T_MAX - bytes_needed)) { - break; - } - bytes_needed += (Py_ssize_t)segments[i].size; - if (nread < bytes_needed) { - break; - } - completed++; - } - return completed; -} - typedef struct { PyTypeObject *RemoteDebugging_Type; PyTypeObject *TaskInfo_Type; @@ -368,10 +350,12 @@ typedef struct { int cache_frames; int collect_stats; // whether to collect statistics uint32_t stale_invalidation_counter; // counter for 
throttling frame_cache_invalidate_stale - InterpreterThreadCacheEntry cached_tstates[INTERPRETER_THREAD_CACHE_SIZE]; + uintptr_t cached_tstate_interpreter_addr; // hot last-interpreter prediction + uintptr_t cached_tstate_addr; // hot first-thread prediction RemoteDebuggingState *cached_state; FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries UnwinderStats stats; // statistics for performance analysis + InterpreterThreadCacheEntry cached_tstates[INTERPRETER_THREAD_CACHE_SIZE]; #ifdef Py_GIL_DISABLED uint32_t tlbc_generation; _Py_hashtable_t *tlbc_cache; diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index a7b94defa9890e..95ea653a7dd904 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -361,6 +361,8 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, self->cache_frames = cache_frames; self->collect_stats = stats; self->stale_invalidation_counter = 0; + self->cached_tstate_interpreter_addr = 0; + self->cached_tstate_addr = 0; memset(self->cached_tstates, 0, sizeof(self->cached_tstates)); self->debug = debug; self->only_active_thread = only_active_thread; @@ -478,13 +480,10 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, static size_t interpreter_thread_cache_index(uintptr_t interpreter_addr) { - // The interpreter ID lives in PyInterpreterState, which is the state we are - // trying to prefetch. At this point the cheap stable key is the remote - // interpreter address, so use it for a small direct-mapped prediction cache. - // The full address is stored in each entry and checked on lookup, so hash - // collisions are detected as misses; storing a colliding entry only replaces - // the previous prediction and cannot return the wrong thread state. - return ((interpreter_addr >> 4) ^ (interpreter_addr >> 12)) + // Direct-mapped table indexed by the remote interpreter address. 
Each entry + // stores the full address and verifies it on lookup, so hash collisions + // degrade to misses and cannot return a wrong tstate. + return (size_t)_Py_HashPointerRaw((const void *)interpreter_addr) & (INTERPRETER_THREAD_CACHE_SIZE - 1); } @@ -497,9 +496,15 @@ get_cached_tstate_for_interpreter( return 0; } + if (self->cached_tstate_interpreter_addr == interpreter_addr) { + return self->cached_tstate_addr; + } + InterpreterThreadCacheEntry *entry = &self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)]; if (entry->interpreter_addr == interpreter_addr) { + self->cached_tstate_interpreter_addr = interpreter_addr; + self->cached_tstate_addr = entry->thread_state_addr; return entry->thread_state_addr; } return 0; @@ -515,6 +520,9 @@ set_cached_tstate_for_interpreter( return; } + self->cached_tstate_interpreter_addr = interpreter_addr; + self->cached_tstate_addr = thread_state_addr; + InterpreterThreadCacheEntry *entry = &self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)]; entry->interpreter_addr = interpreter_addr; @@ -584,8 +592,19 @@ read_interp_state_and_maybe_thread_frame( int nsegs = prefetch->frame_addr != 0 ? 
3 : 2; Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory( &unwinder->handle, segments, nsegs); - int completed = _Py_RemoteDebug_CountCompletedSegments( - segments, nsegs, nread); + int completed = 0; + if (nread >= (Py_ssize_t)INTERP_STATE_BUFFER_SIZE) { + completed = 1; + Py_ssize_t with_tstate = (Py_ssize_t)INTERP_STATE_BUFFER_SIZE + + (Py_ssize_t)tstate_size; + if (nread >= with_tstate) { + completed = 2; + } + if (nsegs == 3 + && nread == with_tstate + (Py_ssize_t)SIZEOF_INTERP_FRAME) { + completed = 3; + } + } STATS_BATCHED_READ(unwinder, nsegs, completed); if (completed >= 1) { if (completed >= 2) { diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 24a9fd2918a0b2..1201a8d4a30c28 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -307,7 +307,13 @@ read_thread_state_and_maybe_frame( }; Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory( &unwinder->handle, segments, 2); - int completed = _Py_RemoteDebug_CountCompletedSegments(segments, 2, nread); + int completed = 0; + if (nread >= (Py_ssize_t)tstate_size) { + completed = 1; + if (nread == (Py_ssize_t)(tstate_size + SIZEOF_INTERP_FRAME)) { + completed = 2; + } + } STATS_BATCHED_READ(unwinder, 2, completed); if (completed >= 1) { *frame_read = completed == 2; @@ -335,9 +341,9 @@ unwind_stack_for_thread( char ts[SIZEOF_THREAD_STATE]; char local_prefetched_frame[SIZEOF_INTERP_FRAME]; RemoteReadPrefetch ctx_prefetch = {0}; - if (prefetch && prefetch->tstate && prefetch->tstate_addr == *current_tstate) { + if (prefetch->tstate && prefetch->tstate_addr == *current_tstate) { memcpy(ts, prefetch->tstate, (size_t)unwinder->debug_offsets.thread_state.size); - if (prefetch->frame && prefetch->frame_addr != 0) { + if (prefetch->frame) { ctx_prefetch.frame = prefetch->frame; ctx_prefetch.frame_addr = prefetch->frame_addr; } From d576d388020bc283a145fea4bd634ec3b2cd9f47 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado 
Date: Wed, 13 May 2026 01:13:59 +0100 Subject: [PATCH 9/9] small cleanup --- Lib/profiling/sampling/sample.py | 8 ++------ Modules/_remote_debugging/_remote_debugging.h | 10 ++++++++-- Modules/_remote_debugging/module.c | 8 ++++---- Modules/_remote_debugging/threads.c | 19 +++++++++++-------- 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py index 980d057cf49dab..235eacd889877b 100644 --- a/Lib/profiling/sampling/sample.py +++ b/Lib/profiling/sampling/sample.py @@ -296,18 +296,14 @@ def _print_unwinder_stats(self): print(f" Hits: {code_hits:n} ({ANSIColors.GREEN}{fmt(code_hits_pct)}%{ANSIColors.RESET})") print(f" Misses: {code_misses:n} ({ANSIColors.RED}{fmt(code_misses_pct)}%{ANSIColors.RESET})") - # Batched remote read stats batched_attempts = stats.get('batched_read_attempts', 0) batched_successes = stats.get('batched_read_successes', 0) batched_misses = stats.get('batched_read_misses', 0) segments_requested = stats.get('batched_read_segments_requested', 0) segments_completed = stats.get('batched_read_segments_completed', 0) - if batched_attempts > 0 or segments_requested > 0: + if batched_attempts > 0: batched_success_rate = stats.get('batched_read_success_rate', 0.0) - batched_miss_rate = ( - (batched_misses / batched_attempts * 100) - if batched_attempts > 0 else 0 - ) + batched_miss_rate = 100.0 - batched_success_rate segment_completion_rate = stats.get( 'batched_read_segment_completion_rate', 0.0 ) diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 4d33d4f976edc1..9eac4bfd7ec6ea 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -244,6 +244,10 @@ typedef struct { uintptr_t thread_state_addr; } InterpreterThreadCacheEntry; +// Carries already-read thread state and/or frame buffers across helpers so the +// downstream callee can skip a remote read. 
Address fields are caller-supplied +// inputs; buffer pointers (tstate, frame) are NULL unless a prior batched read +// successfully populated them. typedef struct { const char *tstate; uintptr_t tstate_addr; @@ -350,8 +354,10 @@ typedef struct { int cache_frames; int collect_stats; // whether to collect statistics uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale - uintptr_t cached_tstate_interpreter_addr; // hot last-interpreter prediction - uintptr_t cached_tstate_addr; // hot first-thread prediction + // L1 single-entry shortcut over cached_tstates[]: most workloads sample one + // interpreter, so check this pair before hashing into the table below. + uintptr_t cached_tstate_interpreter_addr; + uintptr_t cached_tstate_addr; RemoteDebuggingState *cached_state; FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries UnwinderStats stats; // statistics for performance analysis diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 95ea653a7dd904..ea138b03252367 100644 --- a/Modules/_remote_debugging/module.c +++ b/Modules/_remote_debugging/module.c @@ -477,7 +477,7 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self, return 0; } -static size_t +static inline size_t interpreter_thread_cache_index(uintptr_t interpreter_addr) { // Direct-mapped table indexed by the remote interpreter address. 
Each entry @@ -487,7 +487,7 @@ interpreter_thread_cache_index(uintptr_t interpreter_addr) & (INTERPRETER_THREAD_CACHE_SIZE - 1); } -static uintptr_t +static inline uintptr_t get_cached_tstate_for_interpreter( RemoteUnwinderObject *self, uintptr_t interpreter_addr) @@ -510,7 +510,7 @@ get_cached_tstate_for_interpreter( return 0; } -static void +static inline void set_cached_tstate_for_interpreter( RemoteUnwinderObject *self, uintptr_t interpreter_addr, @@ -759,7 +759,7 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self // Target specific thread (only process first interpreter) current_tstate = self->tstate_addr; } - if (current_tstate != 0) { + if (current_tstate != 0 && self->cache_frames) { set_cached_tstate_for_interpreter(self, current_interpreter, current_tstate); } diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c index 1201a8d4a30c28..7284f43042061a 100644 --- a/Modules/_remote_debugging/threads.c +++ b/Modules/_remote_debugging/threads.c @@ -338,11 +338,12 @@ unwind_stack_for_thread( PyObject *result = NULL; StackChunkList chunks = {0}; - char ts[SIZEOF_THREAD_STATE]; + char local_ts[SIZEOF_THREAD_STATE]; char local_prefetched_frame[SIZEOF_INTERP_FRAME]; + const char *ts; RemoteReadPrefetch ctx_prefetch = {0}; if (prefetch->tstate && prefetch->tstate_addr == *current_tstate) { - memcpy(ts, prefetch->tstate, (size_t)unwinder->debug_offsets.thread_state.size); + ts = prefetch->tstate; if (prefetch->frame) { ctx_prefetch.frame = prefetch->frame; ctx_prefetch.frame_addr = prefetch->frame_addr; @@ -356,33 +357,35 @@ unwind_stack_for_thread( predicted_frame_addr = entry->addrs[0]; } - int bytes_read = read_thread_state_and_maybe_frame( + int rc = read_thread_state_and_maybe_frame( unwinder, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, - ts, + local_ts, predicted_frame_addr, local_prefetched_frame, &have_prefetched_frame); - if (bytes_read < 0) { + if (rc < 0) { 
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); goto error; } + ts = local_ts; if (have_prefetched_frame) { ctx_prefetch.frame = local_prefetched_frame; ctx_prefetch.frame_addr = predicted_frame_addr; } } else { - int bytes_read = _Py_RemoteDebug_ReadRemoteMemory( + int rc = _Py_RemoteDebug_ReadRemoteMemory( &unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, - ts); - if (bytes_read < 0) { + local_ts); + if (rc < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state"); goto error; } + ts = local_ts; } STATS_INC(unwinder, memory_reads); STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size);