Skip to content

Commit 0124011

Browse files
committed
gh-149584: Batch predicted profiler reads
Use the frame cache to predict the next thread state and top frame address, then batch interpreter/thread/frame reads with process_vm_readv when profiling a Linux target. Reuse prefetched frame buffers in the frame walker when the prediction is valid. Local benchmark: Tools/inspection/benchmark_external_inspection.py basic, 3s quiet mode, cache_frames/all_threads. Previous commit: 747,946 calls/sec, 688,282 samples/sec, 1.337 us/call. After this commit: 897,703 calls/sec, 806,084 samples/sec, 1.114 us/call. Incremental win: +149,756 calls/sec (+20.0%), -0.223 us/call.
1 parent 5d92b1f commit 0124011

5 files changed

Lines changed: 256 additions & 33 deletions

File tree

Modules/_remote_debugging/_remote_debugging.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ typedef struct {
224224

225225
typedef struct {
226226
uint64_t thread_id; // 0 = empty slot
227+
uintptr_t thread_state_addr;
227228
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
228229
Py_ssize_t num_addrs;
229230
PyObject *frame_list; // owned reference, NULL if empty
@@ -302,6 +303,7 @@ typedef struct {
302303
int cache_frames;
303304
int collect_stats; // whether to collect statistics
304305
uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale
306+
uintptr_t cached_tstate_addr; // predicted first thread for batched reads
305307
RemoteDebuggingState *cached_state;
306308
FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries
307309
UnwinderStats stats; // statistics for performance analysis
@@ -361,11 +363,14 @@ typedef struct {
361363
typedef struct {
362364
/* Inputs */
363365
uintptr_t frame_addr; // Starting frame address
366+
uintptr_t thread_state_addr; // Owning thread state address
364367
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
365368
uintptr_t gc_frame; // GC frame address (0 if not tracking)
366369
uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
367370
StackChunkList *chunks; // Pre-copied stack chunks
368371
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
372+
const char *prefetched_frame; // Optional already-read frame buffer
373+
uintptr_t prefetched_frame_addr; // Remote address for prefetched_frame
369374

370375
/* Outputs */
371376
PyObject *frame_info; // List to append FrameInfo objects
@@ -548,6 +553,7 @@ extern int process_frame_chain(
548553
extern int frame_cache_init(RemoteUnwinderObject *unwinder);
549554
extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
550555
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
556+
extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
551557
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
552558
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
553559
extern int frame_cache_lookup_and_extend(
@@ -566,6 +572,7 @@ extern int frame_cache_store(
566572
PyObject *frame_list,
567573
const uintptr_t *addrs,
568574
Py_ssize_t num_addrs,
575+
uintptr_t thread_state_addr,
569576
uintptr_t base_frame_addr,
570577
uintptr_t last_frame_visited);
571578

@@ -605,7 +612,11 @@ extern PyObject* unwind_stack_for_thread(
605612
uintptr_t *current_tstate,
606613
uintptr_t gil_holder_tstate,
607614
uintptr_t gc_frame,
608-
uintptr_t main_thread_tstate
615+
uintptr_t main_thread_tstate,
616+
const char *prefetched_tstate,
617+
uintptr_t prefetched_tstate_addr,
618+
const char *prefetched_frame,
619+
uintptr_t prefetched_frame_addr
609620
);
610621

611622
/* Thread stopping functions (for blocking mode) */

Modules/_remote_debugging/frame_cache.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,21 @@ frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
5353
return NULL;
5454
}
5555

56+
FrameCacheEntry *
57+
frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
58+
{
59+
if (!unwinder->frame_cache || tstate_addr == 0) {
60+
return NULL;
61+
}
62+
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
63+
if (unwinder->frame_cache[i].thread_state_addr == tstate_addr) {
64+
assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES);
65+
return &unwinder->frame_cache[i];
66+
}
67+
}
68+
return NULL;
69+
}
70+
5671
// Allocate a cache slot for a thread
5772
// Returns NULL if cache is full (graceful degradation)
5873
static FrameCacheEntry *
@@ -129,6 +144,7 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
129144
// Clear this entry
130145
Py_CLEAR(unwinder->frame_cache[i].frame_list);
131146
unwinder->frame_cache[i].thread_id = 0;
147+
unwinder->frame_cache[i].thread_state_addr = 0;
132148
unwinder->frame_cache[i].num_addrs = 0;
133149
STATS_INC(unwinder, stale_cache_invalidations);
134150
}
@@ -216,6 +232,7 @@ frame_cache_store(
216232
PyObject *frame_list,
217233
const uintptr_t *addrs,
218234
Py_ssize_t num_addrs,
235+
uintptr_t thread_state_addr,
219236
uintptr_t base_frame_addr,
220237
uintptr_t last_frame_visited)
221238
{
@@ -257,6 +274,7 @@ frame_cache_store(
257274
return -1;
258275
}
259276
entry->thread_id = thread_id;
277+
entry->thread_state_addr = thread_state_addr;
260278
memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
261279
entry->num_addrs = num_addrs;
262280
assert(entry->num_addrs == num_addrs);

Modules/_remote_debugging/frames.c

Lines changed: 58 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -186,30 +186,16 @@ is_frame_valid(
186186
return 1;
187187
}
188188

189-
int
190-
parse_frame_object(
189+
static int
190+
parse_frame_buffer(
191191
RemoteUnwinderObject *unwinder,
192192
PyObject** result,
193-
uintptr_t address,
193+
const char *frame,
194194
uintptr_t* address_of_code_object,
195195
uintptr_t* previous_frame
196196
) {
197-
char frame[SIZEOF_INTERP_FRAME];
198197
*address_of_code_object = 0;
199198

200-
Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory(
201-
&unwinder->handle,
202-
address,
203-
SIZEOF_INTERP_FRAME,
204-
frame
205-
);
206-
if (bytes_read < 0) {
207-
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
208-
return -1;
209-
}
210-
STATS_INC(unwinder, memory_reads);
211-
STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
212-
213199
*previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous);
214200
uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable);
215201
int frame_valid = is_frame_valid(unwinder, (uintptr_t)frame, code_object);
@@ -237,6 +223,31 @@ parse_frame_object(
237223
return parse_code_object(unwinder, result, &code_ctx);
238224
}
239225

226+
int
227+
parse_frame_object(
228+
RemoteUnwinderObject *unwinder,
229+
PyObject** result,
230+
uintptr_t address,
231+
uintptr_t* address_of_code_object,
232+
uintptr_t* previous_frame
233+
) {
234+
char frame[SIZEOF_INTERP_FRAME];
235+
Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory(
236+
&unwinder->handle,
237+
address,
238+
SIZEOF_INTERP_FRAME,
239+
frame
240+
);
241+
if (bytes_read < 0) {
242+
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
243+
return -1;
244+
}
245+
STATS_INC(unwinder, memory_reads);
246+
STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
247+
248+
return parse_frame_buffer(unwinder, result, frame, address_of_code_object, previous_frame);
249+
}
250+
240251
int
241252
parse_frame_from_chunks(
242253
RemoteUnwinderObject *unwinder,
@@ -312,15 +323,32 @@ process_frame_chain(
312323
}
313324
assert(frame_count <= MAX_FRAMES);
314325

315-
if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) < 0) {
326+
if (ctx->chunks && ctx->chunks->count > 0) {
327+
if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) == 0) {
328+
goto parsed_frame;
329+
}
316330
PyErr_Clear();
331+
}
332+
{
317333
uintptr_t address_of_code_object = 0;
318-
if (parse_frame_object(unwinder, &frame, frame_addr, &address_of_code_object, &next_frame_addr) < 0) {
334+
int parse_result;
335+
if (ctx->prefetched_frame && ctx->prefetched_frame_addr == frame_addr) {
336+
parse_result = parse_frame_buffer(
337+
unwinder, &frame, ctx->prefetched_frame,
338+
&address_of_code_object, &next_frame_addr);
339+
}
340+
else {
341+
parse_result = parse_frame_object(
342+
unwinder, &frame, frame_addr,
343+
&address_of_code_object, &next_frame_addr);
344+
}
345+
if (parse_result < 0) {
319346
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to parse frame object in chain");
320347
return -1;
321348
}
322349
}
323350

351+
parsed_frame:
324352
// Skip first frame if requested (used for cache miss continuation)
325353
if (ctx->skip_first_frame && frame_count == 1) {
326354
Py_XDECREF(frame);
@@ -501,8 +529,16 @@ try_full_cache_hit(
501529
PyObject *current_frame = NULL;
502530
uintptr_t code_object_addr = 0;
503531
uintptr_t previous_frame = 0;
504-
int parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr,
532+
int parse_result;
533+
if (ctx->prefetched_frame && ctx->prefetched_frame_addr == ctx->frame_addr) {
534+
parse_result = parse_frame_buffer(unwinder, &current_frame,
535+
ctx->prefetched_frame,
505536
&code_object_addr, &previous_frame);
537+
}
538+
else {
539+
parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr,
540+
&code_object_addr, &previous_frame);
541+
}
506542
if (parse_result < 0) {
507543
return -1;
508544
}
@@ -606,7 +642,8 @@ collect_frames_with_cache(
606642
}
607643

608644
if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs,
609-
ctx->base_frame_addr, ctx->last_frame_visited) < 0) {
645+
ctx->thread_state_addr, ctx->base_frame_addr,
646+
ctx->last_frame_visited) < 0) {
610647
return -1;
611648
}
612649

Modules/_remote_debugging/module.c

Lines changed: 77 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
360360
self->cache_frames = cache_frames;
361361
self->collect_stats = stats;
362362
self->stale_invalidation_counter = 0;
363+
self->cached_tstate_addr = 0;
363364
self->debug = debug;
364365
self->only_active_thread = only_active_thread;
365366
self->mode = mode;
@@ -473,6 +474,52 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
473474
return 0;
474475
}
475476

477+
static int
478+
read_interp_state_and_maybe_thread_frame(
479+
RemoteUnwinderObject *unwinder,
480+
uintptr_t interpreter_addr,
481+
char *interp_state_buffer,
482+
uintptr_t predicted_tstate_addr,
483+
char *tstate_buffer,
484+
int *tstate_read,
485+
uintptr_t predicted_frame_addr,
486+
char *frame_buffer,
487+
int *frame_read)
488+
{
489+
*tstate_read = 0;
490+
*frame_read = 0;
491+
#if defined(__linux__) && HAVE_PROCESS_VM_READV
492+
if (predicted_tstate_addr != 0 && unwinder->handle.memfd == -1) {
493+
struct iovec local[3] = {
494+
{.iov_base = interp_state_buffer, .iov_len = INTERP_STATE_BUFFER_SIZE},
495+
{.iov_base = tstate_buffer,
496+
.iov_len = (size_t)unwinder->debug_offsets.thread_state.size},
497+
{.iov_base = frame_buffer, .iov_len = SIZEOF_INTERP_FRAME},
498+
};
499+
struct iovec remote[3] = {
500+
{.iov_base = (void *)interpreter_addr, .iov_len = INTERP_STATE_BUFFER_SIZE},
501+
{.iov_base = (void *)predicted_tstate_addr,
502+
.iov_len = (size_t)unwinder->debug_offsets.thread_state.size},
503+
{.iov_base = (void *)predicted_frame_addr, .iov_len = SIZEOF_INTERP_FRAME},
504+
};
505+
int iovcnt = predicted_frame_addr != 0 ? 3 : 2;
506+
ssize_t nread = process_vm_readv(unwinder->handle.pid, local, iovcnt, remote, iovcnt, 0);
507+
if (nread >= (ssize_t)INTERP_STATE_BUFFER_SIZE) {
508+
Py_ssize_t with_tstate = (Py_ssize_t)INTERP_STATE_BUFFER_SIZE
509+
+ (Py_ssize_t)unwinder->debug_offsets.thread_state.size;
510+
*tstate_read = nread >= with_tstate;
511+
*frame_read = iovcnt == 3 && nread == with_tstate + (ssize_t)SIZEOF_INTERP_FRAME;
512+
return 0;
513+
}
514+
}
515+
#endif
516+
return _Py_RemoteDebug_ReadRemoteMemory(
517+
&unwinder->handle,
518+
interpreter_addr,
519+
INTERP_STATE_BUFFER_SIZE,
520+
interp_state_buffer);
521+
}
522+
476523
/*[clinic input]
477524
@permit_long_docstring_body
478525
@critical_section
@@ -537,11 +584,29 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
537584
while (current_interpreter != 0) {
538585
// Read interpreter state to get the interpreter ID
539586
char interp_state_buffer[INTERP_STATE_BUFFER_SIZE];
540-
if (_Py_RemoteDebug_ReadRemoteMemory(
541-
&self->handle,
587+
char prefetched_tstate[SIZEOF_THREAD_STATE];
588+
char prefetched_frame[SIZEOF_INTERP_FRAME];
589+
int have_prefetched_tstate = 0;
590+
int have_prefetched_frame = 0;
591+
uintptr_t predicted_tstate_addr = self->cache_frames ? self->cached_tstate_addr : 0;
592+
uintptr_t predicted_frame_addr = 0;
593+
if (predicted_tstate_addr != 0) {
594+
FrameCacheEntry *entry = frame_cache_find_by_tstate(self, predicted_tstate_addr);
595+
if (entry && entry->num_addrs > 0) {
596+
predicted_frame_addr = entry->addrs[0];
597+
}
598+
}
599+
600+
if (read_interp_state_and_maybe_thread_frame(
601+
self,
542602
current_interpreter,
543-
INTERP_STATE_BUFFER_SIZE,
544-
interp_state_buffer) < 0) {
603+
interp_state_buffer,
604+
predicted_tstate_addr,
605+
prefetched_tstate,
606+
&have_prefetched_tstate,
607+
predicted_frame_addr,
608+
prefetched_frame,
609+
&have_prefetched_frame) < 0) {
545610
set_exception_cause(self, PyExc_RuntimeError, "Failed to read interpreter state buffer");
546611
Py_CLEAR(result);
547612
goto exit;
@@ -611,6 +676,9 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
611676
// Target specific thread (only process first interpreter)
612677
current_tstate = self->tstate_addr;
613678
}
679+
if (current_tstate != 0) {
680+
self->cached_tstate_addr = current_tstate;
681+
}
614682

615683
// Acquire main thread state information
616684
uintptr_t main_thread_tstate = GET_MEMBER(uintptr_t, interp_state_buffer,
@@ -621,7 +689,11 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
621689
PyObject* frame_info = unwind_stack_for_thread(self, &current_tstate,
622690
gil_holder_tstate,
623691
gc_frame,
624-
main_thread_tstate);
692+
main_thread_tstate,
693+
have_prefetched_tstate ? prefetched_tstate : NULL,
694+
predicted_tstate_addr,
695+
have_prefetched_frame ? prefetched_frame : NULL,
696+
predicted_frame_addr);
625697
if (!frame_info) {
626698
// Check if this was an intentional skip due to mode-based filtering
627699
if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL ||

0 commit comments

Comments
 (0)