diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index f27ec4350bb2c8..ded6be47837e8b 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -108,6 +108,10 @@ extern PyStatus _PyPerfTrampoline_AfterFork_Child(void); #ifdef PY_HAVE_PERF_TRAMPOLINE extern _PyPerf_Callbacks _Py_perfmap_callbacks; extern _PyPerf_Callbacks _Py_perfmap_jit_callbacks; +extern void _PyPerfJit_WriteNamedCode(const void *code_addr, + unsigned int code_size, + const char *entry, + const char *filename); #endif static inline PyObject* diff --git a/Include/internal/pycore_jit_unwind.h b/Include/internal/pycore_jit_unwind.h new file mode 100644 index 00000000000000..fdd0dce3a5d15c --- /dev/null +++ b/Include/internal/pycore_jit_unwind.h @@ -0,0 +1,28 @@ +#ifndef Py_CORE_JIT_UNWIND_H +#define Py_CORE_JIT_UNWIND_H + +#ifdef PY_HAVE_PERF_TRAMPOLINE + +#include + +/* Return the size of the generated .eh_frame data for the given encoding. */ +size_t _PyJitUnwind_EhFrameSize(int absolute_addr); + +/* + * Build DWARF .eh_frame data for JIT code; returns size written or 0 on error. + * absolute_addr selects the FDE address encoding: + * - 0: PC-relative offsets (perf jitdump synthesized DSO). + * - nonzero: absolute addresses (GDB JIT in-memory ELF). 
+ */ +size_t _PyJitUnwind_BuildEhFrame(uint8_t *buffer, size_t buffer_size, + const void *code_addr, size_t code_size, + int absolute_addr); + +void _PyJitUnwind_GdbRegisterCode(const void *code_addr, + unsigned int code_size, + const char *entry, + const char *filename); + +#endif // PY_HAVE_PERF_TRAMPOLINE + +#endif // Py_CORE_JIT_UNWIND_H diff --git a/Lib/test/test_gdb/gdb_jit_sample.py b/Lib/test/test_gdb/gdb_jit_sample.py new file mode 100644 index 00000000000000..ad9a0634aa3c61 --- /dev/null +++ b/Lib/test/test_gdb/gdb_jit_sample.py @@ -0,0 +1,20 @@ +# Sample script for use by test_gdb.test_jit + +import operator +import sys + + +def jit_bt_hot(depth, warming_up_caller=False): + if warming_up_caller: + return + if depth == 0: + id(42) + return + + warming_up = True + while warming_up: + warming_up = sys._jit.is_enabled() & (not sys._jit.is_active()) + operator.call(jit_bt_hot, depth - 1, warming_up) + + +jit_bt_hot(10) diff --git a/Lib/test/test_gdb/test_jit.py b/Lib/test/test_gdb/test_jit.py new file mode 100644 index 00000000000000..872b838c9571f6 --- /dev/null +++ b/Lib/test/test_gdb/test_jit.py @@ -0,0 +1,35 @@ +import os +import re +import sys +import unittest + +from .util import setup_module, DebuggerTests + + +JIT_SAMPLE_SCRIPT = os.path.join(os.path.dirname(__file__), "gdb_jit_sample.py") + + +def setUpModule(): + setup_module() + + +@unittest.skipUnless( + hasattr(sys, "_jit") and sys._jit.is_available(), + "requires a JIT-enabled build", +) +class JitBacktraceTests(DebuggerTests): + def test_bt_unwinds_through_jit_frames(self): + gdb_output = self.get_stack_trace( + script=JIT_SAMPLE_SCRIPT, + cmds_after_breakpoint=["bt"], + PYTHON_JIT="1", + ) + self.assertIn("py::jit_executor:", gdb_output) + self.assertIn("py::jit_shim:", gdb_output) + self.assertRegex( + gdb_output, + re.compile( + r"py::jit_executor:.*(_PyEval_EvalFrameDefault|_PyEval_Vector)", + re.DOTALL, + ), + ) diff --git a/Lib/test/test_gdb/util.py b/Lib/test/test_gdb/util.py index 
8097fd52ababe6..fe8f9146bf1eb0 100644 --- a/Lib/test/test_gdb/util.py +++ b/Lib/test/test_gdb/util.py @@ -160,7 +160,8 @@ def get_stack_trace(self, source=None, script=None, breakpoint=BREAKPOINT_FN, cmds_after_breakpoint=None, import_site=False, - ignore_stderr=False): + ignore_stderr=False, + **env_vars): ''' Run 'python -c SOURCE' under gdb with a breakpoint. @@ -239,7 +240,7 @@ def get_stack_trace(self, source=None, script=None, args += [script] # Use "args" to invoke gdb, capturing stdout, stderr: - out, err = run_gdb(*args, PYTHONHASHSEED=PYTHONHASHSEED) + out, err = run_gdb(*args, PYTHONHASHSEED=PYTHONHASHSEED, **env_vars) if not ignore_stderr: for line in err.splitlines(): diff --git a/Makefile.pre.in b/Makefile.pre.in index 5ea00537629de0..b1d85c10890bbe 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -510,6 +510,7 @@ PYTHON_OBJS= \ Python/suggestions.o \ Python/perf_trampoline.o \ Python/perf_jit_trampoline.o \ + Python/jit_unwind.o \ Python/remote_debugging.o \ Python/$(DYNLOADFILE) \ $(LIBOBJS) \ diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst new file mode 100644 index 00000000000000..57e897cabc494b --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst @@ -0,0 +1 @@ +Add support for unwinding JIT frames using GDB. 
Patch by Diego Russo diff --git a/Python/jit.c b/Python/jit.c index 4990c743224d3c..f6ad1565a2d109 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -15,6 +15,7 @@ #include "pycore_interpframe.h" #include "pycore_interpolation.h" #include "pycore_intrinsics.h" +#include "pycore_jit_unwind.h" #include "pycore_lazyimportobject.h" #include "pycore_list.h" #include "pycore_long.h" @@ -60,6 +61,28 @@ jit_error(const char *message) PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint); } +static void +jit_record_code(const void *code_addr, size_t code_size, + const char *entry, const char *filename) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + _PyPerf_Callbacks callbacks; + _PyPerfTrampoline_GetCallbacks(&callbacks); + if (callbacks.write_state == _Py_perfmap_jit_callbacks.write_state) { + _PyPerfJit_WriteNamedCode( + code_addr, (unsigned int)code_size, entry, filename); + return; + } + _PyJitUnwind_GdbRegisterCode( + code_addr, (unsigned int)code_size, entry, filename); +#else + (void)code_addr; + (void)code_size; + (void)entry; + (void)filename; +#endif +} + static size_t _Py_jit_shim_size = 0; static int @@ -731,6 +754,10 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz } executor->jit_code = memory; executor->jit_size = total_size; + jit_record_code(memory, + code_size + state.trampolines.size, + "jit_executor", + ""); return 0; } @@ -781,6 +808,10 @@ compile_shim(void) return NULL; } _Py_jit_shim_size = total_size; + jit_record_code(memory, + code_size + state.trampolines.size, + "jit_shim", + ""); return (_PyJitEntryFuncPtr)memory; } diff --git a/Python/jit_unwind.c b/Python/jit_unwind.c new file mode 100644 index 00000000000000..3ad682c03e7575 --- /dev/null +++ b/Python/jit_unwind.c @@ -0,0 +1,899 @@ +/* + * Python JIT - DWARF .eh_frame builder + * + * This file contains the DWARF CFI generator used to build .eh_frame + * data for JIT code (perf jitdump and other unwinders). 
+ */ + +#include "Python.h" +#include "pycore_jit_unwind.h" + +#ifdef PY_HAVE_PERF_TRAMPOLINE + +#if defined(__linux__) +# include +#endif +#include +#include + +// ============================================================================= +// DWARF CONSTANTS +// ============================================================================= + +/* + * DWARF (Debug With Arbitrary Record Formats) constants + * + * DWARF is a debugging data format used to provide stack unwinding information. + * These constants define the various encoding types and opcodes used in + * DWARF Call Frame Information (CFI) records. + */ + +/* DWARF Call Frame Information version */ +#define DWRF_CIE_VERSION 1 + +/* DWARF CFA (Call Frame Address) opcodes */ +enum { + DWRF_CFA_nop = 0x0, // No operation + DWRF_CFA_offset_extended = 0x5, // Extended offset instruction + DWRF_CFA_def_cfa = 0xc, // Define CFA rule + DWRF_CFA_def_cfa_register = 0xd, // Define CFA register + DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset + DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset + DWRF_CFA_advance_loc = 0x40, // Advance location counter + DWRF_CFA_offset = 0x80, // Simple offset instruction + DWRF_CFA_restore = 0xc0 // Restore register +}; + +/* DWARF Exception Handling pointer encodings */ +enum { + DWRF_EH_PE_absptr = 0x00, // Absolute pointer + DWRF_EH_PE_omit = 0xff, // Omitted value + + /* Data type encodings */ + DWRF_EH_PE_uleb128 = 0x01, // Unsigned LEB128 + DWRF_EH_PE_udata2 = 0x02, // Unsigned 2-byte + DWRF_EH_PE_udata4 = 0x03, // Unsigned 4-byte + DWRF_EH_PE_udata8 = 0x04, // Unsigned 8-byte + DWRF_EH_PE_sleb128 = 0x09, // Signed LEB128 + DWRF_EH_PE_sdata2 = 0x0a, // Signed 2-byte + DWRF_EH_PE_sdata4 = 0x0b, // Signed 4-byte + DWRF_EH_PE_sdata8 = 0x0c, // Signed 8-byte + DWRF_EH_PE_signed = 0x08, // Signed flag + + /* Reference type encodings */ + DWRF_EH_PE_pcrel = 0x10, // PC-relative + DWRF_EH_PE_textrel = 0x20, // Text-relative + DWRF_EH_PE_datarel = 0x30, // 
Data-relative + DWRF_EH_PE_funcrel = 0x40, // Function-relative + DWRF_EH_PE_aligned = 0x50, // Aligned + DWRF_EH_PE_indirect = 0x80 // Indirect +}; + +/* + * Architecture-specific DWARF register numbers + * + * These constants define the register numbering scheme used by DWARF + * for each supported architecture. The numbers must match the ABI + * specification for proper stack unwinding. + */ +enum { +#ifdef __x86_64__ + /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ + DWRF_REG_AX, // RAX + DWRF_REG_DX, // RDX + DWRF_REG_CX, // RCX + DWRF_REG_BX, // RBX + DWRF_REG_SI, // RSI + DWRF_REG_DI, // RDI + DWRF_REG_BP, // RBP + DWRF_REG_SP, // RSP + DWRF_REG_8, // R8 + DWRF_REG_9, // R9 + DWRF_REG_10, // R10 + DWRF_REG_11, // R11 + DWRF_REG_12, // R12 + DWRF_REG_13, // R13 + DWRF_REG_14, // R14 + DWRF_REG_15, // R15 + DWRF_REG_RA, // Return address (RIP) +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 register numbering */ + DWRF_REG_FP = 29, // Frame Pointer + DWRF_REG_RA = 30, // Link register (return address) + DWRF_REG_SP = 31, // Stack pointer +#else +# error "Unsupported target architecture" +#endif +}; + +// ============================================================================= +// ELF OBJECT CONTEXT +// ============================================================================= + +/* + * Context for building ELF/DWARF structures + * + * This structure maintains state while constructing DWARF unwind information. + * It acts as a simple buffer manager with pointers to track current position + * and important landmarks within the buffer. 
+ */ +typedef struct ELFObjectContext { + uint8_t* p; // Current write position in buffer + uint8_t* startp; // Start of buffer (for offset calculations) + uint8_t* eh_frame_p; // Start of EH frame data (for relative offsets) + uint8_t* fde_p; // Start of FDE data (for PC-relative calculations) + uintptr_t code_addr; // Address of the code section + size_t code_size; // Size of the code section +} ELFObjectContext; + +// ============================================================================= +// DWARF GENERATION UTILITIES +// ============================================================================= + +/* + * Append a null-terminated string to the ELF context buffer. + * + * Args: + * ctx: ELF object context + * str: String to append (must be null-terminated) + * + * Returns: Offset from start of buffer where string was written + */ +static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { + uint8_t* p = ctx->p; + uint32_t ofs = (uint32_t)(p - ctx->startp); + + /* Copy string including null terminator */ + do { + *p++ = (uint8_t)*str; + } while (*str++); + + ctx->p = p; + return ofs; +} + +/* + * Append a SLEB128 (Signed Little Endian Base 128) value + * + * SLEB128 is a variable-length encoding used extensively in DWARF. + * It efficiently encodes small numbers in fewer bytes. + * + * Args: + * ctx: ELF object context + * v: Signed value to encode + */ +static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { + uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ + for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { + *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit + } + *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit + + ctx->p = p; +} + +/* + * Append a ULEB128 (Unsigned Little Endian Base 128) value + * + * Similar to SLEB128 but for unsigned values. 
+ * + * Args: + * ctx: ELF object context + * v: Unsigned value to encode + */ +static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { + uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ + for (; v >= 0x80; v >>= 7) { + *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit + } + *p++ = (char)v; // Final byte without continuation bit + + ctx->p = p; +} + +/* + * Macros for generating DWARF structures + * + * These macros provide a convenient way to write various data types + * to the DWARF buffer while automatically advancing the pointer. + */ +#define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit +#define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit +#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit +#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit +#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address +#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 +#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 +#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string + +/* Align to specified boundary with NOP instructions */ +#define DWRF_ALIGNNOP(s) \ + while ((uintptr_t)p & ((s)-1)) { \ + *p++ = DWRF_CFA_nop; \ + } + +/* Write a DWARF section with automatic size calculation */ +#define DWRF_SECTION(name, stmt) \ + { \ + uint32_t* szp_##name = (uint32_t*)p; \ + p += 4; \ + stmt; \ + *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ + } + +// ============================================================================= +// DWARF EH FRAME GENERATION +// ============================================================================= + +static void elf_init_ehframe(ELFObjectContext* ctx, int absolute_addr); + +size_t +_PyJitUnwind_EhFrameSize(int absolute_addr) +{ + /* The .eh_frame we emit is 
small and bounded; keep a generous buffer. */ + uint8_t scratch[512]; + _Static_assert(sizeof(scratch) >= 256, + "scratch buffer may be too small for elf_init_ehframe"); + ELFObjectContext ctx; + ctx.code_size = 1; + ctx.code_addr = 0; + ctx.startp = ctx.p = scratch; + ctx.eh_frame_p = NULL; + ctx.fde_p = NULL; + /* Generate once into scratch to learn the required size. */ + elf_init_ehframe(&ctx, absolute_addr); + ptrdiff_t size = ctx.p - ctx.startp; + assert(size <= (ptrdiff_t)sizeof(scratch)); + return (size_t)size; +} + +size_t +_PyJitUnwind_BuildEhFrame(uint8_t *buffer, size_t buffer_size, + const void *code_addr, size_t code_size, + int absolute_addr) +{ + if (buffer == NULL || code_addr == NULL || code_size == 0) { + return 0; + } + /* Generate the frame twice: once to size-check, once to write. */ + size_t required = _PyJitUnwind_EhFrameSize(absolute_addr); + if (required == 0 || required > buffer_size) { + return 0; + } + ELFObjectContext ctx; + ctx.code_size = code_size; + ctx.code_addr = (uintptr_t)code_addr; + ctx.startp = ctx.p = buffer; + ctx.eh_frame_p = NULL; + ctx.fde_p = NULL; + elf_init_ehframe(&ctx, absolute_addr); + size_t written = (size_t)(ctx.p - ctx.startp); + /* The frame size is independent of code_addr/code_size (fixed-width fields). */ + assert(written == required); + return written; +} + +/* + * Generate a minimal .eh_frame for a single JIT code region. + * + * The .eh_frame section contains Call Frame Information (CFI) that describes + * how to unwind the stack at any point in the code. This is essential for + * unwinding through JIT-generated code. + * + * The generated data contains: + * 1. A CIE (Common Information Entry) describing the calling convention. + * 2. An FDE (Frame Description Entry) describing how to unwind the JIT frame. + * + * The caller selects the FDE address encoding through absolute_addr: + * - 0: emit PC-relative addresses for perf's synthesized DSO layout. 
+ * - 1: emit absolute addresses for the GDB JIT in-memory ELF. + */ +static void elf_init_ehframe(ELFObjectContext* ctx, int absolute_addr) { + int fde_ptr_enc = absolute_addr + ? DWRF_EH_PE_absptr + : (DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); + uint8_t* p = ctx->p; + uint8_t* framep = p; // Remember start of frame data + + /* + * DWARF Unwind Table for Trampoline Function + * + * This section defines DWARF Call Frame Information (CFI) using encoded macros + * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function + * preserves and restores registers. This is used by profiling tools (e.g., `perf`) + * and debuggers for stack unwinding in JIT-compiled code. + * + * ------------------------------------------------- + * TO REGENERATE THIS TABLE FROM GCC OBJECTS: + * ------------------------------------------------- + * + * 1. Create a trampoline source file (e.g., `trampoline.c`): + * + * #include <Python.h> + * typedef PyObject* (*py_evaluator)(void*, void*, int); + * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { + * return evaluator(ts, f, throwflag); + * } + * + * 2. Compile to an object file with frame pointer preservation: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * 3. Extract DWARF unwind info from the object file: + * + * readelf -w trampoline.o + * + * Example output from `.eh_frame` (AArch64 build): + * + * 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * 00000014 FDE cie=00000000 pc=0..14 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. 
+ * + * ---------------------------------- + * HOW TO TRANSLATE TO DWRF_* MACROS: + * ---------------------------------- + * + * After compiling your trampoline with: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * run: + * + * readelf -w trampoline.o + * + * to inspect the generated `.eh_frame` data. You will see two main components: + * + * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. + * 2. An FDE (Frame Description Entry): function-specific unwind instructions. + * + * --------------------- + * Translating the CIE: + * --------------------- + * From `readelf -w`, you might see: + * + * 00000000 0000000000000010 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * Augmentation data: 1b + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * Map this to: + * + * DWRF_SECTION(CIE, + * DWRF_U32(0); // CIE ID (always 0 for CIEs) + * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 + * DWRF_STR("zR"); // Augmentation string "zR" + * DWRF_UV(4); // Code alignment factor = 4 + * DWRF_SV(-8); // Data alignment factor = -8 + * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) + * DWRF_UV(1); // Augmentation data length = 1 + * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers + * + * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa + * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) + * DWRF_UV(0); // Offset = 0 + * + * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary + * ) + * + * Notes: + * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. + * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. 
+ * + * --------------------- + * Translating the FDE: + * --------------------- + * From `readelf -w`: + * + * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * Map the FDE header and instructions to: + * + * DWRF_SECTION(FDE, + * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) + * DWRF_U32(pc_relative_offset); // PC-relative location of the code (calculated dynamically) + * DWRF_U32(ctx->code_size); // Code range covered by this FDE + * DWRF_U8(0); // Augmentation data length (none) + * + * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 + * DWRF_UV(16); + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) + * DWRF_UV(2); // At offset 2 * 8 = 16 bytes + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) + * DWRF_UV(1); // At offset 1 * 8 = 8 bytes + * + * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) + * + * DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 (single byte, no DWRF_UV operand) + * DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 (single byte, no DWRF_UV operand) + * + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + * DWRF_UV(0); + * ) + * + * To regenerate: + * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. + * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as + * the code is in a different address space every time. + * 3. 
For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: + * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) + * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) + * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_restore | reg) // single byte; no DWRF_UV operand follows + * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) + * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. + * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. + */ + + /* + * Emit DWARF EH CIE (Common Information Entry) + * + * The CIE describes the calling conventions and basic unwinding rules + * that apply to all functions in this compilation unit. + */ + DWRF_SECTION(CIE, + DWRF_U32(0); // CIE ID (0 indicates this is a CIE) + DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) + DWRF_STR("zR"); // Augmentation string ("z" = augmentation data present, "R" = FDE pointer encoding follows) +#ifdef __x86_64__ + DWRF_UV(1); // Code alignment factor (x86_64: 1 byte) +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + DWRF_UV(4); // Code alignment factor (AArch64: 4 bytes per instruction) +#endif + DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) + DWRF_U8(DWRF_REG_RA); // Return address register number + DWRF_UV(1); // Augmentation data length + DWRF_U8(fde_ptr_enc); // FDE pointer encoding + + /* Initial CFI instructions - describe default calling convention */ +#ifdef __x86_64__ + /* x86_64 initial CFI state */ + DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Canonical Frame Address) + DWRF_UV(DWRF_REG_SP); // CFA = SP register + DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size + DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved + DWRF_UV(1); // At offset 1 from CFA +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 initial CFI state */ + DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Canonical Frame Address) + 
DWRF_UV(DWRF_REG_SP); // CFA = SP register + DWRF_UV(0); // CFA = SP + 0 (AArch64 starts with offset 0) + // No initial register saves in AArch64 CIE +#endif + DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary + ) + + ctx->eh_frame_p = p; // Remember start of FDE data + + /* + * Emit DWARF EH FDE (Frame Description Entry) + * + * The FDE describes unwinding information specific to this function. + * It references the CIE and provides function-specific CFI instructions. + * + * The PC-relative offset is calculated after the entire EH frame is built + * to ensure accurate positioning relative to the synthesized DSO layout. + */ + DWRF_SECTION(FDE, + DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) + if (absolute_addr) { + DWRF_ADDR(ctx->code_addr); // Absolute code start + DWRF_ADDR((uintptr_t)ctx->code_size); // Code range covered + } + else { + /* + * In perf jitdump mode (absolute_addr == 0), the FDE PC field is + * encoded PC-relative and points back to code_start. For the GDB + * JIT interface we reuse the same generator with absolute_addr == 1; + * the EH frame is then carried in a .eh_frame section of an + * in-memory ELF (no EhFrameHeader). + */ + ctx->fde_p = p; // Remember where PC offset field is located for later calculation + DWRF_U32(0); // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe) + DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length) + } + DWRF_U8(0); // Augmentation data length (none) + + /* + * Architecture-specific CFI instructions + * + * These instructions describe how registers are saved and restored + * during function calls. Each architecture has different calling + * conventions and register usage patterns. 
+ */ +#ifdef __x86_64__ + /* x86_64 calling convention unwinding rules with frame pointer */ +# if defined(__CET__) && (__CET__ & 1) + DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance past endbr64 (4 bytes) +# endif + DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance past push %rbp (1 byte) + DWRF_U8(DWRF_CFA_def_cfa_offset); // def_cfa_offset 16 + DWRF_UV(16); // New offset: SP + 16 + DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16 + DWRF_UV(2); // Offset factor: 2 * 8 = 16 bytes + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past mov %rsp,%rbp (3 bytes) + DWRF_U8(DWRF_CFA_def_cfa_register); // def_cfa_register r6 + DWRF_UV(DWRF_REG_BP); // Use base pointer register + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3 + DWRF_U8(DWRF_CFA_def_cfa); // def_cfa r7 ofs 8 + DWRF_UV(DWRF_REG_SP); // Use stack pointer register + DWRF_UV(8); // New offset: SP + 8 +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 calling convention unwinding rules */ + DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance by 1 instruction (4 bytes) + DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 + DWRF_UV(16); // Stack pointer moved by 16 bytes + DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // x29 (frame pointer) saved + DWRF_UV(2); // At CFA-16 (2 * 8 = 16 bytes from CFA) + DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // x30 (link register) saved + DWRF_UV(1); // At CFA-8 (1 * 8 = 8 bytes from CFA) + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (12 bytes) + DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 - NO DWRF_UV() after this! + DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 - NO DWRF_UV() after this! 
+ DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 0 (stack restored) + DWRF_UV(0); // Back to original stack position + + DWRF_U8(DWRF_CFA_def_cfa_register); // CFA = FP (x29) + DWRF_UV(DWRF_REG_FP); + DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = FP + 16 + DWRF_UV(16); + DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // x29 saved + DWRF_UV(2); + DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // x30 saved + DWRF_UV(1); + +#else +# error "Unsupported target architecture" +#endif + + DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary + ) + + ctx->p = p; // Update context pointer to end of generated data + + /* Calculate and update the PC-relative offset in the FDE + * + * When perf processes the jitdump, it creates a synthesized DSO with this layout: + * + * Synthesized DSO Memory Layout: + * ┌─────────────────────────────────────────────────────────────┐ < code_start + * │ Code Section │ + * │ (round_up(code_size, 8) bytes) │ + * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data + * │ EH Frame Data │ + * │ ┌─────────────────────────────────────────────────────┐ │ + * │ │ CIE data │ │ + * │ └─────────────────────────────────────────────────────┘ │ + * │ ┌─────────────────────────────────────────────────────┐ │ + * │ │ FDE Header: │ │ + * │ │ - CIE offset (4 bytes) │ │ + * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start + * │ │ - address range (4 bytes) │ │ (this specific field) + * │ │ CFI Instructions... 
│ │ + * │ └─────────────────────────────────────────────────────┘ │ + * ├─────────────────────────────────────────────────────────────┤ < reference_point + * │ EhFrameHeader │ + * │ (navigation metadata) │ + * └─────────────────────────────────────────────────────────────┘ + * + * The PC offset field in the FDE must contain the distance from itself to code_start: + * + * distance = code_start - fde_pc_field + * + * Where: + * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame + * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8) + * + * Therefore: + * distance = code_start_location - fde_pc_field_location + * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame) + * = -rounded_code_size - fde_offset_in_frame + * = -(round_up(code_size, 8) + fde_offset_in_frame) + * + * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field. + * + */ + if (!absolute_addr) { + int32_t rounded_code_size = + (int32_t)_Py_SIZE_ROUND_UP(ctx->code_size, 8); + int32_t fde_offset_in_frame = (int32_t)(ctx->fde_p - framep); + *(int32_t *)ctx->fde_p = -(rounded_code_size + fde_offset_in_frame); + } +} + +#if defined(__linux__) && defined(__ELF__) +enum { + JIT_NOACTION = 0, + JIT_REGISTER_FN = 1, + JIT_UNREGISTER_FN = 2, +}; + +struct jit_code_entry { + struct jit_code_entry *next; + struct jit_code_entry *prev; + const char *symfile_addr; + uint64_t symfile_size; +}; + +struct jit_descriptor { + uint32_t version; + uint32_t action_flag; + struct jit_code_entry *relevant_entry; + struct jit_code_entry *first_entry; +}; + +static volatile struct jit_descriptor __jit_debug_descriptor = { + 1, JIT_NOACTION, NULL, NULL +}; + +static void __attribute__((noinline)) __jit_debug_register_code(void) +{ + /* Keep this call visible to debuggers and not optimized away. 
*/ + (void)__jit_debug_descriptor.action_flag; +#if defined(__GNUC__) || defined(__clang__) + __asm__ __volatile__("" ::: "memory"); +#endif +} + +static uint16_t +gdb_jit_machine_id(void) +{ + /* Map the current target to ELF e_machine; return 0 to skip registration. */ +#if defined(__x86_64__) || defined(_M_X64) + return EM_X86_64; +#elif defined(__aarch64__) && !defined(__ILP32__) + return EM_AARCH64; +#else + return 0; +#endif +} + +static void +gdb_jit_register_code( + const void *code_addr, + unsigned int code_size, + const char *symname, + const uint8_t *eh_frame, + size_t eh_frame_size +) +{ + /* + * Build a minimal in-memory ELF for GDB's JIT interface and link it into + * __jit_debug_descriptor so debuggers can resolve JIT code. + */ + if (code_addr == NULL || code_size == 0 || symname == NULL) { + return; + } + + const uint16_t machine = gdb_jit_machine_id(); + if (machine == 0) { + return; + } + + enum { + SH_NULL = 0, + SH_TEXT, + SH_EH_FRAME, + SH_SHSTRTAB, + SH_STRTAB, + SH_SYMTAB, + SH_NUM, + }; + static const char shstrtab[] = + "\0.text\0.eh_frame\0.shstrtab\0.strtab\0.symtab"; + _Static_assert(sizeof(shstrtab) == + 1 + sizeof(".text") + sizeof(".eh_frame") + + sizeof(".shstrtab") + sizeof(".strtab") + sizeof(".symtab"), + "shstrtab size mismatch"); + const size_t shstrtab_size = sizeof(shstrtab); + const size_t sh_text = 1; + const size_t sh_eh_frame = sh_text + sizeof(".text"); + const size_t sh_shstrtab = sh_eh_frame + sizeof(".eh_frame"); + const size_t sh_strtab = sh_shstrtab + sizeof(".shstrtab"); + const size_t sh_symtab = sh_strtab + sizeof(".strtab"); + const size_t text_size = code_size; + const size_t text_padded = _Py_SIZE_ROUND_UP(text_size, 8); + const size_t strtab_size = 1 + strlen(symname) + 1; + const size_t symtab_size = 3 * sizeof(Elf64_Sym); + + size_t offset = sizeof(Elf64_Ehdr); + offset = _Py_SIZE_ROUND_UP(offset, 16); + const size_t text_off = offset; + const size_t eh_off = text_off + text_padded; + offset = eh_off + 
eh_frame_size; + const size_t shstr_off = offset; + offset += shstrtab_size; + const size_t str_off = offset; + offset += strtab_size; + offset = _Py_SIZE_ROUND_UP(offset, sizeof(Elf64_Sym)); + const size_t sym_off = offset; + offset += symtab_size; + offset = _Py_SIZE_ROUND_UP(offset, sizeof(Elf64_Shdr)); + const size_t sh_off = offset; + + const size_t shnum = SH_NUM; + const size_t total_size = sh_off + shnum * sizeof(Elf64_Shdr); + uint8_t *buf = (uint8_t *)PyMem_RawMalloc(total_size); + if (buf == NULL) { + return; + } + memset(buf, 0, total_size); + + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)buf; + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELFOSABI_NONE; + ehdr->e_type = ET_DYN; + ehdr->e_machine = machine; + ehdr->e_version = EV_CURRENT; + ehdr->e_entry = 0; + ehdr->e_phoff = 0; + ehdr->e_shoff = sh_off; + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_shentsize = sizeof(Elf64_Shdr); + ehdr->e_shnum = shnum; + ehdr->e_shstrndx = SH_SHSTRTAB; + + memcpy(buf + text_off, code_addr, text_size); + memcpy(buf + eh_off, eh_frame, eh_frame_size); + + char *shstr = (char *)(buf + shstr_off); + memcpy(shstr, shstrtab, shstrtab_size); + + char *strtab = (char *)(buf + str_off); + strtab[0] = '\0'; + memcpy(strtab + 1, symname, strlen(symname)); + strtab[strtab_size - 1] = '\0'; + + Elf64_Sym *syms = (Elf64_Sym *)(buf + sym_off); + memset(syms, 0, symtab_size); + /* Section symbol for .text (local) */ + syms[1].st_info = ELF64_ST_INFO(STB_LOCAL, STT_SECTION); + syms[1].st_shndx = 1; + /* Function symbol */ + syms[2].st_name = 1; + syms[2].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_FUNC); + syms[2].st_other = STV_DEFAULT; + syms[2].st_shndx = 1; + /* For ET_DYN/ET_EXEC, st_value is the absolute virtual address. 
*/ + syms[2].st_value = (Elf64_Addr)(uintptr_t)code_addr; + syms[2].st_size = code_size; + + Elf64_Shdr *shdrs = (Elf64_Shdr *)(buf + sh_off); + memset(shdrs, 0, shnum * sizeof(Elf64_Shdr)); + + shdrs[SH_TEXT].sh_name = sh_text; + shdrs[SH_TEXT].sh_type = SHT_PROGBITS; + shdrs[SH_TEXT].sh_flags = SHF_ALLOC | SHF_EXECINSTR; + shdrs[SH_TEXT].sh_addr = (Elf64_Addr)(uintptr_t)code_addr; + shdrs[SH_TEXT].sh_offset = text_off; + shdrs[SH_TEXT].sh_size = text_size; + shdrs[SH_TEXT].sh_addralign = 16; + + shdrs[SH_EH_FRAME].sh_name = sh_eh_frame; + shdrs[SH_EH_FRAME].sh_type = SHT_PROGBITS; + shdrs[SH_EH_FRAME].sh_flags = SHF_ALLOC; + shdrs[SH_EH_FRAME].sh_addr = + (Elf64_Addr)((uintptr_t)code_addr + text_padded); + shdrs[SH_EH_FRAME].sh_offset = eh_off; + shdrs[SH_EH_FRAME].sh_size = eh_frame_size; + shdrs[SH_EH_FRAME].sh_addralign = 8; + + shdrs[SH_SHSTRTAB].sh_name = sh_shstrtab; + shdrs[SH_SHSTRTAB].sh_type = SHT_STRTAB; + shdrs[SH_SHSTRTAB].sh_offset = shstr_off; + shdrs[SH_SHSTRTAB].sh_size = shstrtab_size; + shdrs[SH_SHSTRTAB].sh_addralign = 1; + + shdrs[SH_STRTAB].sh_name = sh_strtab; + shdrs[SH_STRTAB].sh_type = SHT_STRTAB; + shdrs[SH_STRTAB].sh_offset = str_off; + shdrs[SH_STRTAB].sh_size = strtab_size; + shdrs[SH_STRTAB].sh_addralign = 1; + + shdrs[SH_SYMTAB].sh_name = sh_symtab; + shdrs[SH_SYMTAB].sh_type = SHT_SYMTAB; + shdrs[SH_SYMTAB].sh_offset = sym_off; + shdrs[SH_SYMTAB].sh_size = symtab_size; + shdrs[SH_SYMTAB].sh_link = SH_STRTAB; + shdrs[SH_SYMTAB].sh_info = 2; + shdrs[SH_SYMTAB].sh_addralign = 8; + shdrs[SH_SYMTAB].sh_entsize = sizeof(Elf64_Sym); + + struct jit_code_entry *entry = PyMem_RawMalloc(sizeof(*entry)); + if (entry == NULL) { + PyMem_RawFree(buf); + return; + } + entry->symfile_addr = (const char *)buf; + entry->symfile_size = total_size; + entry->prev = NULL; + entry->next = __jit_debug_descriptor.first_entry; + if (entry->next != NULL) { + entry->next->prev = entry; + } + __jit_debug_descriptor.first_entry = entry; + 
__jit_debug_descriptor.relevant_entry = entry; + __jit_debug_descriptor.action_flag = JIT_REGISTER_FN; + __jit_debug_register_code(); + __jit_debug_descriptor.action_flag = JIT_NOACTION; + +} +#endif // __linux__ && __ELF__ + +void +_PyJitUnwind_GdbRegisterCode(const void *code_addr, + unsigned int code_size, + const char *entry, + const char *filename) +{ +#if defined(__linux__) && defined(__ELF__) + /* GDB expects a stable symbol name and absolute addresses in .eh_frame. */ + if (entry == NULL) { + entry = ""; + } + if (filename == NULL) { + filename = ""; + } + size_t name_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; + char *name = (char *)PyMem_RawMalloc(name_size); + if (name == NULL) { + return; + } + snprintf(name, name_size, "py::%s:%s", entry, filename); + + uint8_t buffer[1024]; + size_t eh_frame_size = _PyJitUnwind_BuildEhFrame( + buffer, sizeof(buffer), code_addr, code_size, 1); + if (eh_frame_size == 0) { + PyMem_RawFree(name); + return; + } + + gdb_jit_register_code(code_addr, code_size, name, + buffer, eh_frame_size); + PyMem_RawFree(name); +#else + (void)code_addr; + (void)code_size; + (void)entry; + (void)filename; +#endif +} + +#endif // PY_HAVE_PERF_TRAMPOLINE diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c index 0ba856ea610e59..0e2b3f42f5f65b 100644 --- a/Python/perf_jit_trampoline.c +++ b/Python/perf_jit_trampoline.c @@ -62,6 +62,7 @@ #include "pycore_frame.h" #include "pycore_interp.h" #include "pycore_mmap.h" // _PyAnnotateMemoryMap() +#include "pycore_jit_unwind.h" #include "pycore_runtime.h" // _PyRuntime #ifdef PY_HAVE_PERF_TRAMPOLINE @@ -73,6 +74,7 @@ #include // File control operations #include // Standard I/O operations #include // Standard library functions +#include // memcpy, strlen #include // Memory mapping functions (mmap) #include // System data types #include // System calls (sysconf, getpid) @@ -246,6 +248,31 @@ typedef struct { */ } CodeUnwindingInfoEvent; +/* + * EH Frame Header 
structure for DWARF unwinding + * + * This header provides metadata about the .eh_frame data that follows. + * It uses PC-relative and data-relative encodings to keep the synthesized + * DSO self-contained when perf injects it. + */ +typedef struct __attribute__((packed)) { + uint8_t version; + uint8_t eh_frame_ptr_enc; + uint8_t fde_count_enc; + uint8_t table_enc; + int32_t eh_frame_ptr; + uint32_t eh_fde_count; + int32_t from; + int32_t to; +} EhFrameHeader; +_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch"); + +/* DWARF encoding constants used in EH frame headers */ +static const uint8_t DwarfUData4 = 0x03; +static const uint8_t DwarfSData4 = 0x0b; +static const uint8_t DwarfPcRel = 0x10; +static const uint8_t DwarfDataRel = 0x30; + // ============================================================================= // GLOBAL STATE MANAGEMENT // ============================================================================= @@ -262,7 +289,8 @@ typedef struct { PyThread_type_lock map_lock; // Thread synchronization lock void* mapped_buffer; // Memory-mapped region (signals perf we're active) size_t mapped_size; // Size of the mapped region - int code_id; // Counter for unique code region identifiers + uint32_t code_id; // Counter for unique code region identifiers + uint64_t build_id_salt; // Per-process salt for unique synthetic DSOs } PerfMapJitState; /* Global singleton instance */ @@ -316,40 +344,6 @@ static int64_t get_current_time_microseconds(void) { return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; } -// ============================================================================= -// UTILITY FUNCTIONS -// ============================================================================= - -/* - * Round up a value to the next multiple of a given number - * - * This is essential for maintaining proper alignment requirements in the - * jitdump format. 
Many structures need to be aligned to specific boundaries - * (typically 8 or 16 bytes) for efficient processing by perf. - * - * Args: - * value: The value to round up - * multiple: The multiple to round up to - * - * Returns: The smallest value >= input that is a multiple of 'multiple' - */ -static size_t round_up(int64_t value, int64_t multiple) { - if (multiple == 0) { - return value; // Avoid division by zero - } - - int64_t remainder = value % multiple; - if (remainder == 0) { - return value; // Already aligned - } - - /* Calculate how much to add to reach the next multiple */ - int64_t difference = multiple - remainder; - int64_t rounded_up_value = value + difference; - - return rounded_up_value; -} - // ============================================================================= // FILE I/O UTILITIES // ============================================================================= @@ -406,623 +400,6 @@ static void perf_map_jit_write_header(int pid, FILE* out_file) { perf_map_jit_write_fully(&header, sizeof(header)); } -// ============================================================================= -// DWARF CONSTANTS AND UTILITIES -// ============================================================================= - -/* - * DWARF (Debug With Arbitrary Record Formats) constants - * - * DWARF is a debugging data format used to provide stack unwinding information. - * These constants define the various encoding types and opcodes used in - * DWARF Call Frame Information (CFI) records. 
- */ - -/* DWARF Call Frame Information version */ -#define DWRF_CIE_VERSION 1 - -/* DWARF CFA (Call Frame Address) opcodes */ -enum { - DWRF_CFA_nop = 0x0, // No operation - DWRF_CFA_offset_extended = 0x5, // Extended offset instruction - DWRF_CFA_def_cfa = 0xc, // Define CFA rule - DWRF_CFA_def_cfa_register = 0xd, // Define CFA register - DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset - DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset - DWRF_CFA_advance_loc = 0x40, // Advance location counter - DWRF_CFA_offset = 0x80, // Simple offset instruction - DWRF_CFA_restore = 0xc0 // Restore register -}; - -/* DWARF Exception Handling pointer encodings */ -enum { - DWRF_EH_PE_absptr = 0x00, // Absolute pointer - DWRF_EH_PE_omit = 0xff, // Omitted value - - /* Data type encodings */ - DWRF_EH_PE_uleb128 = 0x01, // Unsigned LEB128 - DWRF_EH_PE_udata2 = 0x02, // Unsigned 2-byte - DWRF_EH_PE_udata4 = 0x03, // Unsigned 4-byte - DWRF_EH_PE_udata8 = 0x04, // Unsigned 8-byte - DWRF_EH_PE_sleb128 = 0x09, // Signed LEB128 - DWRF_EH_PE_sdata2 = 0x0a, // Signed 2-byte - DWRF_EH_PE_sdata4 = 0x0b, // Signed 4-byte - DWRF_EH_PE_sdata8 = 0x0c, // Signed 8-byte - DWRF_EH_PE_signed = 0x08, // Signed flag - - /* Reference type encodings */ - DWRF_EH_PE_pcrel = 0x10, // PC-relative - DWRF_EH_PE_textrel = 0x20, // Text-relative - DWRF_EH_PE_datarel = 0x30, // Data-relative - DWRF_EH_PE_funcrel = 0x40, // Function-relative - DWRF_EH_PE_aligned = 0x50, // Aligned - DWRF_EH_PE_indirect = 0x80 // Indirect -}; - -/* Additional DWARF constants for debug information */ -enum { DWRF_TAG_compile_unit = 0x11 }; -enum { DWRF_children_no = 0, DWRF_children_yes = 1 }; -enum { - DWRF_AT_name = 0x03, // Name attribute - DWRF_AT_stmt_list = 0x10, // Statement list - DWRF_AT_low_pc = 0x11, // Low PC address - DWRF_AT_high_pc = 0x12 // High PC address -}; -enum { - DWRF_FORM_addr = 0x01, // Address form - DWRF_FORM_data4 = 0x06, // 4-byte data - DWRF_FORM_string = 0x08 // String form -}; - 
-/* Line number program opcodes */ -enum { - DWRF_LNS_extended_op = 0, // Extended opcode - DWRF_LNS_copy = 1, // Copy operation - DWRF_LNS_advance_pc = 2, // Advance program counter - DWRF_LNS_advance_line = 3 // Advance line number -}; - -/* Line number extended opcodes */ -enum { - DWRF_LNE_end_sequence = 1, // End of sequence - DWRF_LNE_set_address = 2 // Set address -}; - -/* - * Architecture-specific DWARF register numbers - * - * These constants define the register numbering scheme used by DWARF - * for each supported architecture. The numbers must match the ABI - * specification for proper stack unwinding. - */ -enum { -#ifdef __x86_64__ - /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ - DWRF_REG_AX, // RAX - DWRF_REG_DX, // RDX - DWRF_REG_CX, // RCX - DWRF_REG_BX, // RBX - DWRF_REG_SI, // RSI - DWRF_REG_DI, // RDI - DWRF_REG_BP, // RBP - DWRF_REG_SP, // RSP - DWRF_REG_8, // R8 - DWRF_REG_9, // R9 - DWRF_REG_10, // R10 - DWRF_REG_11, // R11 - DWRF_REG_12, // R12 - DWRF_REG_13, // R13 - DWRF_REG_14, // R14 - DWRF_REG_15, // R15 - DWRF_REG_RA, // Return address (RIP) -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 register numbering */ - DWRF_REG_FP = 29, // Frame Pointer - DWRF_REG_RA = 30, // Link register (return address) - DWRF_REG_SP = 31, // Stack pointer -#else -# error "Unsupported target architecture" -#endif -}; - -/* DWARF encoding constants used in EH frame headers */ -static const uint8_t DwarfUData4 = 0x03; // Unsigned 4-byte data -static const uint8_t DwarfSData4 = 0x0b; // Signed 4-byte data -static const uint8_t DwarfPcRel = 0x10; // PC-relative encoding -static const uint8_t DwarfDataRel = 0x30; // Data-relative encoding - -// ============================================================================= -// ELF OBJECT CONTEXT -// ============================================================================= - -/* - * Context for building ELF/DWARF structures - * - * This 
structure maintains state while constructing DWARF unwind information. - * It acts as a simple buffer manager with pointers to track current position - * and important landmarks within the buffer. - */ -typedef struct ELFObjectContext { - uint8_t* p; // Current write position in buffer - uint8_t* startp; // Start of buffer (for offset calculations) - uint8_t* eh_frame_p; // Start of EH frame data (for relative offsets) - uint8_t* fde_p; // Start of FDE data (for PC-relative calculations) - uint32_t code_size; // Size of the code being described -} ELFObjectContext; - -/* - * EH Frame Header structure for DWARF unwinding - * - * This structure provides metadata about the DWARF unwinding information - * that follows. It's required by the perf jitdump format to enable proper - * stack unwinding during profiling. - */ -typedef struct { - unsigned char version; // EH frame version (always 1) - unsigned char eh_frame_ptr_enc; // Encoding of EH frame pointer - unsigned char fde_count_enc; // Encoding of FDE count - unsigned char table_enc; // Encoding of table entries - int32_t eh_frame_ptr; // Pointer to EH frame data - int32_t eh_fde_count; // Number of FDEs (Frame Description Entries) - int32_t from; // Start address of code range - int32_t to; // End address of code range -} EhFrameHeader; - -// ============================================================================= -// DWARF GENERATION UTILITIES -// ============================================================================= - -/* - * Append a null-terminated string to the ELF context buffer - * - * Args: - * ctx: ELF object context - * str: String to append (must be null-terminated) - * - * Returns: Offset from start of buffer where string was written - */ -static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { - uint8_t* p = ctx->p; - uint32_t ofs = (uint32_t)(p - ctx->startp); - - /* Copy string including null terminator */ - do { - *p++ = (uint8_t)*str; - } while (*str++); - - 
ctx->p = p; - return ofs; -} - -/* - * Append a SLEB128 (Signed Little Endian Base 128) value - * - * SLEB128 is a variable-length encoding used extensively in DWARF. - * It efficiently encodes small numbers in fewer bytes. - * - * Args: - * ctx: ELF object context - * v: Signed value to encode - */ -static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { - uint8_t* p = ctx->p; - - /* Encode 7 bits at a time, with continuation bit in MSB */ - for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { - *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit - } - *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit - - ctx->p = p; -} - -/* - * Append a ULEB128 (Unsigned Little Endian Base 128) value - * - * Similar to SLEB128 but for unsigned values. - * - * Args: - * ctx: ELF object context - * v: Unsigned value to encode - */ -static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { - uint8_t* p = ctx->p; - - /* Encode 7 bits at a time, with continuation bit in MSB */ - for (; v >= 0x80; v >>= 7) { - *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit - } - *p++ = (char)v; // Final byte without continuation bit - - ctx->p = p; -} - -/* - * Macros for generating DWARF structures - * - * These macros provide a convenient way to write various data types - * to the DWARF buffer while automatically advancing the pointer. 
- */ -#define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit -#define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit -#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit -#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit -#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address -#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 -#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 -#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string - -/* Align to specified boundary with NOP instructions */ -#define DWRF_ALIGNNOP(s) \ - while ((uintptr_t)p & ((s)-1)) { \ - *p++ = DWRF_CFA_nop; \ - } - -/* Write a DWARF section with automatic size calculation */ -#define DWRF_SECTION(name, stmt) \ - { \ - uint32_t* szp_##name = (uint32_t*)p; \ - p += 4; \ - stmt; \ - *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ - } - -// ============================================================================= -// DWARF EH FRAME GENERATION -// ============================================================================= - -static void elf_init_ehframe(ELFObjectContext* ctx); - -/* - * Initialize DWARF .eh_frame section for a code region - * - * The .eh_frame section contains Call Frame Information (CFI) that describes - * how to unwind the stack at any point in the code. This is essential for - * proper profiling as it allows perf to generate accurate call graphs. - * - * The function generates two main components: - * 1. CIE (Common Information Entry) - describes calling conventions - * 2. 
FDE (Frame Description Entry) - describes specific function unwinding - * - * Args: - * ctx: ELF object context containing code size and buffer pointers - */ -static size_t calculate_eh_frame_size(void) { - /* Calculate the EH frame size for the trampoline function */ - extern void *_Py_trampoline_func_start; - extern void *_Py_trampoline_func_end; - - size_t code_size = (char*)&_Py_trampoline_func_end - (char*)&_Py_trampoline_func_start; - - ELFObjectContext ctx; - char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) - ctx.code_size = code_size; - ctx.startp = ctx.p = (uint8_t*)buffer; - ctx.fde_p = NULL; - - elf_init_ehframe(&ctx); - return ctx.p - ctx.startp; -} - -static void elf_init_ehframe(ELFObjectContext* ctx) { - uint8_t* p = ctx->p; - uint8_t* framep = p; // Remember start of frame data - - /* - * DWARF Unwind Table for Trampoline Function - * - * This section defines DWARF Call Frame Information (CFI) using encoded macros - * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function - * preserves and restores registers. This is used by profiling tools (e.g., `perf`) - * and debuggers for stack unwinding in JIT-compiled code. - * - * ------------------------------------------------- - * TO REGENERATE THIS TABLE FROM GCC OBJECTS: - * ------------------------------------------------- - * - * 1. Create a trampoline source file (e.g., `trampoline.c`): - * - * #include - * typedef PyObject* (*py_evaluator)(void*, void*, int); - * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { - * return evaluator(ts, f, throwflag); - * } - * - * 2. Compile to an object file with frame pointer preservation: - * - * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c - * - * 3. 
Extract DWARF unwind info from the object file: - * - * readelf -w trampoline.o - * - * Example output from `.eh_frame`: - * - * 00000000 CIE - * Version: 1 - * Augmentation: "zR" - * Code alignment factor: 4 - * Data alignment factor: -8 - * Return address column: 30 - * DW_CFA_def_cfa: r31 (sp) ofs 0 - * - * 00000014 FDE cie=00000000 pc=0..14 - * DW_CFA_advance_loc: 4 - * DW_CFA_def_cfa_offset: 16 - * DW_CFA_offset: r29 at cfa-16 - * DW_CFA_offset: r30 at cfa-8 - * DW_CFA_advance_loc: 12 - * DW_CFA_restore: r30 - * DW_CFA_restore: r29 - * DW_CFA_def_cfa_offset: 0 - * - * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. - * - * ---------------------------------- - * HOW TO TRANSLATE TO DWRF_* MACROS: - * ---------------------------------- - * - * After compiling your trampoline with: - * - * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c - * - * run: - * - * readelf -w trampoline.o - * - * to inspect the generated `.eh_frame` data. You will see two main components: - * - * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. - * 2. An FDE (Frame Description Entry): function-specific unwind instructions. 
- * - * --------------------- - * Translating the CIE: - * --------------------- - * From `readelf -w`, you might see: - * - * 00000000 0000000000000010 00000000 CIE - * Version: 1 - * Augmentation: "zR" - * Code alignment factor: 4 - * Data alignment factor: -8 - * Return address column: 30 - * Augmentation data: 1b - * DW_CFA_def_cfa: r31 (sp) ofs 0 - * - * Map this to: - * - * DWRF_SECTION(CIE, - * DWRF_U32(0); // CIE ID (always 0 for CIEs) - * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 - * DWRF_STR("zR"); // Augmentation string "zR" - * DWRF_UV(4); // Code alignment factor = 4 - * DWRF_SV(-8); // Data alignment factor = -8 - * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) - * DWRF_UV(1); // Augmentation data length = 1 - * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers - * - * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa - * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) - * DWRF_UV(0); // Offset = 0 - * - * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary - * ) - * - * Notes: - * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. - * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. 
- * - * --------------------- - * Translating the FDE: - * --------------------- - * From `readelf -w`: - * - * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 - * DW_CFA_advance_loc: 4 - * DW_CFA_def_cfa_offset: 16 - * DW_CFA_offset: r29 at cfa-16 - * DW_CFA_offset: r30 at cfa-8 - * DW_CFA_advance_loc: 12 - * DW_CFA_restore: r30 - * DW_CFA_restore: r29 - * DW_CFA_def_cfa_offset: 0 - * - * Map the FDE header and instructions to: - * - * DWRF_SECTION(FDE, - * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) - * DWRF_U32(pc_relative_offset); // PC-relative location of the code (calculated dynamically) - * DWRF_U32(ctx->code_size); // Code range covered by this FDE - * DWRF_U8(0); // Augmentation data length (none) - * - * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) - * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 - * DWRF_UV(16); - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) - * DWRF_UV(2); // At offset 2 * 8 = 16 bytes - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) - * DWRF_UV(1); // At offset 1 * 8 = 8 bytes - * - * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 - * - * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP - * DWRF_UV(0); - * ) - * - * To regenerate: - * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. - * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as - * the code is in a different address space every time. - * 3. 
For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: - * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) - * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) - * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset - * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) - * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. - * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. - */ - - /* - * Emit DWARF EH CIE (Common Information Entry) - * - * The CIE describes the calling conventions and basic unwinding rules - * that apply to all functions in this compilation unit. - */ - DWRF_SECTION(CIE, - DWRF_U32(0); // CIE ID (0 indicates this is a CIE) - DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) - DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) -#ifdef __x86_64__ - DWRF_UV(1); // Code alignment factor (x86_64: 1 byte) -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - DWRF_UV(4); // Code alignment factor (AArch64: 4 bytes per instruction) -#endif - DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) - DWRF_U8(DWRF_REG_RA); // Return address register number - DWRF_UV(1); // Augmentation data length - DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding - - /* Initial CFI instructions - describe default calling convention */ -#ifdef __x86_64__ - /* x86_64 initial CFI state */ - DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) - DWRF_UV(DWRF_REG_SP); // CFA = SP register - DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size - DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved - DWRF_UV(1); // At offset 1 from CFA -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 initial CFI state */ - DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame 
Address) - DWRF_UV(DWRF_REG_SP); // CFA = SP register - DWRF_UV(0); // CFA = SP + 0 (AArch64 starts with offset 0) - // No initial register saves in AArch64 CIE -#endif - DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary - ) - - ctx->eh_frame_p = p; // Remember start of FDE data - - /* - * Emit DWARF EH FDE (Frame Description Entry) - * - * The FDE describes unwinding information specific to this function. - * It references the CIE and provides function-specific CFI instructions. - * - * The PC-relative offset is calculated after the entire EH frame is built - * to ensure accurate positioning relative to the synthesized DSO layout. - */ - DWRF_SECTION(FDE, - DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) - ctx->fde_p = p; // Remember where PC offset field is located for later calculation - DWRF_U32(0); // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe) - DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length) - DWRF_U8(0); // Augmentation data length (none) - - /* - * Architecture-specific CFI instructions - * - * These instructions describe how registers are saved and restored - * during function calls. Each architecture has different calling - * conventions and register usage patterns. 
- */ -#ifdef __x86_64__ - /* x86_64 calling convention unwinding rules with frame pointer */ -# if defined(__CET__) && (__CET__ & 1) - DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance past endbr64 (4 bytes) -# endif - DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance past push %rbp (1 byte) - DWRF_U8(DWRF_CFA_def_cfa_offset); // def_cfa_offset 16 - DWRF_UV(16); // New offset: SP + 16 - DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16 - DWRF_UV(2); // Offset factor: 2 * 8 = 16 bytes - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past mov %rsp,%rbp (3 bytes) - DWRF_U8(DWRF_CFA_def_cfa_register); // def_cfa_register r6 - DWRF_UV(DWRF_REG_BP); // Use base pointer register - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3 - DWRF_U8(DWRF_CFA_def_cfa); // def_cfa r7 ofs 8 - DWRF_UV(DWRF_REG_SP); // Use stack pointer register - DWRF_UV(8); // New offset: SP + 8 -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 calling convention unwinding rules */ - DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance by 1 instruction (4 bytes) - DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 - DWRF_UV(16); // Stack pointer moved by 16 bytes - DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // x29 (frame pointer) saved - DWRF_UV(2); // At CFA-16 (2 * 8 = 16 bytes from CFA) - DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // x30 (link register) saved - DWRF_UV(1); // At CFA-8 (1 * 8 = 8 bytes from CFA) - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (12 bytes) - DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 - NO DWRF_UV() after this! - DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 - NO DWRF_UV() after this! 
- DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 0 (stack restored) - DWRF_UV(0); // Back to original stack position -#else -# error "Unsupported target architecture" -#endif - - DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary - ) - - ctx->p = p; // Update context pointer to end of generated data - - /* Calculate and update the PC-relative offset in the FDE - * - * When perf processes the jitdump, it creates a synthesized DSO with this layout: - * - * Synthesized DSO Memory Layout: - * ┌─────────────────────────────────────────────────────────────┐ < code_start - * │ Code Section │ - * │ (round_up(code_size, 8) bytes) │ - * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data - * │ EH Frame Data │ - * │ ┌─────────────────────────────────────────────────────┐ │ - * │ │ CIE data │ │ - * │ └─────────────────────────────────────────────────────┘ │ - * │ ┌─────────────────────────────────────────────────────┐ │ - * │ │ FDE Header: │ │ - * │ │ - CIE offset (4 bytes) │ │ - * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start - * │ │ - address range (4 bytes) │ │ (this specific field) - * │ │ CFI Instructions... 
│ │ - * │ └─────────────────────────────────────────────────────┘ │ - * ├─────────────────────────────────────────────────────────────┤ < reference_point - * │ EhFrameHeader │ - * │ (navigation metadata) │ - * └─────────────────────────────────────────────────────────────┘ - * - * The PC offset field in the FDE must contain the distance from itself to code_start: - * - * distance = code_start - fde_pc_field - * - * Where: - * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame - * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8) - * - * Therefore: - * distance = code_start_location - fde_pc_field_location - * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame) - * = -rounded_code_size - fde_offset_in_frame - * = -(round_up(code_size, 8) + fde_offset_in_frame) - * - * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field, - * - */ - if (ctx->fde_p != NULL) { - int32_t fde_offset_in_frame = (ctx->fde_p - ctx->startp); - int32_t rounded_code_size = round_up(ctx->code_size, 8); - int32_t pc_relative_offset = -(rounded_code_size + fde_offset_in_frame); - - - // Update the PC-relative offset in the FDE - *(int32_t*)ctx->fde_p = pc_relative_offset; - } -} - // ============================================================================= // JITDUMP INITIALIZATION // ============================================================================= @@ -1128,11 +505,13 @@ static void* perf_map_jit_init(void) { /* Initialize code ID counter */ perf_jit_map_state.code_id = 0; + perf_jit_map_state.build_id_salt = + ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks(); /* Calculate padding size based on actual unwind info requirements */ - size_t eh_frame_size = calculate_eh_frame_size(); + size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0); size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; - trampoline_api.code_padding = 
round_up(unwind_data_size, 16); + trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16); trampoline_api.code_alignment = 32; return &perf_jit_map_state; @@ -1143,30 +522,19 @@ static void* perf_map_jit_init(void) { // ============================================================================= /* - * Write a complete jitdump entry for a Python function - * - * This is the main function called by Python's trampoline system whenever - * a new piece of JIT-compiled code needs to be recorded. It writes both - * the unwinding information and the code load event to the jitdump file. - * - * The function performs these steps: - * 1. Initialize jitdump system if not already done - * 2. Extract function name and filename from Python code object - * 3. Generate DWARF unwinding information - * 4. Write unwinding info event to jitdump file - * 5. Write code load event to jitdump file - * - * Args: - * state: Jitdump state (currently unused, uses global state) - * code_addr: Address where the compiled code resides - * code_size: Size of the compiled code in bytes - * co: Python code object containing metadata + * Write a complete jitdump entry for a code region with a provided name. * - * IMPORTANT: This function signature is part of Python's internal API - * and must not be changed without coordinating with core Python development. + * This shares the same implementation as the trampoline callback, but + * allows callers that don't have a PyCodeObject to reuse the jitdump + * infrastructure. 
*/ -static void perf_map_jit_write_entry(void *state, const void *code_addr, - unsigned int code_size, PyCodeObject *co) +static void perf_map_jit_write_entry_with_name( + void *state, + const void *code_addr, + unsigned int code_size, + const char *entry, + const char *filename +) { /* Initialize jitdump system on first use */ if (perf_jit_map_state.perf_map == NULL) { @@ -1176,21 +544,11 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, } } - /* - * Extract function information from Python code object - * - * We create a human-readable function name by combining the qualified - * name (includes class/module context) with the filename. This helps - * developers identify functions in perf reports. - */ - const char *entry = ""; - if (co->co_qualname != NULL) { - entry = PyUnicode_AsUTF8(co->co_qualname); + if (entry == NULL) { + entry = ""; } - - const char *filename = ""; - if (co->co_filename != NULL) { - filename = PyUnicode_AsUTF8(co->co_filename); + if (filename == NULL) { + filename = ""; } /* @@ -1218,16 +576,13 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, * Without it, perf cannot generate accurate call graphs, especially * in optimized code where frame pointers may be omitted. 
*/ - ELFObjectContext ctx; - char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) - ctx.code_size = code_size; - ctx.startp = ctx.p = (uint8_t*)buffer; - ctx.fde_p = NULL; // Initialize to NULL, will be set when FDE is written - - /* Generate EH frame (Exception Handling frame) data */ - elf_init_ehframe(&ctx); - int eh_frame_size = ctx.p - ctx.startp; - + uint8_t buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) + size_t eh_frame_size = _PyJitUnwind_BuildEhFrame( + buffer, sizeof(buffer), code_addr, code_size, 0); + if (eh_frame_size == 0) { + PyMem_RawFree(perf_map_entry); + return; + } /* * Write Code Unwinding Information Event * @@ -1244,12 +599,12 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding); ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); - ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment + ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16); // 16-byte alignment /* Calculate total event size with padding */ - int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; - int padding_size = round_up(content_size, 8) - content_size; // 8-byte align - ev2.base.size = content_size + padding_size; + int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size); + int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size; // 8-byte align + ev2.base.size = (uint32_t)(content_size + padding_size); /* Write the unwinding info event header */ perf_map_jit_write_fully(&ev2, sizeof(ev2)); @@ -1268,15 +623,16 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte /* Calculate relative offsets for EH frame navigation */ - f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); + f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char)); 
f.eh_fde_count = 1; // We generate exactly one FDE per function - f.from = -(round_up(code_size, 8) + eh_frame_size); - - int cie_size = ctx.eh_frame_p - ctx.startp; - f.to = -(eh_frame_size - cie_size); + f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size); + uint32_t cie_payload_size; + memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size)); + int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size); + f.to = -(int32_t)(eh_frame_size - cie_size); /* Write EH frame data and header */ - perf_map_jit_write_fully(ctx.startp, eh_frame_size); + perf_map_jit_write_fully(buffer, eh_frame_size); perf_map_jit_write_fully(&f, sizeof(f)); /* Write padding to maintain alignment */ @@ -1313,12 +669,85 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, /* Write code load event and associated data */ perf_map_jit_write_fully(&ev, sizeof(ev)); perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator - perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code + /* + * Ensure each synthetic DSO has unique .text bytes. + * + * perf merges DSOs that share a build-id. Since trampolines can share + * identical code and unwind bytes, perf may resolve all JIT frames to + * the first symbol it saw (including entries from previous runs when + * build-id caching is enabled). Patch a small marker in the emitted + * bytes to make the build-id depend on a per-process salt and code id + * without modifying the live code. 
+ */ + uint64_t marker = perf_jit_map_state.build_id_salt ^ + ((uint64_t)perf_jit_map_state.code_id << 32) ^ + (uint64_t)code_size; + if (size >= sizeof(marker)) { + size_t prefix = size - sizeof(marker); + perf_map_jit_write_fully((void *)(base), prefix); + perf_map_jit_write_fully(&marker, sizeof(marker)); + } + else if (size > 0) { + uint8_t tmp[sizeof(marker)]; + memcpy(tmp, (void *)(base), size); + for (size_t i = 0; i < size; i++) { + tmp[i] ^= (uint8_t)(marker >> (i * 8)); + } + perf_map_jit_write_fully(tmp, size); + } /* Clean up allocated memory */ PyMem_RawFree(perf_map_entry); } +/* + * Write a complete jitdump entry for a Python function + * + * This is the main function called by Python's trampoline system whenever + * a new piece of JIT-compiled code needs to be recorded. It writes both + * the unwinding information and the code load event to the jitdump file. + * + * The function performs these steps: + * 1. Initialize jitdump system if not already done + * 2. Extract function name and filename from Python code object + * 3. Generate DWARF unwinding information + * 4. Write unwinding info event to jitdump file + * 5. Write code load event to jitdump file + * + * Args: + * state: Jitdump state (currently unused, uses global state) + * code_addr: Address where the compiled code resides + * code_size: Size of the compiled code in bytes + * co: Python code object containing metadata + * + * IMPORTANT: This function signature is part of Python's internal API + * and must not be changed without coordinating with core Python development. 
+ */ +static void perf_map_jit_write_entry(void *state, const void *code_addr, + unsigned int code_size, PyCodeObject *co) +{ + const char *entry = ""; + const char *filename = ""; + if (co != NULL) { + if (co->co_qualname != NULL) { + entry = PyUnicode_AsUTF8(co->co_qualname); + } + if (co->co_filename != NULL) { + filename = PyUnicode_AsUTF8(co->co_filename); + } + } + perf_map_jit_write_entry_with_name(state, code_addr, code_size, + entry, filename); +} + +void +_PyPerfJit_WriteNamedCode(const void *code_addr, unsigned int code_size, + const char *entry, const char *filename) +{ + perf_map_jit_write_entry_with_name( + NULL, code_addr, code_size, entry, filename); +} + // ============================================================================= // CLEANUP AND FINALIZATION // ============================================================================= diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py index a251a045b91144..d12d303b722720 100644 --- a/Tools/c-analyzer/cpython/_parser.py +++ b/Tools/c-analyzer/cpython/_parser.py @@ -326,6 +326,7 @@ def format_tsv_lines(lines): _abs('Python/compile.c'): (20_000, 500), _abs('Python/optimizer.c'): (100_000, 5_000), _abs('Python/parking_lot.c'): (40_000, 1000), + _abs('Python/perf_jit_trampoline.c'): (40_000, 1000), _abs('Python/pylifecycle.c'): (750_000, 5000), _abs('Python/pystate.c'): (750_000, 5000), _abs('Python/initconfig.c'): (50_000, 500), diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index cbec0bf262f0e0..e7ed2a76b5176e 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -386,6 +386,7 @@ Python/intrinsics.c - _PyIntrinsics_UnaryFunctions - Python/intrinsics.c - _PyIntrinsics_BinaryFunctions - Python/lock.c - TIME_TO_BE_FAIR_NS - Python/opcode_targets.h - opcode_targets - +Python/jit_unwind.c - __jit_debug_descriptor - Python/perf_trampoline.c - _Py_perfmap_callbacks - 
Python/perf_jit_trampoline.c - _Py_perfmap_jit_callbacks - Python/perf_jit_trampoline.c - perf_jit_map_state -