Skip to content

Commit 0dd2c49

Browse files
committed
JIT compile a trampoline function to avoid shim code
1 parent 719e5c3 commit 0dd2c49

File tree

12 files changed

+137
-69
lines changed

12 files changed

+137
-69
lines changed

Include/internal/pycore_ceval.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,15 @@ _PyEval_EvalFrame(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwfl
123123
return tstate->interp->eval_frame(tstate, frame, throwflag);
124124
}
125125

126+
#if defined(_Py_TIER2) && !defined(_Py_JIT)
127+
_Py_CODEUNIT *_PyTier2Interpreter(
128+
struct _PyExecutorObject *current_executor, _PyInterpreterFrame *frame,
129+
_PyStackRef *stack_pointer, PyThreadState *tstate
130+
);
131+
#endif
132+
133+
extern _PyJitEntryFuncPtr _Py_jit_entry;
134+
126135
extern PyObject*
127136
_PyEval_Vector(PyThreadState *tstate,
128137
PyFunctionObject *func, PyObject *locals,

Include/internal/pycore_interp_structs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,7 @@ struct _Py_unique_id_pool {
765765

766766
#endif
767767

768+
/* Signature of the function used to enter tier 2 code (this is the type of
 * _Py_jit_entry). It receives the executor together with the current frame,
 * stack pointer and thread state, and returns a code unit pointer. */
typedef _Py_CODEUNIT *(*_PyJitEntryFuncPtr)(struct _PyExecutorObject *exec, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate);
768769

769770
/* PyInterpreterState holds the global state for one of the runtime's
770771
interpreters. Typically the initial (main) interpreter is the only one.

Python/bytecodes.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2971,7 +2971,7 @@ dummy_func(
29712971
assert(tstate->current_executor == NULL);
29722972
assert(executor != tstate->interp->cold_executor);
29732973
tstate->jit_exit = NULL;
2974-
GOTO_TIER_TWO(executor);
2974+
TIER1_TO_TIER2(executor);
29752975
}
29762976
}
29772977
else {
@@ -3037,7 +3037,7 @@ dummy_func(
30373037
}
30383038
assert(executor != tstate->interp->cold_executor);
30393039
tstate->jit_exit = NULL;
3040-
GOTO_TIER_TWO(executor);
3040+
TIER1_TO_TIER2(executor);
30413041
#else
30423042
Py_FatalError("ENTER_EXECUTOR is not supported in this build");
30433043
#endif /* _Py_TIER2 */
@@ -5257,7 +5257,7 @@ dummy_func(
52575257
}
52585258
#endif
52595259
tstate->jit_exit = exit;
5260-
GOTO_TIER_TWO(exit->executor);
5260+
TIER2_TO_TIER2(exit->executor);
52615261
}
52625262

52635263
tier2 op(_CHECK_VALIDITY, (--)) {
@@ -5353,7 +5353,7 @@ dummy_func(
53535353

53545354
tier2 op(_START_EXECUTOR, (executor/4 --)) {
53555355
#ifndef _Py_JIT
5356-
current_executor = (_PyExecutorObject*)executor;
5356+
assert(current_executor == (_PyExecutorObject*)executor);
53575357
#endif
53585358
assert(tstate->jit_exit == NULL || tstate->jit_exit->executor == current_executor);
53595359
tstate->current_executor = (PyObject *)executor;
@@ -5434,7 +5434,7 @@ dummy_func(
54345434
}
54355435
assert(tstate->jit_exit == exit);
54365436
exit->executor = executor;
5437-
GOTO_TIER_TWO(exit->executor);
5437+
TIER2_TO_TIER2(exit->executor);
54385438
}
54395439

54405440
label(pop_2_error) {

Python/ceval.c

Lines changed: 35 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,8 @@ maybe_lltrace_resume_frame(_PyInterpreterFrame *frame, PyObject *globals)
275275
}
276276
int r = PyDict_Contains(globals, &_Py_ID(__lltrace__));
277277
if (r < 0) {
278-
return -1;
278+
PyErr_Clear();
279+
return 0;
279280
}
280281
int lltrace = r * 5; // Levels 1-4 only trace uops
281282
if (!lltrace) {
@@ -1109,11 +1110,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
11091110
#endif
11101111
}
11111112

1112-
#if defined(_Py_TIER2) && !defined(_Py_JIT)
1113-
/* Tier 2 interpreter state */
1114-
_PyExecutorObject *current_executor = NULL;
1115-
const _PyUOpInstruction *next_uop = NULL;
1116-
#endif
11171113
#if Py_TAIL_CALL_INTERP
11181114
# if Py_STATS
11191115
return _TAIL_CALL_start_frame(frame, NULL, tstate, NULL, 0, lastopcode);
@@ -1126,14 +1122,41 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
11261122
#endif
11271123

11281124

1125+
early_exit:
1126+
assert(_PyErr_Occurred(tstate));
1127+
_Py_LeaveRecursiveCallPy(tstate);
1128+
assert(frame->owner != FRAME_OWNED_BY_INTERPRETER);
1129+
// GH-99729: We need to unlink the frame *before* clearing it:
1130+
_PyInterpreterFrame *dying = frame;
1131+
frame = tstate->current_frame = dying->previous;
1132+
_PyEval_FrameClearAndPop(tstate, dying);
1133+
frame->return_offset = 0;
1134+
assert(frame->owner == FRAME_OWNED_BY_INTERPRETER);
1135+
/* Restore previous frame and exit */
1136+
tstate->current_frame = frame->previous;
1137+
return NULL;
1138+
}
11291139
#ifdef _Py_TIER2
1130-
1131-
// Tier 2 is also here!
1132-
enter_tier_two:
1133-
11341140
#ifdef _Py_JIT
1135-
assert(0);
1141+
_PyJitEntryFuncPtr _Py_jit_entry = _Py_LazyJitTrampoline;
11361142
#else
1143+
_PyJitEntryFuncPtr _Py_jit_entry = _PyTier2Interpreter;
1144+
#endif
1145+
#endif
1146+
1147+
#if defined(_Py_TIER2) && !defined(_Py_JIT)
1148+
1149+
_Py_CODEUNIT *
1150+
_PyTier2Interpreter(
1151+
_PyExecutorObject *current_executor, _PyInterpreterFrame *frame,
1152+
_PyStackRef *stack_pointer, PyThreadState *tstate
1153+
) {
1154+
const _PyUOpInstruction *next_uop;
1155+
int oparg;
1156+
tier2_start:
1157+
1158+
next_uop = current_executor->trace;
1159+
assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT);
11371160

11381161
#undef LOAD_IP
11391162
#define LOAD_IP(UNUSED) (void)0
@@ -1151,7 +1174,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
11511174
#undef ENABLE_SPECIALIZATION_FT
11521175
#define ENABLE_SPECIALIZATION_FT 0
11531176

1154-
; // dummy statement after a label, before a declaration
11551177
uint16_t uopcode;
11561178
#ifdef Py_STATS
11571179
int lastuop = 0;
@@ -1225,24 +1247,9 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
12251247
next_uop = current_executor->trace + target;
12261248
goto tier2_dispatch;
12271249

1228-
#endif // _Py_JIT
1229-
1250+
}
12301251
#endif // _Py_TIER2
12311252

1232-
early_exit:
1233-
assert(_PyErr_Occurred(tstate));
1234-
_Py_LeaveRecursiveCallPy(tstate);
1235-
assert(frame->owner != FRAME_OWNED_BY_INTERPRETER);
1236-
// GH-99729: We need to unlink the frame *before* clearing it:
1237-
_PyInterpreterFrame *dying = frame;
1238-
frame = tstate->current_frame = dying->previous;
1239-
_PyEval_FrameClearAndPop(tstate, dying);
1240-
frame->return_offset = 0;
1241-
assert(frame->owner == FRAME_OWNED_BY_INTERPRETER);
1242-
/* Restore previous frame and exit */
1243-
tstate->current_frame = frame->previous;
1244-
return NULL;
1245-
}
12461253

12471254
#ifdef DO_NOT_OPTIMIZE_INTERP_LOOP
12481255
# pragma optimize("", on)

Python/ceval_macros.h

Lines changed: 10 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -133,9 +133,6 @@ do { \
133133
_PyFrame_SetStackPointer(frame, stack_pointer); \
134134
int lltrace = maybe_lltrace_resume_frame(frame, GLOBALS()); \
135135
stack_pointer = _PyFrame_GetStackPointer(frame); \
136-
if (lltrace < 0) { \
137-
JUMP_TO_LABEL(exit_unwind); \
138-
} \
139136
frame->lltrace = lltrace; \
140137
} while (0)
141138
#else
@@ -354,16 +351,10 @@ _PyFrame_SetStackPointer(frame, stack_pointer)
354351

355352
/* Tier-switching macros. */
356353

357-
#ifdef _Py_JIT
358-
#define GOTO_TIER_TWO(EXECUTOR) \
354+
#define TIER1_TO_TIER2(EXECUTOR) \
359355
do { \
360356
OPT_STAT_INC(traces_executed); \
361-
_PyExecutorObject *_executor = (EXECUTOR); \
362-
jit_func jitted = _executor->jit_code; \
363-
/* Keep the shim frame alive via the executor: */ \
364-
Py_INCREF(_executor); \
365-
next_instr = jitted(frame, stack_pointer, tstate); \
366-
Py_DECREF(_executor); \
357+
next_instr = _Py_jit_entry((EXECUTOR), frame, stack_pointer, tstate); \
367358
frame = tstate->current_frame; \
368359
stack_pointer = _PyFrame_GetStackPointer(frame); \
369360
if (next_instr == NULL) { \
@@ -372,31 +363,22 @@ do { \
372363
} \
373364
DISPATCH(); \
374365
} while (0)
375-
#else
376-
#define GOTO_TIER_TWO(EXECUTOR) \
377-
do { \
378-
OPT_STAT_INC(traces_executed); \
379-
_PyExecutorObject *_executor = (EXECUTOR); \
380-
next_uop = _executor->trace; \
381-
assert(next_uop->opcode == _START_EXECUTOR || next_uop->opcode == _COLD_EXIT); \
382-
goto enter_tier_two; \
366+
367+
/* Chain from one tier 2 executor directly into another: count the trace in
 * the `traces_executed` stat, make EXECUTOR the current executor, point
 * next_uop at the start of its trace and restart at the `tier2_start` label.
 * Expands to a single statement; EXECUTOR is evaluated exactly once. */
#define TIER2_TO_TIER2(EXECUTOR)                    \
    do {                                            \
        OPT_STAT_INC(traces_executed);              \
        current_executor = (EXECUTOR);              \
        next_uop = current_executor->trace;         \
        goto tier2_start;                           \
    } while (0)
384-
#endif
385374

386375
#define GOTO_TIER_ONE(TARGET) \
387376
do \
388377
{ \
389378
tstate->current_executor = NULL; \
390-
next_instr = (TARGET); \
391379
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); \
392380
_PyFrame_SetStackPointer(frame, stack_pointer); \
393-
stack_pointer = _PyFrame_GetStackPointer(frame); \
394-
if (next_instr == NULL) \
395-
{ \
396-
next_instr = frame->instr_ptr; \
397-
goto error; \
398-
} \
399-
DISPATCH(); \
381+
return TARGET; \
400382
} while (0)
401383

402384
#define CURRENT_OPARG() (next_uop[-1].oparg)

Python/executor_cases.c.h

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/generated_cases.c.h

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/jit.c

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,65 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz
571571
return 0;
572572
}
573573

574+
// Compiles executor in-place. Don't forget to call _PyJIT_Free later!
575+
static _PyJitEntryFuncPtr
576+
compile_trampoline(void)
577+
{
578+
_PyExecutorObject dummy;
579+
const StencilGroup *group;
580+
// Loop once to find the total compiled size:
581+
size_t code_size = 0;
582+
size_t data_size = 0;
583+
jit_state state = {0};
584+
group = &trampoline;
585+
code_size += group->code_size;
586+
data_size += group->data_size;
587+
combine_symbol_mask(group->trampoline_mask, state.trampolines.mask);
588+
// Round up to the nearest page:
589+
size_t page_size = get_page_size();
590+
assert((page_size & (page_size - 1)) == 0);
591+
size_t code_padding = DATA_ALIGN - ((code_size + state.trampolines.size) & (DATA_ALIGN - 1));
592+
size_t padding = page_size - ((code_size + state.trampolines.size + code_padding + data_size) & (page_size - 1));
593+
size_t total_size = code_size + state.trampolines.size + code_padding + data_size + padding;
594+
unsigned char *memory = jit_alloc(total_size);
595+
if (memory == NULL) {
596+
return NULL;
597+
}
598+
// Loop again to emit the code:
599+
unsigned char *code = memory;
600+
state.trampolines.mem = memory + code_size;
601+
unsigned char *data = memory + code_size + state.trampolines.size + code_padding;
602+
// Compile the shim, which handles converting between the native
603+
// calling convention and the calling convention used by jitted code
604+
// (which may be different for efficiency reasons).
605+
group = &trampoline;
606+
group->emit(code, data, &dummy, NULL, &state);
607+
code += group->code_size;
608+
data += group->data_size;
609+
assert(code == memory + code_size);
610+
assert(data == memory + code_size + state.trampolines.size + code_padding + data_size);
611+
if (mark_executable(memory, total_size)) {
612+
jit_free(memory, total_size);
613+
return NULL;
614+
}
615+
return (_PyJitEntryFuncPtr)memory;
616+
}
617+
618+
619+
_Py_CODEUNIT *
620+
_Py_LazyJitTrampoline(
621+
_PyExecutorObject *executor, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate
622+
) {
623+
assert(_Py_jit_entry == _Py_LazyJitTrampoline);
624+
PyInterpreterState *interp = PyInterpreterState_Get();
625+
_PyJitEntryFuncPtr trampoline = compile_trampoline();
626+
if (trampoline == NULL) {
627+
Py_FatalError("Cannot allocate core JIT code");
628+
}
629+
_Py_jit_entry = trampoline;
630+
return trampoline(executor, frame, stack_pointer, tstate);
631+
}
632+
574633
void
575634
_PyJIT_Free(_PyExecutorObject *executor)
576635
{

Python/pystate.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,11 @@ free_interpreter(PyInterpreterState *interp)
494494
static inline int check_interpreter_whence(long);
495495
#endif
496496

497+
extern _Py_CODEUNIT *
498+
_Py_LazyJitTrampoline(
499+
struct _PyExecutorObject *exec, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate
500+
);
501+
497502
/* Get the interpreter state to a minimal consistent state.
498503
Further init happens in pylifecycle.c before it can be used.
499504
All fields not initialized here are expected to be zeroed out,
@@ -574,6 +579,8 @@ init_interpreter(PyInterpreterState *interp,
574579
/* Fix the self-referential, statically initialized fields. */
575580
interp->dtoa = (struct _dtoa_state)_dtoa_state_INIT(interp);
576581
}
582+
583+
577584
#if !defined(Py_GIL_DISABLED) && defined(Py_STACKREF_DEBUG)
578585
interp->next_stackref = INITIAL_STACKREF_INDEX;
579586
_Py_hashtable_allocator_t alloc = {

Tools/jit/_targets.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,8 @@ async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]:
193193
async with asyncio.TaskGroup() as group:
194194
coro = self._compile("shim", TOOLS_JIT / "shim.c", work)
195195
tasks.append(group.create_task(coro, name="shim"))
196+
coro = self._compile("trampoline", TOOLS_JIT / "trampoline.c", work)
197+
tasks.append(group.create_task(coro, name="trampoline"))
196198
template = TOOLS_JIT_TEMPLATE_C.read_text()
197199
for case, opname in cases_and_opnames:
198200
# Write out a copy of the template with *only* this case

0 commit comments

Comments
 (0)