Bug report
Bug description:
Follow-up to #149481.
cc: @markshannon
TL;DR
Hang happens when a FOR_ITER observes more than MAX_CHAIN_DEPTH (=4) distinct iterator types.
Minimal reproducible example
F = [
    lambda: enumerate(range(500)),
    lambda: zip(range(500), range(500)),
    lambda: map(int, range(500)),
    lambda: filter(None, range(500)),
    lambda: reversed(range(500)),
]
for o in range(200):
    for _ in F[o % 5](): pass
Hint: Set PYTHON_JIT_SIDE_EXIT_INITIAL_VALUE=1 to reduce warmup time.
Root cause analysis (by opus 4.7)
A tier-2 trace specializing FOR_ITER starts with _GUARD_TYPE_ITER whose deopt target is the FOR_ITER byte offset itself.
|
op(_FOR_ITER_TIER_TWO, (iter, null_or_index -- iter, null_or_index, next)) { |
|
bool definite = true; |
|
PyTypeObject *type = sym_get_type(iter); |
|
if (type == NULL) { |
|
type = sym_get_probable_type(iter); |
|
definite = false; |
|
} |
|
if (type != NULL && type != &PyGen_Type && type->tp_iternext != NULL |
|
&& !_PyType_HasSlotTpIternext(type)) { |
|
PyType_Watch(TYPE_WATCHER_ID, (PyObject *)type); |
|
_Py_BloomFilter_Add(dependencies, type); |
|
if (!definite) { |
|
sym_set_type(iter, type); |
|
assert((this_instr - 1)->opcode == _RECORD_NOS_TYPE); |
|
int32_t orig_target = (this_instr - 1)->target; |
|
ADD_OP(_GUARD_TYPE_ITER, 0, (uintptr_t)type); |
|
uop_buffer_last(&ctx->out_buffer)->target = orig_target; |
|
} |
|
ADD_OP(_ITER_NEXT_INLINE, 0, (uintptr_t)type->tp_iternext); |
|
} |
|
next = sym_new_not_null(ctx); |
|
} |
After the side-trace chain reaches MAX_CHAIN_DEPTH, _PyOptimizer_Optimize takes the progress_needed path: it builds a fresh trace R' and inserts it with insert_executor at that same FOR_ITER byte (so the byte now reads ENTER_EXECUTOR(R')), while leaving the parent's exit->executor cold. Because the first guard of R' itself targets the FOR_ITER byte, R' is its own deopt target.
|
int chain_depth = _tstate->jit_tracer_state->initial_state.chain_depth; |
|
chain_depth %= MAX_CHAIN_DEPTH; |
|
bool progress_needed = chain_depth == 0; |
|
PyCodeObject *code = (PyCodeObject *)_tstate->jit_tracer_state->initial_state.code; |
|
_Py_CODEUNIT *start = _tstate->jit_tracer_state->initial_state.start_instr; |
|
if (progress_needed && !has_space_for_executor(code, start)) { |
|
interp->compiling = false; |
|
return 0; |
|
} |
|
_PyExecutorObject *executor; |
|
int err = uop_optimize(frame, tstate, &executor, progress_needed); |
|
if (err <= 0) { |
|
interp->compiling = false; |
|
return err; |
|
} |
|
assert(executor != NULL); |
|
if (progress_needed) { |
|
int index = get_index_for_executor(code, start); |
|
if (index < 0) { |
|
/* Out of memory. Don't raise and assume that the |
|
* error will show up elsewhere. |
|
* |
|
* If an optimizer has already produced an executor, |
|
* it might get confused by the executor disappearing, |
|
* but there is not much we can do about that here. */ |
|
Py_DECREF(executor); |
|
interp->compiling = false; |
|
return 0; |
|
} |
|
insert_executor(code, start, index, executor); |
|
} |
|
executor->vm_data.chain_depth = chain_depth; |
|
assert(executor->vm_data.valid); |
|
_PyExitData *exit = _tstate->jit_tracer_state->initial_state.exit; |
|
if (exit != NULL && !progress_needed) { |
|
exit->executor = executor; |
|
} |
|
else { |
|
// An executor inserted into the code object now has a strong reference |
|
// to it from the code object. Thus, we don't need this reference anymore. |
|
Py_DECREF(executor); |
|
} |
When the 5th type arrives, the guard in R' fails and _EXIT_TRACE falls into the upper branch of _COLD_EXIT (the target is ENTER_EXECUTOR). That branch has no chain-depth, self-link, or eval-breaker check — it just reads co_executors->executors[target->op.arg] (which is R' itself), warm-links exit->executor = R', and does a TIER2_TO_TIER2 into it. From this single fire onwards, every guard failure is a direct self tail-call that bypasses _COLD_EXIT. The iterator never advances, _ITER_NEXT_INLINE is never reached, _Py_HandlePending never runs, and SIGINT is silently swallowed; gdb shows the thread pinned in py::jit:executor with the PC drifting inside ~0x90 bytes.
|
tier2 op(_COLD_EXIT, ( -- )) { |
|
_PyExitData *exit = tstate->jit_exit; |
|
assert(exit != NULL); |
|
assert(frame->owner < FRAME_OWNED_BY_INTERPRETER); |
|
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target; |
|
_Py_BackoffCounter temperature = exit->temperature; |
|
_PyExecutorObject *executor; |
|
if (target->op.code == ENTER_EXECUTOR) { |
|
PyCodeObject *code = _PyFrame_GetCode(frame); |
|
executor = code->co_executors->executors[target->op.arg]; |
|
Py_INCREF(executor); |
|
assert(tstate->jit_exit == exit); |
|
exit->executor = executor; |
|
TIER2_TO_TIER2(exit->executor); |
|
} |
CPython versions tested on:
CPython main branch
Operating systems tested on:
Linux
Linked PRs
Bug report
Bug description:
Follow-up to #149481.
cc: @markshannon
TL;DR
Hang happens when a FOR_ITER observes more than MAX_CHAIN_DEPTH (=4) distinct iterator types.
Minimal reproducible example
Hint: Set PYTHON_JIT_SIDE_EXIT_INITIAL_VALUE=1 to reduce warmup time.
Root cause analysis (by opus 4.7)
A tier-2 trace specializing FOR_ITER starts with _GUARD_TYPE_ITER whose deopt target is the FOR_ITER byte offset itself.
cpython/Python/optimizer_bytecodes.c
Lines 1456 to 1477 in d36e5b8
After the side-trace chain reaches MAX_CHAIN_DEPTH, _PyOptimizer_Optimize takes the progress_needed path: it builds a fresh trace R' and inserts it with insert_executor at that same FOR_ITER byte (so the byte now reads ENTER_EXECUTOR(R')), while leaving the parent's exit->executor cold. Because the first guard of R' itself targets the FOR_ITER byte, R' is its own deopt target.
cpython/Python/optimizer.c
Lines 156 to 197 in d36e5b8
When the 5th type arrives, the guard in R' fails and _EXIT_TRACE falls into the upper branch of _COLD_EXIT (the target is ENTER_EXECUTOR). That branch has no chain-depth, self-link, or eval-breaker check — it just reads co_executors[target->op.arg] (which is R' itself), warm-links exit->executor = R', and does a TIER2_TO_TIER2 into it. From this single fire onwards, every guard failure is a direct self tail-call that bypasses _COLD_EXIT. The iterator never advances, _ITER_NEXT_INLINE is never reached, _Py_HandlePending never runs, and SIGINT is silently swallowed; gdb shows the thread pinned in py::jit:executor with the PC drifting inside ~0x90 bytes.
cpython/Python/bytecodes.c
Lines 6290 to 6304 in d36e5b8
CPython versions tested on:
CPython main branch
Operating systems tested on:
Linux
Linked PRs
_COLD_EXIT#149573