Skip to content

Commit 374874c

Browse files
committed
rebased + _Py_VectorCallInstrumentation_StackRefSteal
1 parent ff52e90 commit 374874c

File tree

12 files changed

+484
-650
lines changed

12 files changed

+484
-650
lines changed

.github/workflows/tail-call.yml

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -79,19 +79,17 @@ jobs:
7979
with:
8080
python-version: '3.11'
8181

82-
- name: Native Windows (debug)
82+
- name: Native Windows MSVC (PGO)
8383
if: runner.os == 'Windows' && matrix.architecture != 'ARM64'
8484
shell: cmd
8585
run: |
86-
choco install llvm --allow-downgrade --no-progress --version ${{ matrix.llvm }}.1.0
87-
set PlatformToolset=clangcl
88-
set LLVMToolsVersion=${{ matrix.llvm }}.1.0
89-
set LLVMInstallDir=C:\Program Files\LLVM
90-
call ./PCbuild/build.bat --tail-call-interp -d -p ${{ matrix.architecture }}
91-
call ./PCbuild/rt.bat -d -p ${{ matrix.architecture }} -q --multiprocess 0 --timeout 4500 --verbose2 --verbose3
86+
choco install visualstudio2026buildtools-preview --pre -allWorkloads
87+
set PATH=C:\Program Files\Microsoft Visual Studio\18\Insiders\MSBuild\Current\Bin\;%PATH%
88+
./PCbuild/build.bat --tail-call-interp --pgo -p ${{ matrix.architecture }} "/p:PlatformToolset=v145"
89+
./PCbuild/rt.bat -p ${{ matrix.architecture }} -q --multiprocess 0 --timeout 4500 --verbose2 --verbose3
9290
9391
# No tests (yet):
94-
- name: Emulated Windows (release)
92+
- name: Emulated Windows Clang (release)
9593
if: runner.os == 'Windows' && matrix.architecture == 'ARM64'
9694
shell: cmd
9795
run: |

Include/internal/pycore_ceval.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,17 @@ _Py_VectorCall_StackRefSteal(
417417
int total_args,
418418
_PyStackRef kwnames);
419419

// Vectorcall `callable` with the stack references in `arguments`, optionally
// reporting the outcome to the instrumentation hooks.
//   callable             - stack reference to the object being called
//   arguments            - array of `total_args` argument stack references
//   total_args           - number of entries in `arguments`
//   kwnames              - keyword-name tuple reference; the plain CALL
//                          instruction passes PyStackRef_NULL here
//   call_instrumentation - when true, the C_RAISE / C_RETURN monitoring
//                          events are fired after the call
//   frame, this_instr, tstate - forwarded to the instrumentation calls
// Returns the call result, or NULL on failure. The definition in
// Python/ceval.c closes `callable`, `kwnames` and every entry of `arguments`
// before returning, so callers must treat those inputs as consumed.
420+
PyAPI_FUNC(PyObject*)
421+
_Py_VectorCallInstrumentation_StackRefSteal(
422+
_PyStackRef callable,
423+
_PyStackRef* arguments,
424+
int total_args,
425+
_PyStackRef kwnames,
426+
bool call_instrumentation,
427+
_PyInterpreterFrame* frame,
428+
_Py_CODEUNIT* this_instr,
429+
PyThreadState* tstate);
430+
420431
PyAPI_FUNC(PyObject *)
421432
_Py_BuiltinCallFast_StackRefSteal(
422433
_PyStackRef callable,
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Allow building CPython with the tail-calling interpreter using Visual Studio 2026 MSVC. This provides a performance gain over the prior interpreter for MSVC. Patch by Ken Jin, Brandt Bucher, and Chris Eibl, with help from the MSVC team, including Hulon Jenkins.

Objects/abstract.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,8 +205,14 @@ PyObject_GetItem(PyObject *o, PyObject *key)
205205
return type_error("'%.200s' object is not subscriptable", o);
206206
}
207207

208+
// Without Py_NO_INLINE here, an MSVC release build of the tail-calling
209+
// interpreter fails with many instances of
210+
// "error C4737: Unable to perform required tail call"; PGO builds work fine.
211+
#if defined(_MSC_VER) && !defined(__clang__) && _Py_TAIL_CALL_INTERP && !defined(_Py_USING_PGO)
212+
Py_NO_INLINE
213+
#endif
208214
int
209-
PyMapping_GetOptionalItem(PyObject *obj, PyObject *key, PyObject **result)
215+
PyMapping_GetOptionalItem(PyObject *obj, PyObject *key, PyObject **restrict result)
210216
{
211217
if (PyDict_CheckExact(obj)) {
212218
return PyDict_GetItemRef(obj, key, result);

PCbuild/pythoncore.vcxproj

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,9 @@
600600
<ClCompile Include="..\Python\bltinmodule.c" />
601601
<ClCompile Include="..\Python\bootstrap_hash.c" />
602602
<ClCompile Include="..\Python\brc.c" />
603-
<ClCompile Include="..\Python\ceval.c" />
603+
<ClCompile Include="..\Python\ceval.c">
604+
<AdditionalOptions Condition="'$(UseTailCallInterp)' == 'true' and $(PlatformToolset) != 'ClangCL'">/std:clatest %(AdditionalOptions)</AdditionalOptions>
605+
</ClCompile>
604606
<ClCompile Include="..\Python\codecs.c" />
605607
<ClCompile Include="..\Python\codegen.c" />
606608
<ClCompile Include="..\Python\compile.c" />

Python/bytecodes.c

Lines changed: 43 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -2212,8 +2212,11 @@ dummy_func(
22122212
}
22132213
// we make no attempt to optimize here; specializations should
22142214
// handle any case whose performance we care about
2215-
PyObject *stack[] = {class, self};
2216-
PyObject *super = PyObject_Vectorcall(global_super, stack, oparg & 2, NULL);
2215+
PyObject *super;
2216+
{
2217+
PyObject *stack[] = {class, self};
2218+
super = PyObject_Vectorcall(global_super, stack, oparg & 2, NULL);
2219+
}
22172220
if (opcode == INSTRUMENTED_LOAD_SUPER_ATTR) {
22182221
PyObject *arg = oparg & 2 ? class : &_PyInstrumentation_MISSING;
22192222
if (super == NULL) {
@@ -2272,8 +2275,12 @@ dummy_func(
22722275
PyObject *name = GETITEM(FRAME_CO_NAMES, oparg >> 2);
22732276
PyTypeObject *cls = (PyTypeObject *)class;
22742277
int method_found = 0;
2275-
PyObject *attr_o = _PySuper_Lookup(cls, self, name,
2276-
Py_TYPE(self)->tp_getattro == PyObject_GenericGetAttr ? &method_found : NULL);
2278+
PyObject *attr_o;
2279+
{
2280+
int *method_found_ptr = &method_found;
2281+
attr_o = _PySuper_Lookup(cls, self, name,
2282+
Py_TYPE(self)->tp_getattro == PyObject_GenericGetAttr ? method_found_ptr : NULL);
2283+
}
22772284
if (attr_o == NULL) {
22782285
ERROR_NO_POP();
22792286
}
@@ -3499,10 +3506,13 @@ dummy_func(
34993506
}
35003507
assert(PyStackRef_IsTaggedInt(lasti));
35013508
(void)lasti; // Shut up compiler warning if asserts are off
3502-
PyObject *stack[5] = {NULL, PyStackRef_AsPyObjectBorrow(exit_self), exc, val_o, tb};
3503-
int has_self = !PyStackRef_IsNull(exit_self);
3504-
PyObject *res_o = PyObject_Vectorcall(exit_func_o, stack + 2 - has_self,
3505-
(3 + has_self) | PY_VECTORCALL_ARGUMENTS_OFFSET, NULL);
3509+
PyObject* res_o;
3510+
{
3511+
PyObject *stack[5] = {NULL, PyStackRef_AsPyObjectBorrow(exit_self), exc, val_o, tb};
3512+
int has_self = !PyStackRef_IsNull(exit_self);
3513+
res_o = PyObject_Vectorcall(exit_func_o, stack + 2 - has_self,
3514+
(3 + has_self) | PY_VECTORCALL_ARGUMENTS_OFFSET, NULL);
3515+
}
35063516
Py_XDECREF(original_tb);
35073517
ERROR_IF(res_o == NULL);
35083518
res = PyStackRef_FromPyObjectSteal(res_o);
@@ -3734,36 +3744,18 @@ dummy_func(
37343744
frame->return_offset = INSTRUCTION_SIZE;
37353745
DISPATCH_INLINED(new_frame);
37363746
}
3737-
/* Callable is not a normal Python function */
3738-
STACKREFS_TO_PYOBJECTS(arguments, total_args, args_o);
3739-
if (CONVERSION_FAILED(args_o)) {
3740-
DECREF_INPUTS();
3741-
ERROR_IF(true);
3742-
}
3743-
PyObject *res_o = PyObject_Vectorcall(
3744-
callable_o, args_o,
3745-
total_args | PY_VECTORCALL_ARGUMENTS_OFFSET,
3746-
NULL);
3747-
STACKREFS_TO_PYOBJECTS_CLEANUP(args_o);
3748-
if (opcode == INSTRUMENTED_CALL) {
3749-
PyObject *arg = total_args == 0 ?
3750-
&_PyInstrumentation_MISSING : PyStackRef_AsPyObjectBorrow(arguments[0]);
3751-
if (res_o == NULL) {
3752-
_Py_call_instrumentation_exc2(
3753-
tstate, PY_MONITORING_EVENT_C_RAISE,
3754-
frame, this_instr, callable_o, arg);
3755-
}
3756-
else {
3757-
int err = _Py_call_instrumentation_2args(
3758-
tstate, PY_MONITORING_EVENT_C_RETURN,
3759-
frame, this_instr, callable_o, arg);
3760-
if (err < 0) {
3761-
Py_CLEAR(res_o);
3762-
}
3763-
}
3764-
}
3765-
assert((res_o != NULL) ^ (_PyErr_Occurred(tstate) != NULL));
3766-
DECREF_INPUTS();
3747+
PyObject* res_o = _Py_VectorCallInstrumentation_StackRefSteal(
3748+
callable,
3749+
arguments,
3750+
total_args,
3751+
PyStackRef_NULL,
3752+
opcode == INSTRUMENTED_CALL,
3753+
frame,
3754+
this_instr,
3755+
tstate);
3756+
DEAD(args);
3757+
DEAD(self_or_null);
3758+
DEAD(callable);
37673759
ERROR_IF(res_o == NULL);
37683760
res = PyStackRef_FromPyObjectSteal(res_o);
37693761
}
@@ -4607,35 +4599,19 @@ dummy_func(
46074599
frame->return_offset = INSTRUCTION_SIZE;
46084600
DISPATCH_INLINED(new_frame);
46094601
}
4610-
/* Callable is not a normal Python function */
4611-
STACKREFS_TO_PYOBJECTS(arguments, total_args, args_o);
4612-
if (CONVERSION_FAILED(args_o)) {
4613-
DECREF_INPUTS();
4614-
ERROR_IF(true);
4615-
}
4616-
PyObject *res_o = PyObject_Vectorcall(
4617-
callable_o, args_o,
4618-
positional_args | PY_VECTORCALL_ARGUMENTS_OFFSET,
4619-
kwnames_o);
4620-
STACKREFS_TO_PYOBJECTS_CLEANUP(args_o);
4621-
if (opcode == INSTRUMENTED_CALL_KW) {
4622-
PyObject *arg = total_args == 0 ?
4623-
&_PyInstrumentation_MISSING : PyStackRef_AsPyObjectBorrow(arguments[0]);
4624-
if (res_o == NULL) {
4625-
_Py_call_instrumentation_exc2(
4626-
tstate, PY_MONITORING_EVENT_C_RAISE,
4627-
frame, this_instr, callable_o, arg);
4628-
}
4629-
else {
4630-
int err = _Py_call_instrumentation_2args(
4631-
tstate, PY_MONITORING_EVENT_C_RETURN,
4632-
frame, this_instr, callable_o, arg);
4633-
if (err < 0) {
4634-
Py_CLEAR(res_o);
4635-
}
4636-
}
4637-
}
4638-
DECREF_INPUTS();
4602+
PyObject* res_o = _Py_VectorCallInstrumentation_StackRefSteal(
4603+
callable,
4604+
arguments,
4605+
total_args,
4606+
kwnames,
4607+
opcode == INSTRUMENTED_CALL_KW,
4608+
frame,
4609+
this_instr,
4610+
tstate);
4611+
DEAD(kwnames);
4612+
DEAD(args);
4613+
DEAD(self_or_null);
4614+
DEAD(callable);
46394615
ERROR_IF(res_o == NULL);
46404616
res = PyStackRef_FromPyObjectSteal(res_o);
46414617
}

Python/ceval.c

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1071,6 +1071,65 @@ _Py_VectorCall_StackRefSteal(
10711071
return res;
10721072
}
10731073

// Vectorcall `callable` with the stack references in `arguments`, optionally
// report the outcome to the instrumentation hooks, then release every input.
//
//   callable             - stack reference to the object being called
//   arguments            - array of `total_args` argument stack references
//   total_args           - number of entries in `arguments`; when `kwnames`
//                          is non-NULL its tuple size is subtracted to get
//                          the positional count passed to the call
//   kwnames              - keyword-name tuple reference, or PyStackRef_NULL
//   call_instrumentation - when true, fire PY_MONITORING_EVENT_C_RAISE on a
//                          NULL result, otherwise PY_MONITORING_EVENT_C_RETURN
//   frame, this_instr, tstate - forwarded to the instrumentation calls
//
// Returns the result of the call, or NULL if argument conversion failed, the
// call itself returned NULL, or the C_RETURN instrumentation call returned a
// negative value (in which case the result is cleared).
//
// "Steal": on every path `kwnames`, all `arguments[i]` and `callable` are
// closed before returning, so the caller must not use them afterwards.
1074+
PyObject*
1075+
_Py_VectorCallInstrumentation_StackRefSteal(
1076+
_PyStackRef callable,
1077+
_PyStackRef* arguments,
1078+
int total_args,
1079+
_PyStackRef kwnames,
1080+
bool call_instrumentation,
1081+
_PyInterpreterFrame* frame,
1082+
_Py_CODEUNIT* this_instr,
1083+
PyThreadState* tstate)
1084+
{
1085+
PyObject* res;
// Build the PyObject* argument array `args_o` from the stack references.
1086+
STACKREFS_TO_PYOBJECTS(arguments, total_args, args_o);
// Conversion failed: skip the call (and the assert below), but still fall
// through to the cleanup that releases all inputs.
1087+
if (CONVERSION_FAILED(args_o)) {
1088+
res = NULL;
1089+
goto cleanup;
1090+
}
1091+
PyObject* callable_o = PyStackRef_AsPyObjectBorrow(callable);
1092+
PyObject* kwnames_o = PyStackRef_AsPyObjectBorrow(kwnames);
// `total_args` counts keyword values too; the vectorcall nargs argument
// only counts positionals, so remove the keyword count when kwnames is set.
1093+
int positional_args = total_args;
1094+
if (kwnames_o != NULL) {
1095+
positional_args -= (int)PyTuple_GET_SIZE(kwnames_o);
1096+
}
1097+
res = PyObject_Vectorcall(
1098+
callable_o, args_o,
1099+
positional_args | PY_VECTORCALL_ARGUMENTS_OFFSET,
1100+
kwnames_o);
1101+
STACKREFS_TO_PYOBJECTS_CLEANUP(args_o);
// Instrumented variants report the outcome of the call. The event argument
// is the first call argument, or the MISSING sentinel when there are none.
1102+
if (call_instrumentation) {
1103+
PyObject* arg = total_args == 0 ?
1104+
&_PyInstrumentation_MISSING : PyStackRef_AsPyObjectBorrow(arguments[0]);
1105+
if (res == NULL) {
1106+
_Py_call_instrumentation_exc2(
1107+
tstate, PY_MONITORING_EVENT_C_RAISE,
1108+
frame, this_instr, callable_o, arg);
1109+
}
1110+
else {
1111+
int err = _Py_call_instrumentation_2args(
1112+
tstate, PY_MONITORING_EVENT_C_RETURN,
1113+
frame, this_instr, callable_o, arg);
// A negative return from the C_RETURN hook turns the successful call into
// a failure: drop the result so NULL is returned.
1114+
if (err < 0) {
1115+
Py_CLEAR(res);
1116+
}
1117+
}
1118+
}
// Exactly one of "have a result" and "an exception is set" must hold here.
1119+
assert((res != NULL) ^ (PyErr_Occurred() != NULL));
1120+
cleanup:
// XCLOSE rather than CLOSE: kwnames may be PyStackRef_NULL.
1121+
PyStackRef_XCLOSE(kwnames);
1122+
// arguments is a pointer into the GC visible stack,
1123+
// so we must NULL out values as we clear them.
1124+
for (int i = total_args - 1; i >= 0; i--) {
1125+
_PyStackRef tmp = arguments[i];
1126+
arguments[i] = PyStackRef_NULL;
1127+
PyStackRef_CLOSE(tmp);
1128+
}
1129+
PyStackRef_CLOSE(callable);
1130+
return res;
1131+
}
1132+
10741133
PyObject *
10751134
_Py_BuiltinCallFast_StackRefSteal(
10761135
_PyStackRef callable,

Python/ceval_macros.h

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -87,16 +87,19 @@
8787
# elif defined(_MSC_VER) && (_MSC_VER < 1950)
8888
# error "You need at least VS 2026 / PlatformToolset v145 for tail calling."
8989
# endif
90-
91-
// Note: [[clang::musttail]] works for GCC 15, but not __attribute__((musttail)) at the moment.
92-
# define Py_MUSTTAIL [[clang::musttail]]
93-
# define Py_PRESERVE_NONE_CC __attribute__((preserve_none))
94-
Py_PRESERVE_NONE_CC typedef PyObject* (*py_tail_call_funcptr)(TAIL_CALL_PARAMS);
90+
# if defined(_MSC_VER) && !defined(__clang__)
91+
# define Py_MUSTTAIL [[msvc::musttail]]
92+
# define Py_PRESERVE_NONE_CC __preserve_none
93+
# else
94+
# define Py_MUSTTAIL __attribute__((musttail))
95+
# define Py_PRESERVE_NONE_CC __attribute__((preserve_none))
96+
# endif
97+
typedef PyObject *(Py_PRESERVE_NONE_CC *py_tail_call_funcptr)(TAIL_CALL_PARAMS);
9598

9699
# define DISPATCH_TABLE_VAR instruction_funcptr_table
97100
# define DISPATCH_TABLE instruction_funcptr_handler_table
98101
# define TRACING_DISPATCH_TABLE instruction_funcptr_tracing_table
99-
# define TARGET(op) Py_PRESERVE_NONE_CC PyObject *_TAIL_CALL_##op(TAIL_CALL_PARAMS)
102+
# define TARGET(op) Py_NO_INLINE PyObject *Py_PRESERVE_NONE_CC _TAIL_CALL_##op(TAIL_CALL_PARAMS)
100103

101104
# define DISPATCH_GOTO() \
102105
do { \

Python/executor_cases.c.h

Lines changed: 27 additions & 18 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)