Skip to content

Commit b4c7077

Browse files
committed
Fix JIT register clobbering: use only caller-saved registers
ARM64 JIT used x18 (macOS platform register) and x19-x20 (callee-saved) for temporaries without saving/restoring them. In ReleaseSafe, Zig keeps values in x19 across JIT calls, causing silent data corruption for loops exceeding JIT_THRESHOLD. Fix: compute temp_base = max_slot + 4 and restrict all scratch registers to x3-x15 (caller-saved only).
1 parent fac9011 commit b4c7077

File tree

1 file changed

+49
-40
lines changed

1 file changed

+49
-40
lines changed

src/engine/vm/jit.zig

Lines changed: 49 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,13 @@
1616
//! Register convention for JIT-compiled code:
1717
//! x0 = return value (NaN-boxed)
1818
//! x1 = return status (0=ok, 1=deopt)
19+
//! x3..x(max_slot+3) = loop variables (unboxed i64)
20+
//! x(max_slot+4)..x15 = scratch temporaries (vstack, const unbox)
1921
//! x16 = base pointer (&stack[frame.base])
20-
//! x17 = temp (tag checking)
21-
//! x3..x15 = loop variables (unboxed i64)
22+
//! x17 = tag constant / boxing temp
23+
//!
24+
//! Only caller-saved registers (x0-x17) are used. x18 (macOS platform register)
25+
//! and x19-x28 (callee-saved) are never touched.
2226
//!
2327
//! Calling convention (C ABI):
2428
//! Input: x0 = stack ptr, x1 = base (slot count), x2 = constants ptr
@@ -405,6 +409,39 @@ pub const JitCompiler = struct {
405409
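// Local slots map to x3..x15 (13 registers); the temp_base check below needs one scratch register, so at most 12 slots are accepted.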
406410
if (max_slot > 12) return null;
407411

412+
// Temporary register allocation: use caller-saved registers above the
413+
// last local slot register, staying within x3..x15. This avoids
414+
// clobbering x18 (macOS platform register) and x19-x28 (callee-saved).
415+
//
416+
// Register map:
417+
// x0: stack ptr / return value
418+
// x1: base offset / return status
419+
// x2: constants ptr
420+
// x3 .. x(max_slot+3): local slot registers
421+
// x(temp_base) .. x15: scratch temporaries (prologue, vstack, const unbox)
422+
// x16: base pointer (&stack[base])
423+
// x17: tag constant / boxing temp
424+
const temp_base: u5 = @intCast(@as(u8, max_slot) + 4);
425+
// Count max vstack entries needed (one per add/sub op before recur_loop).
426+
var max_vsp: u8 = 0;
427+
{
428+
var cnt: u8 = 0;
429+
for (ops[0..op_count]) |op| {
430+
switch (op) {
431+
.add_locals, .add_local_const, .sub_locals, .sub_local_const => {
432+
cnt += 1;
433+
max_vsp = @max(max_vsp, cnt);
434+
},
435+
.recur_loop => cnt = 0,
436+
else => {},
437+
}
438+
}
439+
}
440+
// Need temp_base + max_vsp - 1 <= 15 (x15 is the last scratch register; x16/x17 are reserved).
441+
// Also need at least 1 temp for prologue/branch_cmp_const.
442+
if (@as(u8, temp_base) + max_vsp > 16) return null;
443+
if (temp_base > 15) return null;
444+
408445
self.offset = 0;
409446

410447
// --- Prologue: compute base pointer, load and unbox locals ---
@@ -421,19 +458,12 @@ pub const JitCompiler = struct {
421458
if (used_slots & (@as(u16, 1) << @intCast(slot)) == 0) continue;
422459
const reg: u5 = @intCast(slot + 3);
423460
self.emit(ldr(reg, 16, @intCast(slot * 8)));
424-
self.emit(lsrImm(18, reg, 48));
425-
self.emit(cmpReg(18, 17));
461+
self.emit(lsrImm(temp_base, reg, 48));
462+
self.emit(cmpReg(temp_base, 17));
426463
self.emit(0); // placeholder for B.NE deopt
427464
self.emit(sbfx(reg, reg, 0, 48));
428465
}
429466

430-
// Load constants referenced by *_local_const ops into x19..x28 (callee-saved).
431-
// For PoC simplicity: unbox constants inline where needed.
432-
// Actually, for the PoC, let's just unbox constants during prologue into
433-
// a separate set of registers. We have x19-x28 as callee-saved, so we
434-
// need to save/restore them. For simplicity, keep constants in memory
435-
// and load them fresh in the loop body from x2 (constants pointer).
436-
437467
// --- Main loop ---
438468
const loop_top = self.offset;
439469

@@ -450,46 +480,27 @@ pub const JitCompiler = struct {
450480
const ra: u5 = @intCast(@as(u8, b.slot_a) + 3);
451481
const rb: u5 = @intCast(@as(u8, b.slot_b) + 3);
452482
self.emit(cmpReg(ra, rb));
453-
// Branch to exit on exit_cond. Offset patched later.
454-
const exit_branch_pos = self.offset;
455-
_ = exit_branch_pos;
456-
self.emit(0); // placeholder
483+
self.emit(0); // placeholder for exit branch
457484
},
458485
.branch_cmp_const => |b| {
459486
const ra: u5 = @intCast(@as(u8, b.slot) + 3);
460487
// Load constant from memory, unbox, compare.
461-
// ldr x18, [x2, #const_idx*8]
462-
self.emit(ldr(18, 2, @intCast(@as(u16, b.const_idx) * 8)));
463-
self.emit(sbfx(18, 18, 0, 48));
464-
self.emit(cmpReg(ra, 18));
488+
self.emit(ldr(temp_base, 2, @intCast(@as(u16, b.const_idx) * 8)));
489+
self.emit(sbfx(temp_base, temp_base, 0, 48));
490+
self.emit(cmpReg(ra, temp_base));
465491
self.emit(0); // placeholder for exit branch
466492
},
467493
.add_locals => |a| {
468494
const ra: u5 = @intCast(@as(u8, a.slot_a) + 3);
469495
const rb: u5 = @intCast(@as(u8, a.slot_b) + 3);
470-
// Result into x18 (temp), push to vstack.
471-
self.emit(addReg(18, ra, rb));
472-
vstack[vsp] = 18;
473-
// But we might need multiple vstack entries, so store in
474-
// different temp registers. Use x18, x19, x20...
475-
// For PoC: max 2 vstack entries (arith_loop).
476-
const dst: u5 = @intCast(18 + vsp);
477-
if (dst > 20) return null; // too many temporaries
478-
self.emit(addReg(dst, ra, rb));
479-
// Fix: we emitted twice. Remove the first emit.
480-
// Actually let me restructure.
481-
// Re-do: emit into the correct register directly.
482-
self.offset -= 8; // undo both emits
483-
const tmp: u5 = @intCast(18 + vsp);
484-
if (tmp > 20) return null;
496+
const tmp: u5 = @intCast(@as(u8, temp_base) + @as(u8, @intCast(vsp)));
485497
self.emit(addReg(tmp, ra, rb));
486498
vstack[vsp] = tmp;
487499
vsp += 1;
488500
},
489501
.add_local_const => |a| {
490502
const ra: u5 = @intCast(@as(u8, a.slot) + 3);
491-
const tmp: u5 = @intCast(18 + vsp);
492-
if (tmp > 20) return null;
503+
const tmp: u5 = @intCast(@as(u8, temp_base) + @as(u8, @intCast(vsp)));
493504
// Load constant, unbox, add.
494505
self.emit(ldr(tmp, 2, @intCast(@as(u16, a.const_idx) * 8)));
495506
self.emit(sbfx(tmp, tmp, 0, 48));
@@ -500,16 +511,14 @@ pub const JitCompiler = struct {
500511
.sub_locals => |a| {
501512
const ra: u5 = @intCast(@as(u8, a.slot_a) + 3);
502513
const rb: u5 = @intCast(@as(u8, a.slot_b) + 3);
503-
const tmp: u5 = @intCast(18 + vsp);
504-
if (tmp > 20) return null;
514+
const tmp: u5 = @intCast(@as(u8, temp_base) + @as(u8, @intCast(vsp)));
505515
self.emit(subReg(tmp, ra, rb));
506516
vstack[vsp] = tmp;
507517
vsp += 1;
508518
},
509519
.sub_local_const => |a| {
510520
const ra: u5 = @intCast(@as(u8, a.slot) + 3);
511-
const tmp: u5 = @intCast(18 + vsp);
512-
if (tmp > 20) return null;
521+
const tmp: u5 = @intCast(@as(u8, temp_base) + @as(u8, @intCast(vsp)));
513522
self.emit(ldr(tmp, 2, @intCast(@as(u16, a.const_idx) * 8)));
514523
self.emit(sbfx(tmp, tmp, 0, 48));
515524
self.emit(subReg(tmp, ra, tmp));

0 commit comments

Comments
 (0)