1616//! Register convention for JIT-compiled code:
1717//! x0 = return value (NaN-boxed)
1818//! x1 = return status (0=ok, 1=deopt)
19+ //! x3..x(max_slot+3) = loop variables (unboxed i64)
20+ //! x(max_slot+4)..x15 = scratch temporaries (vstack, const unbox)
1921//! x16 = base pointer (&stack[frame.base])
20- //! x17 = temp (tag checking)
21- //! x3..x15 = loop variables (unboxed i64)
22+ //! x17 = tag constant / boxing temp
23+ //!
24+ //! Only caller-saved registers (x0-x15) are used. x18 (macOS platform register)
25+ //! and x19-x28 (callee-saved) are never touched.
2226//!
2327//! Calling convention (C ABI):
2428//! Input: x0 = stack ptr, x1 = base (slot count), x2 = constants ptr
@@ -405,6 +409,39 @@ pub const JitCompiler = struct {
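405409 // Slot registers span x3..x15 (13 at most); the scratch-temp check below needs temp_base = max_slot + 4 <= 15, so in practice at most 12 slots (max_slot <= 11) compile.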
406410 if (max_slot > 12 ) return null ;
407411
412+ // Temporary register allocation: use caller-saved registers above the
413+ // last local slot register, staying within x3..x15. This avoids
414+ // clobbering x18 (macOS platform register) and x19-x28 (callee-saved).
415+ //
416+ // Register map:
417+ // x0: stack ptr / return value
418+ // x1: base offset / return status
419+ // x2: constants ptr
420+ // x3 .. x(max_slot+3): local slot registers
421+ // x(temp_base) .. x15: scratch temporaries (prologue, vstack, const unbox)
422+ // x16: base pointer (&stack[base])
423+ // x17: tag constant / boxing temp
424+ const temp_base : u5 = @intCast (@as (u8 , max_slot ) + 4 );
425+ // Count max vstack entries needed (one per add/sub op before recur_loop).
426+ var max_vsp : u8 = 0 ;
427+ {
428+ var cnt : u8 = 0 ;
429+ for (ops [0.. op_count ]) | op | {
430+ switch (op ) {
431+ .add_locals , .add_local_const , .sub_locals , .sub_local_const = > {
432+ cnt += 1 ;
433+ max_vsp = @max (max_vsp , cnt );
434+ },
435+ .recur_loop = > cnt = 0 ,
436+ else = > {},
437+ }
438+ }
439+ }
440+ // Need temp_base + max_vsp - 1 <= 15 (x15 is last caller-saved).
441+ // Also need at least 1 temp for prologue/branch_cmp_const.
442+ if (@as (u8 , temp_base ) + max_vsp > 16 ) return null ;
443+ if (temp_base > 15 ) return null ;
444+
408445 self .offset = 0 ;
409446
410447 // --- Prologue: compute base pointer, load and unbox locals ---
@@ -421,19 +458,12 @@ pub const JitCompiler = struct {
421458 if (used_slots & (@as (u16 , 1 ) << @intCast (slot )) == 0 ) continue ;
422459 const reg : u5 = @intCast (slot + 3 );
423460 self .emit (ldr (reg , 16 , @intCast (slot * 8 )));
424- self .emit (lsrImm (18 , reg , 48 ));
425- self .emit (cmpReg (18 , 17 ));
461+ self .emit (lsrImm (temp_base , reg , 48 ));
462+ self .emit (cmpReg (temp_base , 17 ));
426463 self .emit (0 ); // placeholder for B.NE deopt
427464 self .emit (sbfx (reg , reg , 0 , 48 ));
428465 }
429466
430- // Load constants referenced by *_local_const ops into x19..x28 (callee-saved).
431- // For PoC simplicity: unbox constants inline where needed.
432- // Actually, for the PoC, let's just unbox constants during prologue into
433- // a separate set of registers. We have x19-x28 as callee-saved, so we
434- // need to save/restore them. For simplicity, keep constants in memory
435- // and load them fresh in the loop body from x2 (constants pointer).
436-
437467 // --- Main loop ---
438468 const loop_top = self .offset ;
439469
@@ -450,46 +480,27 @@ pub const JitCompiler = struct {
450480 const ra : u5 = @intCast (@as (u8 , b .slot_a ) + 3 );
451481 const rb : u5 = @intCast (@as (u8 , b .slot_b ) + 3 );
452482 self .emit (cmpReg (ra , rb ));
453- // Branch to exit on exit_cond. Offset patched later.
454- const exit_branch_pos = self .offset ;
455- _ = exit_branch_pos ;
456- self .emit (0 ); // placeholder
483+ self .emit (0 ); // placeholder for exit branch
457484 },
458485 .branch_cmp_const = > | b | {
459486 const ra : u5 = @intCast (@as (u8 , b .slot ) + 3 );
460487 // Load constant from memory, unbox, compare.
461- // ldr x18, [x2, #const_idx*8]
462- self .emit (ldr (18 , 2 , @intCast (@as (u16 , b .const_idx ) * 8 )));
463- self .emit (sbfx (18 , 18 , 0 , 48 ));
464- self .emit (cmpReg (ra , 18 ));
488+ self .emit (ldr (temp_base , 2 , @intCast (@as (u16 , b .const_idx ) * 8 )));
489+ self .emit (sbfx (temp_base , temp_base , 0 , 48 ));
490+ self .emit (cmpReg (ra , temp_base ));
465491 self .emit (0 ); // placeholder for exit branch
466492 },
467493 .add_locals = > | a | {
468494 const ra : u5 = @intCast (@as (u8 , a .slot_a ) + 3 );
469495 const rb : u5 = @intCast (@as (u8 , a .slot_b ) + 3 );
470- // Result into x18 (temp), push to vstack.
471- self .emit (addReg (18 , ra , rb ));
472- vstack [vsp ] = 18 ;
473- // But we might need multiple vstack entries, so store in
474- // different temp registers. Use x18, x19, x20...
475- // For PoC: max 2 vstack entries (arith_loop).
476- const dst : u5 = @intCast (18 + vsp );
477- if (dst > 20 ) return null ; // too many temporaries
478- self .emit (addReg (dst , ra , rb ));
479- // Fix: we emitted twice. Remove the first emit.
480- // Actually let me restructure.
481- // Re-do: emit into the correct register directly.
482- self .offset -= 8 ; // undo both emits
483- const tmp : u5 = @intCast (18 + vsp );
484- if (tmp > 20 ) return null ;
496+ const tmp : u5 = @intCast (@as (u8 , temp_base ) + @as (u8 , @intCast (vsp )));
485497 self .emit (addReg (tmp , ra , rb ));
486498 vstack [vsp ] = tmp ;
487499 vsp += 1 ;
488500 },
489501 .add_local_const = > | a | {
490502 const ra : u5 = @intCast (@as (u8 , a .slot ) + 3 );
491- const tmp : u5 = @intCast (18 + vsp );
492- if (tmp > 20 ) return null ;
503+ const tmp : u5 = @intCast (@as (u8 , temp_base ) + @as (u8 , @intCast (vsp )));
493504 // Load constant, unbox, add.
494505 self .emit (ldr (tmp , 2 , @intCast (@as (u16 , a .const_idx ) * 8 )));
495506 self .emit (sbfx (tmp , tmp , 0 , 48 ));
@@ -500,16 +511,14 @@ pub const JitCompiler = struct {
500511 .sub_locals = > | a | {
501512 const ra : u5 = @intCast (@as (u8 , a .slot_a ) + 3 );
502513 const rb : u5 = @intCast (@as (u8 , a .slot_b ) + 3 );
503- const tmp : u5 = @intCast (18 + vsp );
504- if (tmp > 20 ) return null ;
514+ const tmp : u5 = @intCast (@as (u8 , temp_base ) + @as (u8 , @intCast (vsp )));
505515 self .emit (subReg (tmp , ra , rb ));
506516 vstack [vsp ] = tmp ;
507517 vsp += 1 ;
508518 },
509519 .sub_local_const = > | a | {
510520 const ra : u5 = @intCast (@as (u8 , a .slot ) + 3 );
511- const tmp : u5 = @intCast (18 + vsp );
512- if (tmp > 20 ) return null ;
521+ const tmp : u5 = @intCast (@as (u8 , temp_base ) + @as (u8 , @intCast (vsp )));
513522 self .emit (ldr (tmp , 2 , @intCast (@as (u16 , a .const_idx ) * 8 )));
514523 self .emit (sbfx (tmp , tmp , 0 , 48 ));
515524 self .emit (subReg (tmp , ra , tmp ));
0 commit comments