From faa67506e51908e2b235fe68ca3dac8c3bfaf354 Mon Sep 17 00:00:00 2001
From: John Hawthorn
Date: Fri, 1 Aug 2025 13:05:59 -0700
Subject: [PATCH 1/9] Ensure CC entries always marked, add missing WB

Previously we were issuing write barriers for each cc, but were missing
one for the cme.

We also need to ensure that GC cannot run after we've copied the values
into the allocated array, but before they're visible in the object.
---
 vm_method.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vm_method.c b/vm_method.c
index 874e25ed7666e6..76b1c97d046c2b 100644
--- a/vm_method.c
+++ b/vm_method.c
@@ -136,17 +136,19 @@ rb_vm_cc_table_create(size_t capa)
 static enum rb_id_table_iterator_result
 vm_cc_table_dup_i(ID key, VALUE old_ccs_ptr, void *data)
 {
+    VALUE new_table = (VALUE)data;
     struct rb_class_cc_entries *old_ccs = (struct rb_class_cc_entries *)old_ccs_ptr;
     size_t memsize = vm_ccs_alloc_size(old_ccs->capa);
-    struct rb_class_cc_entries *new_ccs = ruby_xmalloc(memsize);
+    struct rb_class_cc_entries *new_ccs = ruby_xcalloc(1, memsize);
+    rb_managed_id_table_insert(new_table, key, (VALUE)new_ccs);
+
     memcpy(new_ccs, old_ccs, memsize);
 #if VM_CHECK_MODE > 0
     new_ccs->debug_sig = ~(VALUE)new_ccs;
 #endif
 
-    VALUE new_table = (VALUE)data;
-    rb_managed_id_table_insert(new_table, key, (VALUE)new_ccs);
+    RB_OBJ_WRITTEN(new_table, Qundef, (VALUE)new_ccs->cme);
     for (int index = 0; index < new_ccs->len; index++) {
         RB_OBJ_WRITTEN(new_table, Qundef, new_ccs->entries[index].cc);
     }

From f58fca7de0120394d5530902f694b12d260dd14e Mon Sep 17 00:00:00 2001
From: Alan Wu
Date: Fri, 1 Aug 2025 16:50:10 -0400
Subject: [PATCH 2/9] ZJIT: A64: Use MOVN for small negative immediates

Save a couple of instructions when loading a small negative constant
into a register. In fact, MOVN is specified to alias to `mov` in the
official disassembly.
---
 zjit/src/asm/arm64/inst/mov.rs | 37 ++++++++++++++++++++++++++++++++++
 zjit/src/asm/arm64/mod.rs      | 20 ++++++++++++++++++
 zjit/src/backend/arm64/mod.rs  | 32 +++++++++++++++--------------
 3 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/zjit/src/asm/arm64/inst/mov.rs b/zjit/src/asm/arm64/inst/mov.rs
index eae4565c3ab0ce..58877ae94040c7 100644
--- a/zjit/src/asm/arm64/inst/mov.rs
+++ b/zjit/src/asm/arm64/inst/mov.rs
@@ -2,6 +2,9 @@ use super::super::arg::Sf;
 
 /// Which operation is being performed.
 enum Op {
+    /// A movn operation which inverts the immediate and zeroes out the other bits.
+    MOVN = 0b00,
+
     /// A movz operation which zeroes out the other bits.
MOVZ = 0b10, @@ -61,6 +64,12 @@ impl Mov { Self { rd, imm16, hw: hw.into(), op: Op::MOVK, sf: num_bits.into() } } + /// MOVN + /// + pub fn movn(rd: u8, imm16: u16, hw: u8, num_bits: u8) -> Self { + Self { rd, imm16, hw: hw.into(), op: Op::MOVN, sf: num_bits.into() } + } + /// MOVZ /// pub fn movz(rd: u8, imm16: u16, hw: u8, num_bits: u8) -> Self { @@ -104,6 +113,34 @@ mod tests { assert_eq!(0xf2800f60, result); } + #[test] + fn test_movn_unshifted() { + let inst = Mov::movn(0, 123, 0, 64); + let result: u32 = inst.into(); + assert_eq!(0x92800f60, result); + } + + #[test] + fn test_movn_shifted_16() { + let inst = Mov::movn(0, 123, 16, 64); + let result: u32 = inst.into(); + assert_eq!(0x92a00f60, result); + } + + #[test] + fn test_movn_shifted_32() { + let inst = Mov::movn(0, 123, 32, 64); + let result: u32 = inst.into(); + assert_eq!(0x92c00f60, result); + } + + #[test] + fn test_movn_shifted_48() { + let inst = Mov::movn(0, 123, 48, 64); + let result: u32 = inst.into(); + assert_eq!(0x92e00f60, result); + } + #[test] fn test_movk_shifted_16() { let inst = Mov::movk(0, 123, 16, 64); diff --git a/zjit/src/asm/arm64/mod.rs b/zjit/src/asm/arm64/mod.rs index ae14d561f65b16..0576b230907ff5 100644 --- a/zjit/src/asm/arm64/mod.rs +++ b/zjit/src/asm/arm64/mod.rs @@ -716,6 +716,21 @@ pub fn movk(cb: &mut CodeBlock, rd: A64Opnd, imm16: A64Opnd, shift: u8) { cb.write_bytes(&bytes); } +/// MOVN - load a register with the complement of a shifted then zero extended 16-bit immediate +/// +pub fn movn(cb: &mut CodeBlock, rd: A64Opnd, imm16: A64Opnd, shift: u8) { + let bytes: [u8; 4] = match (rd, imm16) { + (A64Opnd::Reg(rd), A64Opnd::UImm(imm16)) => { + assert!(uimm_fits_bits(imm16, 16), "The immediate operand must be 16 bits or less."); + + Mov::movn(rd.reg_no, imm16 as u16, shift, rd.num_bits).into() + }, + _ => panic!("Invalid operand combination to movn instruction.") + }; + + cb.write_bytes(&bytes); +} + /// MOVZ - move a 16 bit immediate into a register, zero the other bits pub fn movz(cb: &mut CodeBlock, rd: A64Opnd, imm16: A64Opnd, shift: u8) { let bytes: [u8; 4] = match (rd, imm16) { @@ -1543,6 +1558,11 @@ mod tests { check_bytes("600fa0f2", |cb| movk(cb, X0, A64Opnd::new_uimm(123), 16)); } + #[test] + fn test_movn() { + check_bytes("600fa092", |cb| movn(cb, X0, A64Opnd::new_uimm(123), 16)); + } + #[test] fn test_movz() { check_bytes("600fa0d2", |cb| movz(cb, X0, A64Opnd::new_uimm(123), 16)); diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs index a1243f45e17e58..8ba7a33de5e1bc 100644 --- a/zjit/src/backend/arm64/mod.rs +++ b/zjit/src/backend/arm64/mod.rs @@ -140,6 +140,10 @@ fn emit_load_value(cb: &mut CodeBlock, rd: A64Opnd, value: u64) -> usize { // instruction, then we'll use that. movz(cb, rd, A64Opnd::new_uimm(current), 0); return 1; + } else if u16::try_from(!value).is_ok() { + // For small negative values, use a single movn + movn(cb, rd, A64Opnd::new_uimm(!value), 0); + return 1; } else if BitmaskImmediate::try_from(current).is_ok() { // Otherwise, if the immediate can be encoded // with the special bitmask immediate encoding, @@ -1592,15 +1596,16 @@ mod tests { // Test values that exercise various types of immediates. 
             // - 9 bit displacement for Load/Store
-            // - 12 bit shifted immediate
+            // - 12 bit ADD/SUB shifted immediate
+            // - 16 bit MOV family shifted immediates
             // - bit mask immediates
-            for displacement in [i32::MAX, 0x10008, 0x1800, 0x208, -0x208, -0x1800, -0x1008, i32::MIN] {
+            for displacement in [i32::MAX, 0x10008, 0x1800, 0x208, -0x208, -0x1800, -0x10008, i32::MIN] {
                 let mem = Opnd::mem(64, NATIVE_STACK_PTR, displacement);
                 asm.lea_into(Opnd::Reg(X0_REG), mem);
             }
             asm.compile_with_num_regs(&mut cb, 0);
-            assert_disasm!(cb, "e07b40b2e063208b000180d22000a0f2e063208b000083d2e063208be0230891e02308d100009dd2e0ffbff2e0ffdff2e0fffff2e063208b00ff9dd2e0ffbff2e0ffdff2e0fffff2e063208be08361b2e063208b", "
+            assert_disasm!(cb, "e07b40b2e063208b000180d22000a0f2e063208b000083d2e063208be0230891e02308d1e0ff8292e063208b00ff9fd2c0ffbff2e0ffdff2e0fffff2e063208be08361b2e063208b", "
             0x0: orr x0, xzr, #0x7fffffff
             0x4: add x0, sp, x0
             0x8: mov x0, #8
@@ -1610,18 +1615,15 @@
             0x18: add x0, sp, x0
             0x1c: add x0, sp, #0x208
             0x20: sub x0, sp, #0x208
-            0x24: mov x0, #0xe800
-            0x28: movk x0, #0xffff, lsl #16
-            0x2c: movk x0, #0xffff, lsl #32
-            0x30: movk x0, #0xffff, lsl #48
-            0x34: add x0, sp, x0
-            0x38: mov x0, #0xeff8
-            0x3c: movk x0, #0xffff, lsl #16
-            0x40: movk x0, #0xffff, lsl #32
-            0x44: movk x0, #0xffff, lsl #48
-            0x48: add x0, sp, x0
-            0x4c: orr x0, xzr, #0xffffffff80000000
-            0x50: add x0, sp, x0
+            0x24: mov x0, #-0x1800
+            0x28: add x0, sp, x0
+            0x2c: mov x0, #0xfff8
+            0x30: movk x0, #0xfffe, lsl #16
+            0x34: movk x0, #0xffff, lsl #32
+            0x38: movk x0, #0xffff, lsl #48
+            0x3c: add x0, sp, x0
+            0x40: orr x0, xzr, #0xffffffff80000000
+            0x44: add x0, sp, x0
         ");
     }

From afac22647852439e7b3563216afac7b9491eaa0b Mon Sep 17 00:00:00 2001
From: Alan Wu
Date: Fri, 1 Aug 2025 14:54:52 -0400
Subject: [PATCH 3/9] ZJIT: Fix side-exit panicking when there are too many locals

Previously, ARM64 panicked in compiled_side_exits() when the memory
displacement got large enough to exceed the 9-bit limit. Usually we
split this kind of memory operand, but compiled_side_exits() runs after
split.

Using scratch registers, implement `Insn::Store` on ARM such that it
can handle large displacements without split(). Do this for x86 as
well, and remove arch-specific code from compiled_side_exits(). We can
now run `TestKeywordArguments`.

Since `Insn::Store` doesn't need splitting anymore, users enjoy lower
register pressure. The downside is that using `Assembler::SCRATCH_REG`
as a base register is now sometimes an error, depending on whether
`Insn::Store` also needs to use the register. That seems like a fair
trade-off since `SCRATCH_REG` is rarely used, and we don't currently
use it as a base register anywhere.
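As a rough standalone sketch of the displacement rule involved (illustrative
Rust only, not the backend code; STUR/LDUR encode a signed 9-bit displacement,
and the register names are assumptions for exposition):

    /// Does `disp` fit the signed 9-bit immediate of STUR/LDUR?
    fn mem_disp_fits_9_bits(disp: i32) -> bool {
        (-256..=255).contains(&disp)
    }

    #[derive(Debug, PartialEq)]
    enum StoreAddr {
        /// `[base, #disp]` is directly encodable.
        Direct { base: &'static str, disp: i32 },
        /// `scratch = base + disp`, then store to `[scratch]`.
        ViaScratch { scratch: &'static str, base: &'static str, disp: i32 },
    }

    fn lower_store_addr(base: &'static str, disp: i32) -> StoreAddr {
        if mem_disp_fits_9_bits(disp) {
            StoreAddr::Direct { base, disp }
        } else {
            // "x17" stands in for the second scratch register here.
            StoreAddr::ViaScratch { scratch: "x17", base, disp }
        }
    }

    fn main() {
        // Small displacement: encodable as-is.
        assert_eq!(lower_store_addr("sp", 8), StoreAddr::Direct { base: "sp", disp: 8 });
        // A frame with many locals can push the displacement past the limit.
        assert_eq!(
            lower_store_addr("sp", -0x305),
            StoreAddr::ViaScratch { scratch: "x17", base: "sp", disp: -0x305 }
        );
    }

The backend change below makes an equivalent check at emit time and, when the
displacement doesn't fit, computes the effective address into a scratch
register before the store.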
--- test/.excludes-zjit/TestKeywordArguments.rb | 1 - zjit/src/backend/arm64/mod.rs | 234 +++++++++++++++----- zjit/src/backend/lir.rs | 13 -- zjit/src/backend/x86_64/mod.rs | 135 +++++++---- 4 files changed, 269 insertions(+), 114 deletions(-) delete mode 100644 test/.excludes-zjit/TestKeywordArguments.rb diff --git a/test/.excludes-zjit/TestKeywordArguments.rb b/test/.excludes-zjit/TestKeywordArguments.rb deleted file mode 100644 index f52bdf6d30d6c6..00000000000000 --- a/test/.excludes-zjit/TestKeywordArguments.rb +++ /dev/null @@ -1 +0,0 @@ -exclude(/test_/, 'Multiple tests make ZJIT panic') diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs index 8ba7a33de5e1bc..2025e444ae4464 100644 --- a/zjit/src/backend/arm64/mod.rs +++ b/zjit/src/backend/arm64/mod.rs @@ -195,11 +195,15 @@ pub const ALLOC_REGS: &'static [Reg] = &[ impl Assembler { - // Special scratch registers for intermediate processing. - // This register is caller-saved (so we don't have to save it before using it) + /// Special scratch registers for intermediate processing. + /// This register is call-clobbered (so we don't have to save it before using it). + /// Avoid using if you can since this is used to lower [Insn] internally and + /// so conflicts are possible. pub const SCRATCH_REG: Reg = X16_REG; const SCRATCH0: A64Opnd = A64Opnd::Reg(Assembler::SCRATCH_REG); + const SCRATCH0_REG: Reg = Assembler::SCRATCH_REG; const SCRATCH1: A64Opnd = A64Opnd::Reg(X17_REG); + const SCRATCH1_REG: Reg = X17_REG; /// Get the list of registers from which we will allocate on this platform pub fn get_alloc_regs() -> Vec { @@ -652,31 +656,6 @@ impl Assembler *opnd = split_load_operand(asm, *opnd); asm.push_insn(insn); }, - Insn::Store { dest, src } => { - // The value being stored must be in a register, so if it's - // not already one we'll load it first. - let opnd1 = match src { - // If the first operand is zero, then we can just use - // the zero register. - Opnd::UImm(0) | Opnd::Imm(0) => Opnd::Reg(XZR_REG), - // Otherwise we'll check if we need to load it first. - _ => split_load_operand(asm, *src) - }; - - match dest { - Opnd::Reg(_) => { - // Store does not support a register as a dest operand. - asm.mov(*dest, opnd1); - } - _ => { - // The displacement for the STUR instruction can't be more - // than 9 bits long. If it's longer, we need to load the - // memory address into a register first. - let opnd0 = split_memory_address(asm, *dest); - asm.store(opnd0, opnd1); - } - } - }, Insn::Mul { left, right, .. } => { *left = split_load_operand(asm, *left); *right = split_load_operand(asm, *right); @@ -843,6 +822,42 @@ impl Assembler } } + /// Do the address calculation of `out_reg = base_reg + disp` + fn load_effective_address(cb: &mut CodeBlock, out: A64Opnd, base_reg_no: u8, disp: i32) { + let base_reg = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: base_reg_no }); + assert_ne!(31, out.unwrap_reg().reg_no, "Lea sp, [sp, #imm] not always encodable. Use add/sub instead."); + + if ShiftedImmediate::try_from(disp.unsigned_abs() as u64).is_ok() { + // Use ADD/SUB if the displacement fits + add(cb, out, base_reg, A64Opnd::new_imm(disp.into())); + } else { + // Use add_extended() to interpret reg_no=31 as sp + // since the base register is never the zero register. + // Careful! Only the first two operands can refer to sp. 
+ emit_load_value(cb, out, disp as u64); + add_extended(cb, out, base_reg, out); + }; + } + + /// Load a VALUE to a register and remember it for GC marking and reference updating + fn emit_load_gc_value(cb: &mut CodeBlock, gc_offsets: &mut Vec, dest: A64Opnd, value: VALUE) { + // We dont need to check if it's a special const + // here because we only allow these operands to hit + // this point if they're not a special const. + assert!(!value.special_const_p()); + + // This assumes only load instructions can contain + // references to GC'd Value operands. If the value + // being loaded is a heap object, we'll report that + // back out to the gc_offsets list. + ldr_literal(cb, dest, 2.into()); + b(cb, InstructionOffset::from_bytes(4 + (SIZEOF_VALUE as i32))); + cb.write_bytes(&value.as_u64().to_le_bytes()); + + let ptr_offset = cb.get_write_ptr().sub_bytes(SIZEOF_VALUE); + gc_offsets.push(ptr_offset); + } + /// Emit a push instruction for the given operand by adding to the stack /// pointer and then storing the given value. fn emit_push(cb: &mut CodeBlock, opnd: A64Opnd) { @@ -1013,12 +1028,84 @@ impl Assembler Insn::LShift { opnd, shift, out } => { lsl(cb, out.into(), opnd.into(), shift.into()); }, - Insn::Store { dest, src } => { + store_insn @ Insn::Store { dest, src } => { + // With minor exceptions, as long as `dest` is a Mem, all forms of `src` are + // accepted. As a rule of thumb, avoid using Assembler::SCRATCH as a memory + // base register to gurantee things will work. + let &Opnd::Mem(Mem { num_bits: dest_num_bits, base: MemBase::Reg(base_reg_no), disp }) = dest else { + panic!("Unexpected Insn::Store destination in arm64_emit: {dest:?}"); + }; + + // This kind of tricky clobber can only happen for explicit use of SCRATCH_REG, + // so we panic to get the author to change their code. 
+ #[track_caller] + fn assert_no_clobber(store_insn: &Insn, user_use: u8, backend_use: Reg) { + assert_ne!( + backend_use.reg_no, + user_use, + "Emitting {store_insn:?} would clobber {user_use:?}, in conflict with its semantics" + ); + } + + // Split src into SCRATCH0 if necessary + let src_reg: A64Reg = match src { + Opnd::Reg(reg) => *reg, + // Use zero register when possible + Opnd::UImm(0) | Opnd::Imm(0) => XZR_REG, + // Immediates + &Opnd::Imm(imm) => { + assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH0_REG); + emit_load_value(cb, Self::SCRATCH0, imm as u64); + Self::SCRATCH0_REG + } + &Opnd::UImm(imm) => { + assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH0_REG); + emit_load_value(cb, Self::SCRATCH0, imm); + Self::SCRATCH0_REG + } + &Opnd::Value(value) => { + assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH0_REG); + emit_load_gc_value(cb, &mut gc_offsets, Self::SCRATCH0, value); + Self::SCRATCH0_REG + } + src_mem @ &Opnd::Mem(Mem { num_bits: src_num_bits, base: MemBase::Reg(src_base_reg_no), disp: src_disp }) => { + // For mem-to-mem store, load the source into SCRATCH0 + assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH0_REG); + let src_mem = if mem_disp_fits_bits(src_disp) { + src_mem.into() + } else { + // Split the load address into SCRATCH0 first if necessary + assert_no_clobber(store_insn, src_base_reg_no, Self::SCRATCH0_REG); + load_effective_address(cb, Self::SCRATCH0, src_base_reg_no, src_disp); + A64Opnd::new_mem(dest_num_bits, Self::SCRATCH0, 0) + }; + match src_num_bits { + 64 | 32 => ldur(cb, Self::SCRATCH0, src_mem), + 16 => ldurh(cb, Self::SCRATCH0, src_mem), + 8 => ldurb(cb, Self::SCRATCH0, src_mem), + num_bits => panic!("unexpected num_bits: {num_bits}") + }; + Self::SCRATCH0_REG + } + src @ (Opnd::Mem(_) | Opnd::None | Opnd::VReg { .. }) => panic!("Unexpected source operand during arm64_emit: {src:?}") + }; + let src = A64Opnd::Reg(src_reg); + + // Split dest into SCRATCH1 if necessary. + let dest = if mem_disp_fits_bits(disp) { + dest.into() + } else { + assert_no_clobber(store_insn, src_reg.reg_no, Self::SCRATCH1_REG); + assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH1_REG); + load_effective_address(cb, Self::SCRATCH1, base_reg_no, disp); + A64Opnd::new_mem(dest_num_bits, Self::SCRATCH1, 0) + }; + // This order may be surprising but it is correct. The way // the Arm64 assembler works, the register that is going to // be stored is first and the address is second. However in // our IR we have the address first and the register second. - match dest.rm_num_bits() { + match dest_num_bits { 64 | 32 => stur(cb, src.into(), dest.into()), 16 => sturh(cb, src.into(), dest.into()), num_bits => panic!("unexpected dest num_bits: {} (src: {:#?}, dest: {:#?})", num_bits, src, dest), @@ -1045,21 +1132,7 @@ impl Assembler }; }, Opnd::Value(value) => { - // We dont need to check if it's a special const - // here because we only allow these operands to hit - // this point if they're not a special const. - assert!(!value.special_const_p()); - - // This assumes only load instructions can contain - // references to GC'd Value operands. If the value - // being loaded is a heap object, we'll report that - // back out to the gc_offsets list. 
- ldr_literal(cb, out.into(), 2.into()); - b(cb, InstructionOffset::from_bytes(4 + (SIZEOF_VALUE as i32))); - cb.write_bytes(&value.as_u64().to_le_bytes()); - - let ptr_offset = cb.get_write_ptr().sub_bytes(SIZEOF_VALUE); - gc_offsets.push(ptr_offset); + emit_load_gc_value(cb, &mut gc_offsets, out.into(), value); }, Opnd::None => { unreachable!("Attempted to load from None operand"); @@ -1097,20 +1170,7 @@ impl Assembler let &Opnd::Mem(Mem { num_bits: _, base: MemBase::Reg(base_reg_no), disp }) = opnd else { panic!("Unexpected Insn::Lea operand in arm64_emit: {opnd:?}"); }; - let out: A64Opnd = out.into(); - let base_reg = A64Opnd::Reg(A64Reg { num_bits: 64, reg_no: base_reg_no }); - assert_ne!(31, out.unwrap_reg().reg_no, "Insn::Lea sp, [sp, #imm] not always encodable. Use add/sub instead."); - - if ShiftedImmediate::try_from(disp.unsigned_abs() as u64).is_ok() { - // Use ADD/SUB if the displacement fits - add(cb, out, base_reg, A64Opnd::new_imm(disp.into())); - } else { - // Use add_extended() to interpret reg_no=31 as sp - // since the base register is never the zero register. - // Careful! Only the first two operands can refer to sp. - emit_load_value(cb, out, disp as u64); - add_extended(cb, out, base_reg, out); - }; + load_effective_address(cb, out.into(), base_reg_no, disp); } Insn::LeaJumpTarget { out, target, .. } => { if let Target::Label(label_idx) = target { @@ -1627,6 +1687,64 @@ mod tests { "); } + #[test] + fn test_store() { + let (mut asm, mut cb) = setup_asm(); + + // Large memory offsets in combinations of destination and source + let large_mem = Opnd::mem(64, NATIVE_STACK_PTR, -0x305); + let small_mem = Opnd::mem(64, C_RET_OPND, 0); + asm.store(small_mem, large_mem); + asm.store(large_mem, small_mem); + asm.store(large_mem, large_mem); + + asm.compile_with_num_regs(&mut cb, 0); + assert_disasm!(cb, "f0170cd1100240f8100000f8100040f8f1170cd1300200f8f0170cd1100240f8f1170cd1300200f8", " + 0x0: sub x16, sp, #0x305 + 0x4: ldur x16, [x16] + 0x8: stur x16, [x0] + 0xc: ldur x16, [x0] + 0x10: sub x17, sp, #0x305 + 0x14: stur x16, [x17] + 0x18: sub x16, sp, #0x305 + 0x1c: ldur x16, [x16] + 0x20: sub x17, sp, #0x305 + 0x24: stur x16, [x17] + "); + } + + #[test] + fn test_store_value_without_split() { + let (mut asm, mut cb) = setup_asm(); + + let imitation_heap_value = VALUE(0x1000); + assert!(imitation_heap_value.heap_object_p()); + asm.store(Opnd::mem(VALUE_BITS, SP, 0), imitation_heap_value.into()); + + // Side exit code are compiled without the split pass, so we directly call emit here to + // emulate that scenario. 
+ let gc_offsets = asm.arm64_emit(&mut cb).unwrap(); + assert_eq!(1, gc_offsets.len(), "VALUE source operand should be reported as gc offset"); + + assert_disasm!(cb, "50000058030000140010000000000000b00200f8", " + 0x0: ldr x16, #8 + 0x4: b #0x10 + 0x8: .byte 0x00, 0x10, 0x00, 0x00 + 0xc: .byte 0x00, 0x00, 0x00, 0x00 + 0x10: stur x16, [x21] + "); + } + + #[test] + #[should_panic] + fn test_store_unserviceable() { + let (mut asm, mut cb) = setup_asm(); + // This would put the source into SCRATCH_REG, messing up the destination + asm.store(Opnd::mem(64, Opnd::Reg(Assembler::SCRATCH_REG), 0), 0x83902.into()); + + asm.compile_with_num_regs(&mut cb, 0); + } + /* #[test] fn test_emit_lea_label() { diff --git a/zjit/src/backend/lir.rs b/zjit/src/backend/lir.rs index 36e783bd4e658a..b910052dae1b04 100644 --- a/zjit/src/backend/lir.rs +++ b/zjit/src/backend/lir.rs @@ -1822,29 +1822,16 @@ impl Assembler }; self.write_label(side_exit_label.clone()); - // Load an operand that cannot be used as a source of Insn::Store - fn split_store_source(asm: &mut Assembler, opnd: Opnd) -> Opnd { - if matches!(opnd, Opnd::Mem(_) | Opnd::Value(_)) || - (cfg!(target_arch = "aarch64") && matches!(opnd, Opnd::UImm(_))) { - asm.load_into(Opnd::Reg(Assembler::SCRATCH_REG), opnd); - Opnd::Reg(Assembler::SCRATCH_REG) - } else { - opnd - } - } - // Restore the PC and the stack for regular side exits. We don't do this for // side exits right after JIT-to-JIT calls, which restore them before the call. if let Some(SideExitContext { pc, stack, locals }) = context { asm_comment!(self, "write stack slots: {stack:?}"); for (idx, &opnd) in stack.iter().enumerate() { - let opnd = split_store_source(self, opnd); self.store(Opnd::mem(64, SP, idx as i32 * SIZEOF_VALUE_I32), opnd); } asm_comment!(self, "write locals: {locals:?}"); for (idx, &opnd) in locals.iter().enumerate() { - let opnd = split_store_source(self, opnd); self.store(Opnd::mem(64, SP, (-local_size_and_idx_to_ep_offset(locals.len(), idx) - 1) * SIZEOF_VALUE_I32), opnd); } diff --git a/zjit/src/backend/x86_64/mod.rs b/zjit/src/backend/x86_64/mod.rs index 4543252573800c..d21c7ee09c82d2 100644 --- a/zjit/src/backend/x86_64/mod.rs +++ b/zjit/src/backend/x86_64/mod.rs @@ -96,8 +96,10 @@ pub const ALLOC_REGS: &'static [Reg] = &[ impl Assembler { - // A special scratch register for intermediate processing. - // This register is caller-saved (so we don't have to save it before using it) + /// Special scratch registers for intermediate processing. + /// This register is call-clobbered (so we don't have to save it before using it). + /// Avoid using if you can since this is used to lower [Insn] internally and + /// so conflicts are possible. pub const SCRATCH_REG: Reg = R11_REG; const SCRATCH0: X86Opnd = X86Opnd::Reg(Assembler::SCRATCH_REG); @@ -293,38 +295,11 @@ impl Assembler asm.push_insn(insn); }, - Insn::Mov { dest, src } | Insn::Store { dest, src } => { - match (&dest, &src) { - (Opnd::Mem(_), Opnd::Mem(_)) => { - // We load opnd1 because for mov, opnd0 is the output - let opnd1 = asm.load(*src); - asm.mov(*dest, opnd1); - }, - (Opnd::Mem(Mem { num_bits, .. }), Opnd::UImm(value)) => { - // For 64 bit destinations, 32-bit values will be sign-extended - if *num_bits == 64 && imm_num_bits(*value as i64) > 32 { - let opnd1 = asm.load(*src); - asm.mov(*dest, opnd1); - } else { - asm.mov(*dest, *src); - } - }, - (Opnd::Mem(Mem { num_bits, .. 
}), Opnd::Imm(value)) => { - // For 64 bit destinations, 32-bit values will be sign-extended - if *num_bits == 64 && imm_num_bits(*value) > 32 { - let opnd1 = asm.load(*src); - asm.mov(*dest, opnd1); - } else if uimm_num_bits(*value as u64) <= *num_bits { - // If the bit string is short enough for the destination, use the unsigned representation. - // Note that 64-bit and negative values are ruled out. - asm.mov(*dest, Opnd::UImm(*value as u64)); - } else { - asm.mov(*dest, *src); - } - }, - _ => { - asm.mov(*dest, *src); - } + Insn::Mov { dest, src } => { + if let Opnd::Mem(_) = dest { + asm.store(*dest, *src); + } else { + asm.mov(*dest, *src); } }, Insn::Not { opnd, .. } => { @@ -440,6 +415,14 @@ impl Assembler } } + fn emit_load_gc_value(cb: &mut CodeBlock, gc_offsets: &mut Vec, dest_reg: X86Opnd, value: VALUE) { + // Using movabs because mov might write value in 32 bits + movabs(cb, dest_reg, value.0 as _); + // The pointer immediate is encoded as the last part of the mov written out + let ptr_offset = cb.get_write_ptr().sub_bytes(SIZEOF_VALUE); + gc_offsets.push(ptr_offset); + } + // List of GC offsets let mut gc_offsets: Vec = Vec::new(); @@ -552,20 +535,71 @@ impl Assembler shr(cb, opnd.into(), shift.into()) }, - Insn::Store { dest, src } => { - mov(cb, dest.into(), src.into()); - }, + store_insn @ Insn::Store { dest, src } => { + let &Opnd::Mem(Mem { num_bits, base: MemBase::Reg(base_reg_no), disp: _ }) = dest else { + panic!("Unexpected Insn::Store destination in x64_emit: {dest:?}"); + }; + + // This kind of tricky clobber can only happen for explicit use of SCRATCH_REG, + // so we panic to get the author to change their code. + #[track_caller] + fn assert_no_clobber(store_insn: &Insn, user_use: u8, backend_use: Reg) { + assert_ne!( + backend_use.reg_no, + user_use, + "Emitting {store_insn:?} would clobber {user_use:?}, in conflict with its semantics" + ); + } + + let scratch = X86Opnd::Reg(Self::SCRATCH_REG); + let src = match src { + Opnd::Reg(_) => src.into(), + &Opnd::Mem(_) => { + assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH_REG); + mov(cb, scratch, src.into()); + scratch + } + &Opnd::Imm(imm) => { + // For 64 bit destinations, 32-bit values will be sign-extended + if num_bits == 64 && imm_num_bits(imm) > 32 { + assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH_REG); + mov(cb, scratch, src.into()); + scratch + } else if uimm_num_bits(imm as u64) <= num_bits { + // If the bit string is short enough for the destination, use the unsigned representation. + // Note that 64-bit and negative values are ruled out. + uimm_opnd(imm as u64) + } else { + src.into() + } + } + &Opnd::UImm(imm) => { + // For 64 bit destinations, 32-bit values will be sign-extended + if num_bits == 64 && imm_num_bits(imm as i64) > 32 { + assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH_REG); + mov(cb, scratch, src.into()); + scratch + } else { + src.into() + } + } + &Opnd::Value(value) => { + assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH_REG); + emit_load_gc_value(cb, &mut gc_offsets, scratch, value); + scratch + } + src @ (Opnd::None | Opnd::VReg { .. 
}) => panic!("Unexpected source operand during x86_emit: {src:?}") + + }; + mov(cb, dest.into(), src); + } // This assumes only load instructions can contain references to GC'd Value operands Insn::Load { opnd, out } | Insn::LoadInto { dest: out, opnd } => { match opnd { Opnd::Value(val) if val.heap_object_p() => { - // Using movabs because mov might write value in 32 bits - movabs(cb, out.into(), val.0 as _); - // The pointer immediate is encoded as the last part of the mov written out - let ptr_offset = cb.get_write_ptr().sub_bytes(SIZEOF_VALUE); - gc_offsets.push(ptr_offset); + emit_load_gc_value(cb, &mut gc_offsets, out.into(), *val); } _ => mov(cb, out.into(), opnd.into()) } @@ -1352,4 +1386,21 @@ mod tests { 0x29: pop rbp "}); } + + #[test] + fn test_store_value_without_split() { + let (mut asm, mut cb) = setup_asm(); + + let imitation_heap_value = VALUE(0x1000); + assert!(imitation_heap_value.heap_object_p()); + asm.store(Opnd::mem(VALUE_BITS, SP, 0), imitation_heap_value.into()); + + let gc_offsets = asm.x86_emit(&mut cb).unwrap(); + assert_eq!(1, gc_offsets.len(), "VALUE source operand should be reported as gc offset"); + + assert_disasm!(cb, "49bb00100000000000004c891b", " + 0x0: movabs r11, 0x1000 + 0xa: mov qword ptr [rbx], r11 + "); + } } From 3ad6bba1369aa61feabe4ff0f1f0e8dd9245cebc Mon Sep 17 00:00:00 2001 From: Alan Wu Date: Fri, 1 Aug 2025 17:19:23 -0400 Subject: [PATCH 4/9] ZJIT: Refer to scratch registers in operands Co-authored-by: Takashi Kokubun --- zjit/src/backend/arm64/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs index 2025e444ae4464..e99f52cdb3168e 100644 --- a/zjit/src/backend/arm64/mod.rs +++ b/zjit/src/backend/arm64/mod.rs @@ -200,10 +200,10 @@ impl Assembler /// Avoid using if you can since this is used to lower [Insn] internally and /// so conflicts are possible. 
pub const SCRATCH_REG: Reg = X16_REG; - const SCRATCH0: A64Opnd = A64Opnd::Reg(Assembler::SCRATCH_REG); - const SCRATCH0_REG: Reg = Assembler::SCRATCH_REG; - const SCRATCH1: A64Opnd = A64Opnd::Reg(X17_REG); + const SCRATCH0_REG: Reg = Self::SCRATCH_REG; const SCRATCH1_REG: Reg = X17_REG; + const SCRATCH0: A64Opnd = A64Opnd::Reg(Self::SCRATCH0_REG); + const SCRATCH1: A64Opnd = A64Opnd::Reg(Self::SCRATCH1_REG); /// Get the list of registers from which we will allocate on this platform pub fn get_alloc_regs() -> Vec { From 19cbf8406a3dfa5c9a7eff712e33e272c10dbde0 Mon Sep 17 00:00:00 2001 From: Takashi Kokubun Date: Fri, 1 Aug 2025 17:37:00 -0700 Subject: [PATCH 5/9] ZJIT: Enable IncrCounter for arm64 (#14086) --- test/ruby/test_zjit.rb | 11 ++++++++++- zjit/src/backend/arm64/mod.rs | 5 +---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/test/ruby/test_zjit.rb b/test/ruby/test_zjit.rb index 684a4bb2d40723..6db57e18ba729c 100644 --- a/test/ruby/test_zjit.rb +++ b/test/ruby/test_zjit.rb @@ -1095,6 +1095,14 @@ def test_require_rubygems_with_auto_compact }, call_threshold: 2 end + def test_stats + assert_runs 'true', %q{ + def test = 1 + test + RubyVM::ZJIT.stats[:zjit_insns_count] > 0 + }, stats: true + end + def test_zjit_option_uses_array_each_in_ruby omit 'ZJIT wrongly compiles Array#each, so it is disabled for now' assert_runs '""', %q{ @@ -1414,12 +1422,13 @@ def assert_runs(expected, test_script, insns: [], assert_compiles: false, **opts end # Run a Ruby process with ZJIT options and a pipe for writing test results - def eval_with_jit(script, call_threshold: 1, num_profiles: 1, timeout: 1000, pipe_fd:, debug: true) + def eval_with_jit(script, call_threshold: 1, num_profiles: 1, stats: false, debug: true, timeout: 1000, pipe_fd:) args = [ "--disable-gems", "--zjit-call-threshold=#{call_threshold}", "--zjit-num-profiles=#{num_profiles}", ] + args << "--zjit-stats" if stats args << "--zjit-debug" if debug args << "-e" << script_shell_encode(script) pipe_r, pipe_w = IO.pipe diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs index e99f52cdb3168e..148d01ea862e97 100644 --- a/zjit/src/backend/arm64/mod.rs +++ b/zjit/src/backend/arm64/mod.rs @@ -1305,8 +1305,7 @@ impl Assembler } last_patch_pos = Some(cb.get_write_pos()); }, - Insn::IncrCounter { mem: _, value: _ } => { - /* + Insn::IncrCounter { mem, value } => { let label = cb.new_label("incr_counter_loop".to_string()); cb.write_label(label); @@ -1322,8 +1321,6 @@ impl Assembler cmp(cb, Self::SCRATCH1, A64Opnd::new_uimm(0)); emit_conditional_jump::<{Condition::NE}>(cb, Target::Label(label)); - */ - unimplemented!("labels are not supported yet"); }, Insn::Breakpoint => { brk(cb, A64Opnd::None); From 85510fc2ff5b5e44f1bc41613171f6008ad8890b Mon Sep 17 00:00:00 2001 From: Stan Lo Date: Fri, 1 Aug 2025 13:37:36 +0100 Subject: [PATCH 6/9] ZJIT: Support annotating builtin functions This allows us to annotate builtin functions with their return type. --- zjit/src/codegen.rs | 2 +- zjit/src/cruby_methods.rs | 49 ++++++++++++++++++++++++++++++++- zjit/src/hir.rs | 58 +++++++++++++++++++++++++++++++++------ zjit/src/state.rs | 4 +-- 4 files changed, 101 insertions(+), 12 deletions(-) diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index e7ebc1414af86e..433b22e15d2c79 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -340,7 +340,7 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio Insn::SendWithoutBlockDirect { cd, state, self_val, args, .. 
} if args.len() + 1 > C_ARG_OPNDS.len() => // +1 for self gen_send_without_block(jit, asm, *cd, &function.frame_state(*state), opnd!(self_val), opnds!(args))?, Insn::SendWithoutBlockDirect { cme, iseq, self_val, args, state, .. } => gen_send_without_block_direct(cb, jit, asm, *cme, *iseq, opnd!(self_val), opnds!(args), &function.frame_state(*state))?, - Insn::InvokeBuiltin { bf, args, state } => gen_invokebuiltin(asm, &function.frame_state(*state), bf, opnds!(args))?, + Insn::InvokeBuiltin { bf, args, state, .. } => gen_invokebuiltin(asm, &function.frame_state(*state), bf, opnds!(args))?, Insn::Return { val } => return Some(gen_return(asm, opnd!(val))?), Insn::FixnumAdd { left, right, state } => gen_fixnum_add(jit, asm, opnd!(left), opnd!(right), &function.frame_state(*state))?, Insn::FixnumSub { left, right, state } => gen_fixnum_sub(jit, asm, opnd!(left), opnd!(right), &function.frame_state(*state))?, diff --git a/zjit/src/cruby_methods.rs b/zjit/src/cruby_methods.rs index 9c29ed5472d8f1..4851671a5b2f04 100644 --- a/zjit/src/cruby_methods.rs +++ b/zjit/src/cruby_methods.rs @@ -15,6 +15,11 @@ use crate::hir_type::{types, Type}; pub struct Annotations { cfuncs: HashMap<*mut c_void, FnProperties>, + // Builtin annotations by function pointer for fast runtime lookup + // Uses Option to cache both hits and misses + builtin_ptrs: HashMap<*mut c_void, Option>, + // Temporary name-based annotations used during initialization + builtin_names: HashMap<&'static str, FnProperties>, } /// Runtime behaviors of C functions that implement a Ruby method @@ -41,6 +46,25 @@ impl Annotations { }; self.cfuncs.get(&fn_ptr).copied() } + + /// Query about properties of a builtin function by its pointer + /// If not found by pointer, checks by name and caches the result (including misses) + pub fn get_builtin_properties(&mut self, bf: *const rb_builtin_function) -> Option { + let func_ptr = unsafe { (*bf).func_ptr as *mut c_void }; + + // First check if we already have it cached by pointer + if let Some(&cached_result) = self.builtin_ptrs.get(&func_ptr) { + return cached_result; + } + + // If not found, check by name and cache the result + let name = unsafe { std::ffi::CStr::from_ptr((*bf).name).to_str().ok()? }; + let result = self.builtin_names.get(name).copied(); + + // Cache the result (both hits and misses) + self.builtin_ptrs.insert(func_ptr, result); + result + } } fn annotate_c_method(props_map: &mut HashMap<*mut c_void, FnProperties>, class: VALUE, method_name: &'static str, props: FnProperties) { @@ -63,6 +87,7 @@ fn annotate_c_method(props_map: &mut HashMap<*mut c_void, FnProperties>, class: /// are about the stock versions of methods. pub fn init() -> Annotations { let cfuncs = &mut HashMap::new(); + let builtin_names = &mut HashMap::new(); macro_rules! annotate { ($module:ident, $method_name:literal, $return_type:expr, $($properties:ident),+) => { @@ -74,6 +99,22 @@ pub fn init() -> Annotations { } } + macro_rules! 
annotate_builtin { + ($name:literal, $return_type:expr) => { + annotate_builtin!($name, $return_type, no_gc, leaf, elidable) + }; + ($name:literal, $return_type:expr, $($properties:ident),+) => { + let mut props = FnProperties { + no_gc: false, + leaf: false, + elidable: false, + return_type: $return_type + }; + $(props.$properties = true;)+ + builtin_names.insert($name, props); + } + } + annotate!(rb_mKernel, "itself", types::BasicObject, no_gc, leaf, elidable); annotate!(rb_cString, "bytesize", types::Fixnum, no_gc, leaf); annotate!(rb_cModule, "name", types::StringExact.union(types::NilClass), no_gc, leaf, elidable); @@ -83,7 +124,13 @@ pub fn init() -> Annotations { annotate!(rb_cNilClass, "nil?", types::TrueClass, no_gc, leaf, elidable); annotate!(rb_mKernel, "nil?", types::FalseClass, no_gc, leaf, elidable); + // Annotate builtin functions + annotate_builtin!("rb_f_float", types::Flonum); + annotate_builtin!("rb_f_float1", types::Flonum); + Annotations { - cfuncs: std::mem::take(cfuncs) + cfuncs: std::mem::take(cfuncs), + builtin_ptrs: HashMap::new(), // Cache queried built-in functions by pointer + builtin_names: std::mem::take(builtin_names) } } diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs index 61514c88769d87..b5b7a8fe717604 100644 --- a/zjit/src/hir.rs +++ b/zjit/src/hir.rs @@ -522,7 +522,12 @@ pub enum Insn { }, // Invoke a builtin function - InvokeBuiltin { bf: rb_builtin_function, args: Vec, state: InsnId }, + InvokeBuiltin { + bf: rb_builtin_function, + args: Vec, + state: InsnId, + return_type: Option, // None for unannotated builtins + }, /// Control flow instructions Return { val: InsnId }, @@ -1163,7 +1168,7 @@ impl Function { args: find_vec!(args), state, }, - &InvokeBuiltin { bf, ref args, state } => InvokeBuiltin { bf: bf, args: find_vec!(args), state }, + &InvokeBuiltin { bf, ref args, state, return_type } => InvokeBuiltin { bf, args: find_vec!(args), state, return_type }, &ArrayDup { val, state } => ArrayDup { val: find!(val), state }, &HashDup { val, state } => HashDup { val: find!(val), state }, &CCall { cfun, ref args, name, return_type, elidable } => CCall { cfun, args: find_vec!(args), name, return_type, elidable }, @@ -1260,7 +1265,7 @@ impl Function { Insn::SendWithoutBlock { .. } => types::BasicObject, Insn::SendWithoutBlockDirect { .. } => types::BasicObject, Insn::Send { .. } => types::BasicObject, - Insn::InvokeBuiltin { .. } => types::BasicObject, + Insn::InvokeBuiltin { return_type, .. } => return_type.unwrap_or(types::BasicObject), Insn::Defined { .. } => types::BasicObject, Insn::DefinedIvar { .. } => types::BasicObject, Insn::GetConstantPath { .. 
} => types::BasicObject, @@ -3119,7 +3124,18 @@ pub fn iseq_to_hir(iseq: *const rb_iseq_t) -> Result { args.reverse(); let exit_id = fun.push_insn(block, Insn::Snapshot { state: exit_state }); - let insn_id = fun.push_insn(block, Insn::InvokeBuiltin { bf, args, state: exit_id }); + + // Check if this builtin is annotated + let return_type = ZJITState::get_method_annotations() + .get_builtin_properties(&bf) + .map(|props| props.return_type); + + let insn_id = fun.push_insn(block, Insn::InvokeBuiltin { + bf, + args, + state: exit_id, + return_type, + }); state.stack_push(insn_id); } YARVINSN_opt_invokebuiltin_delegate | @@ -3134,7 +3150,18 @@ pub fn iseq_to_hir(iseq: *const rb_iseq_t) -> Result { } let exit_id = fun.push_insn(block, Insn::Snapshot { state: exit_state }); - let insn_id = fun.push_insn(block, Insn::InvokeBuiltin { bf, args, state: exit_id }); + + // Check if this builtin is annotated + let return_type = ZJITState::get_method_annotations() + .get_builtin_properties(&bf) + .map(|props| props.return_type); + + let insn_id = fun.push_insn(block, Insn::InvokeBuiltin { + bf, + args, + state: exit_id, + return_type, + }); state.stack_push(insn_id); } YARVINSN_objtostring => { @@ -4973,17 +5000,32 @@ mod tests { } #[test] - fn test_invokebuiltin_delegate_with_args() { + fn test_invokebuiltin_delegate_annotated() { assert_method_hir_with_opcode("Float", YARVINSN_opt_invokebuiltin_delegate_leave, expect![[r#" fn Float@:197: bb0(v0:BasicObject, v1:BasicObject, v2:BasicObject, v3:BasicObject): - v6:BasicObject = InvokeBuiltin rb_f_float, v0, v1, v2 + v6:Flonum = InvokeBuiltin rb_f_float, v0, v1, v2 Jump bb1(v0, v1, v2, v3, v6) - bb1(v8:BasicObject, v9:BasicObject, v10:BasicObject, v11:BasicObject, v12:BasicObject): + bb1(v8:BasicObject, v9:BasicObject, v10:BasicObject, v11:BasicObject, v12:Flonum): Return v12 "#]]); } + #[test] + fn test_invokebuiltin_delegate_with_args() { + // Using an unannotated builtin to test InvokeBuiltin generation + let iseq = crate::cruby::with_rubyvm(|| get_method_iseq("Dir", "open")); + assert!(iseq_contains_opcode(iseq, YARVINSN_opt_invokebuiltin_delegate), "iseq Dir.open does not contain invokebuiltin"); + let function = iseq_to_hir(iseq).unwrap(); + assert_function_hir(function, expect![[r#" + fn open@:184: + bb0(v0:BasicObject, v1:BasicObject, v2:BasicObject, v3:BasicObject, v4:BasicObject): + v5:NilClass = Const Value(nil) + v8:BasicObject = InvokeBuiltin dir_s_open, v0, v1, v2 + SideExit UnknownOpcode(getblockparamproxy) + "#]]); + } + #[test] fn test_invokebuiltin_delegate_without_args() { assert_method_hir_with_opcode("class", YARVINSN_opt_invokebuiltin_delegate_leave, expect![[r#" diff --git a/zjit/src/state.rs b/zjit/src/state.rs index 79be91fd85e5c6..b572e150eb2f19 100644 --- a/zjit/src/state.rs +++ b/zjit/src/state.rs @@ -117,8 +117,8 @@ impl ZJITState { &mut ZJITState::get_instance().invariants } - pub fn get_method_annotations() -> &'static cruby_methods::Annotations { - &ZJITState::get_instance().method_annotations + pub fn get_method_annotations() -> &'static mut cruby_methods::Annotations { + &mut ZJITState::get_instance().method_annotations } /// Return true if successful compilation should be asserted From 3c1ca509b844929bf3cd5dbe81c09474c1b8a205 Mon Sep 17 00:00:00 2001 From: Stan Lo Date: Fri, 1 Aug 2025 21:16:14 +0100 Subject: [PATCH 7/9] ZJIT: Improve builtin function annotation collection --- zjit/src/cruby_methods.rs | 97 +++++++++++++++++++++++++++------------ zjit/src/state.rs | 4 +- 2 files changed, 69 insertions(+), 32 deletions(-) 
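For a rough picture of what this collection feeds, the annotation registry can
be modeled as a plain pointer-keyed map: once the builtin's function pointer is
known, the compile-time query is a single map probe with no name comparison or
caching. The types and pointer values below are stand-ins for FnProperties and
Annotations in cruby_methods.rs, not the real API:

    use std::collections::HashMap;

    // Stand-in for the FnProperties struct in cruby_methods.rs.
    #[derive(Clone, Copy)]
    struct FnProperties { no_gc: bool, leaf: bool, elidable: bool, return_type: &'static str }

    struct Annotations { builtin_funcs: HashMap<usize, FnProperties> }

    impl Annotations {
        // Registered at boot, once the builtin's function pointer is known.
        fn annotate_builtin(&mut self, func_ptr: usize, props: FnProperties) {
            self.builtin_funcs.insert(func_ptr, props);
        }

        // Queried during compilation; a plain pointer lookup, no caching needed.
        fn get_builtin_properties(&self, func_ptr: usize) -> Option<FnProperties> {
            self.builtin_funcs.get(&func_ptr).copied()
        }
    }

    fn main() {
        let mut annotations = Annotations { builtin_funcs: HashMap::new() };
        // Pretend 0x1234 is the pointer discovered by scanning an annotated method's ISEQ.
        annotations.annotate_builtin(0x1234, FnProperties {
            no_gc: true, leaf: true, elidable: true, return_type: "Flonum",
        });

        let props = annotations.get_builtin_properties(0x1234).expect("annotated builtin");
        assert!(props.no_gc && props.leaf && props.elidable);
        assert_eq!(props.return_type, "Flonum");
        assert!(annotations.get_builtin_properties(0x5678).is_none());
    }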
diff --git a/zjit/src/cruby_methods.rs b/zjit/src/cruby_methods.rs index 4851671a5b2f04..08c2cee7b7a1ce 100644 --- a/zjit/src/cruby_methods.rs +++ b/zjit/src/cruby_methods.rs @@ -15,11 +15,7 @@ use crate::hir_type::{types, Type}; pub struct Annotations { cfuncs: HashMap<*mut c_void, FnProperties>, - // Builtin annotations by function pointer for fast runtime lookup - // Uses Option to cache both hits and misses - builtin_ptrs: HashMap<*mut c_void, Option>, - // Temporary name-based annotations used during initialization - builtin_names: HashMap<&'static str, FnProperties>, + builtin_funcs: HashMap<*mut c_void, FnProperties>, } /// Runtime behaviors of C functions that implement a Ruby method @@ -48,22 +44,9 @@ impl Annotations { } /// Query about properties of a builtin function by its pointer - /// If not found by pointer, checks by name and caches the result (including misses) - pub fn get_builtin_properties(&mut self, bf: *const rb_builtin_function) -> Option { + pub fn get_builtin_properties(&self, bf: *const rb_builtin_function) -> Option { let func_ptr = unsafe { (*bf).func_ptr as *mut c_void }; - - // First check if we already have it cached by pointer - if let Some(&cached_result) = self.builtin_ptrs.get(&func_ptr) { - return cached_result; - } - - // If not found, check by name and cache the result - let name = unsafe { std::ffi::CStr::from_ptr((*bf).name).to_str().ok()? }; - let result = self.builtin_names.get(name).copied(); - - // Cache the result (both hits and misses) - self.builtin_ptrs.insert(func_ptr, result); - result + self.builtin_funcs.get(&func_ptr).copied() } } @@ -83,11 +66,67 @@ fn annotate_c_method(props_map: &mut HashMap<*mut c_void, FnProperties>, class: props_map.insert(fn_ptr, props); } +/// Look up a method and find its builtin function pointer by parsing its ISEQ +/// We currently only support methods with exactly one invokebuiltin instruction +fn annotate_builtin_method(props_map: &mut HashMap<*mut c_void, FnProperties>, class: VALUE, method_name: &'static str, props: FnProperties) { + unsafe { + let method_id = rb_intern2(method_name.as_ptr().cast(), method_name.len().try_into().unwrap()); + let method = rb_method_entry_at(class, method_id); + if method.is_null() { + panic!("Method {}#{} not found", std::ffi::CStr::from_ptr(rb_class2name(class)).to_str().unwrap_or("?"), method_name); + } + + // Cast ME to CME - they have identical layout + let cme = method.cast::(); + let def_type = get_cme_def_type(cme); + + if def_type != VM_METHOD_TYPE_ISEQ { + panic!("Method {}#{} is not an ISEQ method (type: {})", + std::ffi::CStr::from_ptr(rb_class2name(class)).to_str().unwrap_or("?"), + method_name, def_type); + } + + // Get the ISEQ from the method definition + let iseq = get_def_iseq_ptr((*cme).def); + if iseq.is_null() { + panic!("Failed to get ISEQ for {}#{}", + std::ffi::CStr::from_ptr(rb_class2name(class)).to_str().unwrap_or("?"), + method_name); + } + + // Get the size of the ISEQ in instruction units + let encoded_size = rb_iseq_encoded_size(iseq); + + // Scan through the ISEQ to find invokebuiltin instructions + let mut insn_idx: u32 = 0; + while insn_idx < encoded_size { + // Get the PC for this instruction index + let pc = rb_iseq_pc_at_idx(iseq, insn_idx); + + // Get the opcode using the proper decoder + let opcode = rb_iseq_opcode_at_pc(iseq, pc); + + if opcode == YARVINSN_invokebuiltin as i32 || + opcode == YARVINSN_opt_invokebuiltin_delegate as i32 || + opcode == YARVINSN_opt_invokebuiltin_delegate_leave as i32 { + // The first operand is the builtin 
function pointer + let bf_value = *pc.add(1); + let bf_ptr = bf_value.as_ptr() as *const rb_builtin_function; + let func_ptr = (*bf_ptr).func_ptr as *mut c_void; + props_map.insert(func_ptr, props); + } + + // Move to the next instruction using the proper length + insn_idx = insn_idx.saturating_add(rb_insn_len(VALUE(opcode as usize)).try_into().unwrap()); + } + } +} + /// Gather annotations. Run this right after boot since the annotations /// are about the stock versions of methods. pub fn init() -> Annotations { let cfuncs = &mut HashMap::new(); - let builtin_names = &mut HashMap::new(); + let builtin_funcs = &mut HashMap::new(); macro_rules! annotate { ($module:ident, $method_name:literal, $return_type:expr, $($properties:ident),+) => { @@ -100,10 +139,10 @@ pub fn init() -> Annotations { } macro_rules! annotate_builtin { - ($name:literal, $return_type:expr) => { - annotate_builtin!($name, $return_type, no_gc, leaf, elidable) + ($module:ident, $method_name:literal, $return_type:expr) => { + annotate_builtin!($module, $method_name, $return_type, no_gc, leaf, elidable) }; - ($name:literal, $return_type:expr, $($properties:ident),+) => { + ($module:ident, $method_name:literal, $return_type:expr, $($properties:ident),+) => { let mut props = FnProperties { no_gc: false, leaf: false, @@ -111,7 +150,7 @@ pub fn init() -> Annotations { return_type: $return_type }; $(props.$properties = true;)+ - builtin_names.insert($name, props); + annotate_builtin_method(builtin_funcs, unsafe { $module }, $method_name, props); } } @@ -124,13 +163,11 @@ pub fn init() -> Annotations { annotate!(rb_cNilClass, "nil?", types::TrueClass, no_gc, leaf, elidable); annotate!(rb_mKernel, "nil?", types::FalseClass, no_gc, leaf, elidable); - // Annotate builtin functions - annotate_builtin!("rb_f_float", types::Flonum); - annotate_builtin!("rb_f_float1", types::Flonum); + annotate_builtin!(rb_mKernel, "Float", types::Flonum); + annotate_builtin!(rb_mKernel, "Integer", types::Integer); Annotations { cfuncs: std::mem::take(cfuncs), - builtin_ptrs: HashMap::new(), // Cache queried built-in functions by pointer - builtin_names: std::mem::take(builtin_names) + builtin_funcs: std::mem::take(builtin_funcs), } } diff --git a/zjit/src/state.rs b/zjit/src/state.rs index b572e150eb2f19..79be91fd85e5c6 100644 --- a/zjit/src/state.rs +++ b/zjit/src/state.rs @@ -117,8 +117,8 @@ impl ZJITState { &mut ZJITState::get_instance().invariants } - pub fn get_method_annotations() -> &'static mut cruby_methods::Annotations { - &mut ZJITState::get_instance().method_annotations + pub fn get_method_annotations() -> &'static cruby_methods::Annotations { + &ZJITState::get_instance().method_annotations } /// Return true if successful compilation should be asserted From 44dee185aa4c9814da4a50d90db7c80d5a787e0f Mon Sep 17 00:00:00 2001 From: Stan Lo Date: Fri, 1 Aug 2025 22:20:20 +0100 Subject: [PATCH 8/9] ZJIT: Annotate Kernel#class --- zjit/src/cruby_methods.rs | 1 + zjit/src/hir.rs | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/zjit/src/cruby_methods.rs b/zjit/src/cruby_methods.rs index 08c2cee7b7a1ce..e03dbe2cd94396 100644 --- a/zjit/src/cruby_methods.rs +++ b/zjit/src/cruby_methods.rs @@ -165,6 +165,7 @@ pub fn init() -> Annotations { annotate_builtin!(rb_mKernel, "Float", types::Flonum); annotate_builtin!(rb_mKernel, "Integer", types::Integer); + annotate_builtin!(rb_mKernel, "class", types::Class, leaf); Annotations { cfuncs: std::mem::take(cfuncs), diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs index 
b5b7a8fe717604..976580c85b25e1 100644 --- a/zjit/src/hir.rs +++ b/zjit/src/hir.rs @@ -5011,6 +5011,18 @@ mod tests { "#]]); } + #[test] + fn test_invokebuiltin_cexpr_annotated() { + assert_method_hir_with_opcode("class", YARVINSN_opt_invokebuiltin_delegate_leave, expect![[r#" + fn class@:20: + bb0(v0:BasicObject): + v3:Class = InvokeBuiltin _bi20, v0 + Jump bb1(v0, v3) + bb1(v5:BasicObject, v6:Class): + Return v6 + "#]]); + } + #[test] fn test_invokebuiltin_delegate_with_args() { // Using an unannotated builtin to test InvokeBuiltin generation @@ -5028,10 +5040,13 @@ mod tests { #[test] fn test_invokebuiltin_delegate_without_args() { - assert_method_hir_with_opcode("class", YARVINSN_opt_invokebuiltin_delegate_leave, expect![[r#" - fn class@:20: + let iseq = crate::cruby::with_rubyvm(|| get_method_iseq("GC", "enable")); + assert!(iseq_contains_opcode(iseq, YARVINSN_opt_invokebuiltin_delegate_leave), "iseq GC.enable does not contain invokebuiltin"); + let function = iseq_to_hir(iseq).unwrap(); + assert_function_hir(function, expect![[r#" + fn enable@:55: bb0(v0:BasicObject): - v3:BasicObject = InvokeBuiltin _bi20, v0 + v3:BasicObject = InvokeBuiltin gc_enable, v0 Jump bb1(v0, v3) bb1(v5:BasicObject, v6:BasicObject): Return v6 From 30a20bc166bc37acd7dcb3788686df149c7f428a Mon Sep 17 00:00:00 2001 From: Stan Lo Date: Fri, 1 Aug 2025 22:49:52 +0100 Subject: [PATCH 9/9] ZJIT: Reject builtin annotation if its iseq has multiple invokebuiltin insns --- zjit/src/cruby_methods.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/zjit/src/cruby_methods.rs b/zjit/src/cruby_methods.rs index e03dbe2cd94396..8d1548f92b1717 100644 --- a/zjit/src/cruby_methods.rs +++ b/zjit/src/cruby_methods.rs @@ -99,6 +99,8 @@ fn annotate_builtin_method(props_map: &mut HashMap<*mut c_void, FnProperties>, c // Scan through the ISEQ to find invokebuiltin instructions let mut insn_idx: u32 = 0; + let mut func_ptr = std::ptr::null_mut::(); + while insn_idx < encoded_size { // Get the PC for this instruction index let pc = rb_iseq_pc_at_idx(iseq, insn_idx); @@ -112,13 +114,22 @@ fn annotate_builtin_method(props_map: &mut HashMap<*mut c_void, FnProperties>, c // The first operand is the builtin function pointer let bf_value = *pc.add(1); let bf_ptr = bf_value.as_ptr() as *const rb_builtin_function; - let func_ptr = (*bf_ptr).func_ptr as *mut c_void; - props_map.insert(func_ptr, props); + + if func_ptr.is_null() { + func_ptr = (*bf_ptr).func_ptr as *mut c_void; + } else { + panic!("Multiple invokebuiltin instructions found in ISEQ for {}#{}", + std::ffi::CStr::from_ptr(rb_class2name(class)).to_str().unwrap_or("?"), + method_name); + } } // Move to the next instruction using the proper length insn_idx = insn_idx.saturating_add(rb_insn_len(VALUE(opcode as usize)).try_into().unwrap()); } + + // Only insert the properties if its iseq has exactly one invokebuiltin instruction + props_map.insert(func_ptr, props); } }
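As a standalone sketch of the rule this last patch enforces (stand-in types
rather than the real ISEQ accessors; the actual code panics on a second
invokebuiltin instead of returning None):

    // Stand-in instruction stream; the real code walks the ISEQ with
    // rb_iseq_pc_at_idx/rb_iseq_opcode_at_pc instead.
    #[derive(Clone, Copy, PartialEq)]
    enum Opcode { InvokeBuiltin, OptInvokeBuiltinDelegate, OptInvokeBuiltinDelegateLeave, Other }

    struct Insn { opcode: Opcode, builtin_func_ptr: Option<usize> }

    /// Return the builtin function pointer only when the sequence holds
    /// exactly one invokebuiltin-style instruction.
    fn single_builtin_ptr(iseq: &[Insn]) -> Option<usize> {
        let mut found: Option<usize> = None;
        for insn in iseq {
            let is_invokebuiltin = matches!(
                insn.opcode,
                Opcode::InvokeBuiltin
                    | Opcode::OptInvokeBuiltinDelegate
                    | Opcode::OptInvokeBuiltinDelegateLeave
            );
            if is_invokebuiltin {
                if found.is_some() {
                    return None; // ambiguous: more than one invokebuiltin
                }
                found = insn.builtin_func_ptr;
            }
        }
        found
    }

    fn main() {
        let ok = [
            Insn { opcode: Opcode::Other, builtin_func_ptr: None },
            Insn { opcode: Opcode::OptInvokeBuiltinDelegateLeave, builtin_func_ptr: Some(0x1234) },
        ];
        assert_eq!(single_builtin_ptr(&ok), Some(0x1234));

        let ambiguous = [
            Insn { opcode: Opcode::InvokeBuiltin, builtin_func_ptr: Some(0x1234) },
            Insn { opcode: Opcode::InvokeBuiltin, builtin_func_ptr: Some(0x5678) },
        ];
        assert_eq!(single_builtin_ptr(&ambiguous), None);
    }

Restricting annotation to methods with a single invokebuiltin keeps the
pointer-to-properties mapping unambiguous.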