diff --git a/doc/string/partition.rdoc b/doc/string/partition.rdoc index ece034ee66225e..ee445bd21f32ba 100644 --- a/doc/string/partition.rdoc +++ b/doc/string/partition.rdoc @@ -29,7 +29,7 @@ If +pattern+ is a Regexp, performs the equivalent of self.match(pattern) ["hello", "", ""] If +pattern+ is not a Regexp, converts it to a string (if it is not already one), -then performs the equivalet of self.index(pattern) +then performs the equivalent of self.index(pattern) (and does _not_ set {pattern-matching global variables}[rdoc-ref:globals.md@Pattern+Matching]): 'hello'.partition('h') # => ["", "h", "ello"] diff --git a/insns.def b/insns.def index eef0d3f5dc1124..69a8210d7d6f99 100644 --- a/insns.def +++ b/insns.def @@ -1470,6 +1470,7 @@ opt_ltlt * string. Then what happens if that codepoint does not exist in the * string's encoding? Of course an exception. That's not a leaf. */ // attr bool leaf = false; /* has "invalid codepoint" exception */ +// attr bool zjit_profile = true; { val = vm_opt_ltlt(recv, obj); @@ -1537,6 +1538,7 @@ opt_aset /* This is another story than opt_aref. When vm_opt_aset() resorts * to rb_hash_aset(), which should call #hash for `obj`. */ // attr bool leaf = false; /* has rb_funcall() */ /* calls #hash */ +// attr bool zjit_profile = true; { val = vm_opt_aset(recv, obj, set); @@ -1551,6 +1553,7 @@ opt_length (CALL_DATA cd) (VALUE recv) (VALUE val) +// attr bool zjit_profile = true; { val = vm_opt_length(recv, BOP_LENGTH); @@ -1565,6 +1568,7 @@ opt_size (CALL_DATA cd) (VALUE recv) (VALUE val) +// attr bool zjit_profile = true; { val = vm_opt_length(recv, BOP_SIZE); @@ -1624,6 +1628,7 @@ opt_regexpmatch2 (VALUE obj2, VALUE obj1) (VALUE val) // attr bool leaf = false; /* match_at() has rb_thread_check_ints() */ +// attr bool zjit_profile = true; { val = vm_opt_regexpmatch2(obj2, obj1); diff --git a/yjit/src/backend/ir.rs b/yjit/src/backend/ir.rs index 40df3ae4d5830b..8205d6de76bd2f 100644 --- a/yjit/src/backend/ir.rs +++ b/yjit/src/backend/ir.rs @@ -1602,7 +1602,7 @@ impl Assembler if c_args.len() > 0 { // Resolve C argument dependencies let c_args_len = c_args.len() as isize; - let moves = Self::reorder_reg_moves(&c_args.drain(..).collect()); + let moves = Self::reorder_reg_moves(&std::mem::take(&mut c_args)); shift_live_ranges(&mut shifted_live_ranges, asm.insns.len(), moves.len() as isize - c_args_len); // Push batched C arguments diff --git a/zjit/src/asm/mod.rs b/zjit/src/asm/mod.rs index aeb429382d88c0..32dc633a2941ce 100644 --- a/zjit/src/asm/mod.rs +++ b/zjit/src/asm/mod.rs @@ -20,7 +20,6 @@ pub mod arm64; pub struct Label(pub usize); /// Reference to an ASM label -#[derive(Clone)] pub struct LabelRef { // Position in the code block where the label reference exists pos: usize, @@ -34,7 +33,7 @@ pub struct LabelRef { num_bytes: usize, /// The object that knows how to encode the branch instruction. 
- encode: fn(&mut CodeBlock, i64, i64) + encode: Box, } /// Block of memory into which instructions can be assembled @@ -223,11 +222,11 @@ impl CodeBlock { } // Add a label reference at the current write position - pub fn label_ref(&mut self, label: Label, num_bytes: usize, encode: fn(&mut CodeBlock, i64, i64)) { + pub fn label_ref(&mut self, label: Label, num_bytes: usize, encode: impl Fn(&mut CodeBlock, i64, i64) + 'static) { assert!(label.0 < self.label_addrs.len()); // Keep track of the reference - self.label_refs.push(LabelRef { pos: self.write_pos, label, num_bytes, encode }); + self.label_refs.push(LabelRef { pos: self.write_pos, label, num_bytes, encode: Box::new(encode) }); // Move past however many bytes the instruction takes up if self.write_pos + num_bytes < self.mem_size { @@ -251,7 +250,7 @@ impl CodeBlock { assert!(label_addr < self.mem_size); self.write_pos = ref_pos; - (label_ref.encode)(self, (ref_pos + label_ref.num_bytes) as i64, label_addr as i64); + (label_ref.encode.as_ref())(self, (ref_pos + label_ref.num_bytes) as i64, label_addr as i64); // Assert that we've written the same number of bytes that we // expected to have written. diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs index 5ac62c059986d5..6750926b35daa1 100644 --- a/zjit/src/backend/arm64/mod.rs +++ b/zjit/src/backend/arm64/mod.rs @@ -113,8 +113,8 @@ fn emit_jmp_ptr(cb: &mut CodeBlock, dst_ptr: CodePtr, padding: bool) { b(cb, InstructionOffset::from_bytes((dst_addr - src_addr) as i32)); 1 } else { - let num_insns = emit_load_value(cb, Assembler::SCRATCH0, dst_addr as u64); - br(cb, Assembler::SCRATCH0); + let num_insns = emit_load_value(cb, Assembler::EMIT0_OPND, dst_addr as u64); + br(cb, Assembler::EMIT0_OPND); num_insns + 1 }; @@ -181,7 +181,7 @@ fn emit_load_value(cb: &mut CodeBlock, rd: A64Opnd, value: u64) -> usize { /// List of registers that can be used for register allocation. /// This has the same number of registers for x86_64 and arm64. -/// SCRATCH0 and SCRATCH1 are excluded. +/// SCRATCH_OPND, EMIT0_OPND, and EMIT1_OPND are excluded. pub const ALLOC_REGS: &[Reg] = &[ X0_REG, X1_REG, @@ -193,17 +193,33 @@ pub const ALLOC_REGS: &[Reg] = &[ X12_REG, ]; -impl Assembler -{ - /// Special scratch registers for intermediate processing. - /// This register is call-clobbered (so we don't have to save it before using it). - /// Avoid using if you can since this is used to lower [Insn] internally and - /// so conflicts are possible. - pub const SCRATCH_REG: Reg = X16_REG; - const SCRATCH0_REG: Reg = Self::SCRATCH_REG; - const SCRATCH1_REG: Reg = X17_REG; - const SCRATCH0: A64Opnd = A64Opnd::Reg(Self::SCRATCH0_REG); - const SCRATCH1: A64Opnd = A64Opnd::Reg(Self::SCRATCH1_REG); +/// Special scratch register for intermediate processing. It should be used only by +/// [`Assembler::arm64_split_with_scratch_reg`] or [`Assembler::new_with_scratch_reg`]. +const SCRATCH_OPND: Opnd = Opnd::Reg(X15_REG); + +impl Assembler { + /// Special registers for intermediate processing in arm64_emit. It should be used only by arm64_emit. + /// TODO: Remove the use of these registers by splitting instructions in arm64_split_with_scratch_reg. + const EMIT0_REG: Reg = X16_REG; + const EMIT1_REG: Reg = X17_REG; + const EMIT0_OPND: A64Opnd = A64Opnd::Reg(Self::EMIT0_REG); + const EMIT1_OPND: A64Opnd = A64Opnd::Reg(Self::EMIT1_REG); + + /// Return an Assembler with scratch registers disabled in the backend, and a scratch register. 
+ pub fn new_with_scratch_reg() -> (Self, Opnd) { + (Self::new_with_label_names(Vec::default(), 0, true), SCRATCH_OPND) + } + + /// Return true if opnd contains a scratch reg + pub fn has_scratch_reg(opnd: Opnd) -> bool { + match opnd { + Opnd::Reg(_) => opnd == SCRATCH_OPND, + Opnd::Mem(Mem { base: MemBase::Reg(reg_no), .. }) => { + reg_no == SCRATCH_OPND.unwrap_reg().reg_no + } + _ => false, + } + } /// Get the list of registers from which we will allocate on this platform pub fn get_alloc_regs() -> Vec { @@ -372,7 +388,7 @@ impl Assembler let live_ranges: Vec = take(&mut self.live_ranges); let mut iterator = self.insns.into_iter().enumerate().peekable(); - let mut asm_local = Assembler::new_with_label_names(take(&mut self.label_names), live_ranges.len()); + let mut asm_local = Assembler::new_with_label_names(take(&mut self.label_names), live_ranges.len(), self.accept_scratch_reg); let asm = &mut asm_local; while let Some((index, mut insn)) = iterator.next() { @@ -555,14 +571,6 @@ impl Assembler } } }, - Insn::IncrCounter { mem, value } => { - let counter_addr = match mem { - Opnd::Mem(_) => asm.lea(*mem), - _ => *mem - }; - - asm.incr_counter(counter_addr, *value); - }, Insn::JmpOpnd(opnd) => { if let Opnd::Mem(_) = opnd { let opnd0 = split_load_operand(asm, *opnd); @@ -679,6 +687,58 @@ impl Assembler asm_local } + /// Split instructions using scratch registers. To maximize the use of the register pool for + /// VRegs, most splits should happen in [`Self::arm64_split`]. However, some instructions + /// need to be split with registers after `alloc_regs`, e.g. for `compile_side_exits`, so this + /// splits them and uses scratch registers for it. + fn arm64_split_with_scratch_reg(mut self) -> Assembler { + let mut iterator = self.insns.into_iter().enumerate().peekable(); + let mut asm = Assembler::new_with_label_names(take(&mut self.label_names), self.live_ranges.len(), true); + + while let Some((_, mut insn)) = iterator.next() { + match &mut insn { + // For compile_side_exits, support splitting simple C arguments here + Insn::CCall { opnds, .. } if !opnds.is_empty() => { + for (i, opnd) in opnds.iter().enumerate() { + asm.load_into(C_ARG_OPNDS[i], *opnd); + } + *opnds = vec![]; + asm.push_insn(insn); + } + &mut Insn::Lea { opnd, out } => { + match (opnd, out) { + // Split here for compile_side_exits + (Opnd::Mem(_), Opnd::Mem(_)) => { + asm.lea_into(SCRATCH_OPND, opnd); + asm.store(out, SCRATCH_OPND); + } + _ => { + asm.push_insn(insn); + } + } + } + // Convert Opnd::const_ptr into Opnd::Mem. It's split here compile_side_exits. + &mut Insn::IncrCounter { mem, value } => { + assert!(matches!(mem, Opnd::UImm(_))); + asm.load_into(SCRATCH_OPND, mem); + asm.lea_into(SCRATCH_OPND, Opnd::mem(64, SCRATCH_OPND, 0)); + asm.incr_counter(SCRATCH_OPND, value); + } + // Resolve ParallelMov that couldn't be handled without a scratch register. + Insn::ParallelMov { moves } => { + for (reg, opnd) in Self::resolve_parallel_moves(moves, Some(SCRATCH_OPND)).unwrap() { + asm.load_into(Opnd::Reg(reg), opnd); + } + } + _ => { + asm.push_insn(insn); + } + } + } + + asm + } + /// Emit platform-specific machine code /// Returns a list of GC offsets. Can return failure to signal caller to retry. fn arm64_emit(&mut self, cb: &mut CodeBlock) -> Option> { @@ -739,8 +799,8 @@ impl Assembler // that if it doesn't match it will skip over the // instructions used for branching. 
bcond(cb, Condition::inverse(CONDITION), (load_insns + 2).into()); - emit_load_value(cb, Assembler::SCRATCH0, dst_addr as u64); - br(cb, Assembler::SCRATCH0); + emit_load_value(cb, Assembler::EMIT0_OPND, dst_addr as u64); + br(cb, Assembler::EMIT0_OPND); // Here we'll return the number of instructions that it // took to write out the destination address + 1 for the @@ -806,8 +866,8 @@ impl Assembler } else { cbz(cb, reg, InstructionOffset::from_insns(load_insns + 2)); } - emit_load_value(cb, Assembler::SCRATCH0, dst_addr); - br(cb, Assembler::SCRATCH0); + emit_load_value(cb, Assembler::EMIT0_OPND, dst_addr); + br(cb, Assembler::EMIT0_OPND); } */ } else { @@ -979,7 +1039,7 @@ impl Assembler (Some(Insn::JoMul(_)), _) | (Some(Insn::PosMarker(_)), Some(Insn::JoMul(_))) => { // Compute the high 64 bits - smulh(cb, Self::SCRATCH0, left.into(), right.into()); + smulh(cb, Self::EMIT0_OPND, left.into(), right.into()); // Compute the low 64 bits // This may clobber one of the input registers, @@ -988,11 +1048,11 @@ impl Assembler // Produce a register that is all zeros or all ones // Based on the sign bit of the 64-bit mul result - asr(cb, Self::SCRATCH1, out.into(), A64Opnd::UImm(63)); + asr(cb, Self::EMIT1_OPND, out.into(), A64Opnd::UImm(63)); // If the high 64-bits are not all zeros or all ones, // matching the sign bit, then we have an overflow - cmp(cb, Self::SCRATCH0, Self::SCRATCH1); + cmp(cb, Self::EMIT0_OPND, Self::EMIT1_OPND); // Insn::JoMul will emit_conditional_jump::<{Condition::NE}> } _ => { @@ -1021,77 +1081,57 @@ impl Assembler Insn::LShift { opnd, shift, out } => { lsl(cb, out.into(), opnd.into(), shift.into()); }, - store_insn @ Insn::Store { dest, src } => { - // With minor exceptions, as long as `dest` is a Mem, all forms of `src` are - // accepted. As a rule of thumb, avoid using Assembler::SCRATCH as a memory - // base register to gurantee things will work. + Insn::Store { dest, src } => { + // With minor exceptions, as long as `dest` is a Mem, all forms of `src` are accepted. let &Opnd::Mem(Mem { num_bits: dest_num_bits, base: MemBase::Reg(base_reg_no), disp }) = dest else { panic!("Unexpected Insn::Store destination in arm64_emit: {dest:?}"); }; - // This kind of tricky clobber can only happen for explicit use of SCRATCH_REG, - // so we panic to get the author to change their code. 
- #[track_caller] - fn assert_no_clobber(store_insn: &Insn, user_use: u8, backend_use: Reg) { - assert_ne!( - backend_use.reg_no, - user_use, - "Emitting {store_insn:?} would clobber {user_use:?}, in conflict with its semantics" - ); - } - - // Split src into SCRATCH0 if necessary + // Split src into EMIT0_OPND if necessary let src_reg: A64Reg = match src { Opnd::Reg(reg) => *reg, // Use zero register when possible Opnd::UImm(0) | Opnd::Imm(0) => XZR_REG, // Immediates &Opnd::Imm(imm) => { - assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH0_REG); - emit_load_value(cb, Self::SCRATCH0, imm as u64); - Self::SCRATCH0_REG + emit_load_value(cb, Self::EMIT0_OPND, imm as u64); + Self::EMIT0_REG } &Opnd::UImm(imm) => { - assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH0_REG); - emit_load_value(cb, Self::SCRATCH0, imm); - Self::SCRATCH0_REG + emit_load_value(cb, Self::EMIT0_OPND, imm); + Self::EMIT0_REG } &Opnd::Value(value) => { - assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH0_REG); - emit_load_gc_value(cb, &mut gc_offsets, Self::SCRATCH0, value); - Self::SCRATCH0_REG + emit_load_gc_value(cb, &mut gc_offsets, Self::EMIT0_OPND, value); + Self::EMIT0_REG } src_mem @ &Opnd::Mem(Mem { num_bits: src_num_bits, base: MemBase::Reg(src_base_reg_no), disp: src_disp }) => { - // For mem-to-mem store, load the source into SCRATCH0 - assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH0_REG); + // For mem-to-mem store, load the source into EMIT0_OPND let src_mem = if mem_disp_fits_bits(src_disp) { src_mem.into() } else { - // Split the load address into SCRATCH0 first if necessary - assert_no_clobber(store_insn, src_base_reg_no, Self::SCRATCH0_REG); - load_effective_address(cb, Self::SCRATCH0, src_base_reg_no, src_disp); - A64Opnd::new_mem(dest_num_bits, Self::SCRATCH0, 0) + // Split the load address into EMIT0_OPND first if necessary + load_effective_address(cb, Self::EMIT0_OPND, src_base_reg_no, src_disp); + A64Opnd::new_mem(dest_num_bits, Self::EMIT0_OPND, 0) }; match src_num_bits { - 64 | 32 => ldur(cb, Self::SCRATCH0, src_mem), - 16 => ldurh(cb, Self::SCRATCH0, src_mem), - 8 => ldurb(cb, Self::SCRATCH0, src_mem), + 64 | 32 => ldur(cb, Self::EMIT0_OPND, src_mem), + 16 => ldurh(cb, Self::EMIT0_OPND, src_mem), + 8 => ldurb(cb, Self::EMIT0_OPND, src_mem), num_bits => panic!("unexpected num_bits: {num_bits}") }; - Self::SCRATCH0_REG + Self::EMIT0_REG } src @ (Opnd::Mem(_) | Opnd::None | Opnd::VReg { .. }) => panic!("Unexpected source operand during arm64_emit: {src:?}") }; let src = A64Opnd::Reg(src_reg); - // Split dest into SCRATCH1 if necessary. + // Split dest into EMIT1_OPND if necessary. let dest = if mem_disp_fits_bits(disp) { dest.into() } else { - assert_no_clobber(store_insn, src_reg.reg_no, Self::SCRATCH1_REG); - assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH1_REG); - load_effective_address(cb, Self::SCRATCH1, base_reg_no, disp); - A64Opnd::new_mem(dest_num_bits, Self::SCRATCH1, 0) + load_effective_address(cb, Self::EMIT1_OPND, base_reg_no, disp); + A64Opnd::new_mem(dest_num_bits, Self::EMIT1_OPND, 0) }; // This order may be surprising but it is correct. 
The way @@ -1169,10 +1209,10 @@ impl Assembler if let Target::Label(label_idx) = target { // Set output to the raw address of the label cb.label_ref(*label_idx, 4, |cb, end_addr, dst_addr| { - adr(cb, Self::SCRATCH0, A64Opnd::new_imm(dst_addr - (end_addr - 4))); + adr(cb, Self::EMIT0_OPND, A64Opnd::new_imm(dst_addr - (end_addr - 4))); }); - mov(cb, out.into(), Self::SCRATCH0); + mov(cb, out.into(), Self::EMIT0_OPND); } else { // Set output to the jump target's raw address let target_code = target.unwrap_code_ptr(); @@ -1197,15 +1237,15 @@ impl Assembler } // Push the flags/state register - mrs(cb, Self::SCRATCH0, SystemRegister::NZCV); - emit_push(cb, Self::SCRATCH0); + mrs(cb, Self::EMIT0_OPND, SystemRegister::NZCV); + emit_push(cb, Self::EMIT0_OPND); }, Insn::CPopAll => { let regs = Assembler::get_caller_save_regs(); // Pop the state/flags register - msr(cb, SystemRegister::NZCV, Self::SCRATCH0); - emit_pop(cb, Self::SCRATCH0); + msr(cb, SystemRegister::NZCV, Self::EMIT0_OPND); + emit_pop(cb, Self::EMIT0_OPND); for reg in regs.into_iter().rev() { emit_pop(cb, A64Opnd::Reg(reg)); @@ -1221,8 +1261,8 @@ impl Assembler if b_offset_fits_bits((dst_addr - src_addr) / 4) { bl(cb, InstructionOffset::from_bytes((dst_addr - src_addr) as i32)); } else { - emit_load_value(cb, Self::SCRATCH0, dst_addr as u64); - blr(cb, Self::SCRATCH0); + emit_load_value(cb, Self::EMIT0_OPND, dst_addr as u64); + blr(cb, Self::EMIT0_OPND); } }, Insn::CRet { .. } => { @@ -1302,17 +1342,17 @@ impl Assembler let label = cb.new_label("incr_counter_loop".to_string()); cb.write_label(label); - ldaxr(cb, Self::SCRATCH0, mem.into()); - add(cb, Self::SCRATCH0, Self::SCRATCH0, value.into()); + ldaxr(cb, Self::EMIT0_OPND, mem.into()); + add(cb, Self::EMIT0_OPND, Self::EMIT0_OPND, value.into()); // The status register that gets used to track whether or // not the store was successful must be 32 bytes. Since we - // store the SCRATCH registers as their 64-bit versions, we + // store the EMIT registers as their 64-bit versions, we // need to rewrap it here. - let status = A64Opnd::Reg(Self::SCRATCH1.unwrap_reg().with_num_bits(32)); - stlxr(cb, status, Self::SCRATCH0, mem.into()); + let status = A64Opnd::Reg(Self::EMIT1_REG.with_num_bits(32)); + stlxr(cb, status, Self::EMIT0_OPND, mem.into()); - cmp(cb, Self::SCRATCH1, A64Opnd::new_uimm(0)); + cmp(cb, Self::EMIT1_OPND, A64Opnd::new_uimm(0)); emit_conditional_jump::<{Condition::NE}>(cb, Target::Label(label)); }, Insn::Breakpoint => { @@ -1363,9 +1403,16 @@ impl Assembler /// Optimize and compile the stored instructions pub fn compile_with_regs(self, cb: &mut CodeBlock, regs: Vec) -> Result<(CodePtr, Vec), CompileError> { + // The backend is allowed to use scratch registers only if it has not accepted them so far. + let use_scratch_reg = !self.accept_scratch_reg; + let asm = self.arm64_split(); let mut asm = asm.alloc_regs(regs)?; + // We put compile_side_exits after alloc_regs to avoid extending live ranges for VRegs spilled on side exits. 
asm.compile_side_exits(); + if use_scratch_reg { + asm = asm.arm64_split_with_scratch_reg(); + } // Create label instances in the code block for (idx, name) in asm.label_names.iter().enumerate() { @@ -1809,12 +1856,39 @@ mod tests { assert_snapshot!(cb.hexdump(), @"50000058030000140010000000000000b00200f8"); } + #[test] + fn test_store_with_valid_scratch_reg() { + let (mut asm, scratch_reg) = Assembler::new_with_scratch_reg(); + let mut cb = CodeBlock::new_dummy(); + asm.store(Opnd::mem(64, scratch_reg, 0), 0x83902.into()); + + asm.compile_with_num_regs(&mut cb, 0); + assert_disasm_snapshot!(cb.disasm(), @r" + 0x0: mov x16, #0x3902 + 0x4: movk x16, #8, lsl #16 + 0x8: stur x16, [x15] + "); + assert_snapshot!(cb.hexdump(), @"502087d21001a0f2f00100f8"); + } + + #[test] + #[should_panic] + fn test_store_with_invalid_scratch_reg() { + let (_, scratch_reg) = Assembler::new_with_scratch_reg(); + let (mut asm, mut cb) = setup_asm(); + // This would put the source into scratch_reg, messing up the destination + asm.store(Opnd::mem(64, scratch_reg, 0), 0x83902.into()); + + asm.compile_with_num_regs(&mut cb, 0); + } + #[test] #[should_panic] - fn test_store_unserviceable() { + fn test_load_into_with_invalid_scratch_reg() { + let (_, scratch_reg) = Assembler::new_with_scratch_reg(); let (mut asm, mut cb) = setup_asm(); - // This would put the source into SCRATCH_REG, messing up the destination - asm.store(Opnd::mem(64, SCRATCH_OPND, 0), 0x83902.into()); + // This would put the source into scratch_reg, messing up the destination + asm.load_into(scratch_reg, 0x83902.into()); asm.compile_with_num_regs(&mut cb, 0); } @@ -2300,13 +2374,13 @@ mod tests { asm.compile_with_num_regs(&mut cb, ALLOC_REGS.len()); assert_disasm_snapshot!(cb.disasm(), @" - 0x0: mov x16, x0 + 0x0: mov x15, x0 0x4: mov x0, x1 - 0x8: mov x1, x16 + 0x8: mov x1, x15 0xc: mov x16, #0 0x10: blr x16 "); - assert_snapshot!(cb.hexdump(), @"f00300aae00301aae10310aa100080d200023fd6"); + assert_snapshot!(cb.hexdump(), @"ef0300aae00301aae1030faa100080d200023fd6"); } #[test] @@ -2324,16 +2398,16 @@ mod tests { asm.compile_with_num_regs(&mut cb, ALLOC_REGS.len()); assert_disasm_snapshot!(cb.disasm(), @" - 0x0: mov x16, x2 + 0x0: mov x15, x2 0x4: mov x2, x3 - 0x8: mov x3, x16 - 0xc: mov x16, x0 + 0x8: mov x3, x15 + 0xc: mov x15, x0 0x10: mov x0, x1 - 0x14: mov x1, x16 + 0x14: mov x1, x15 0x18: mov x16, #0 0x1c: blr x16 "); - assert_snapshot!(cb.hexdump(), @"f00302aae20303aae30310aaf00300aae00301aae10310aa100080d200023fd6"); + assert_snapshot!(cb.hexdump(), @"ef0302aae20303aae3030faaef0300aae00301aae1030faa100080d200023fd6"); } #[test] @@ -2350,13 +2424,13 @@ mod tests { asm.compile_with_num_regs(&mut cb, ALLOC_REGS.len()); assert_disasm_snapshot!(cb.disasm(), @" - 0x0: mov x16, x0 + 0x0: mov x15, x0 0x4: mov x0, x1 0x8: mov x1, x2 - 0xc: mov x2, x16 + 0xc: mov x2, x15 0x10: mov x16, #0 0x14: blr x16 "); - assert_snapshot!(cb.hexdump(), @"f00300aae00301aae10302aae20310aa100080d200023fd6"); + assert_snapshot!(cb.hexdump(), @"ef0300aae00301aae10302aae2030faa100080d200023fd6"); } } diff --git a/zjit/src/backend/lir.rs b/zjit/src/backend/lir.rs index 76a53c66d6b652..aad5600f569767 100644 --- a/zjit/src/backend/lir.rs +++ b/zjit/src/backend/lir.rs @@ -17,7 +17,6 @@ pub use crate::backend::current::{ NATIVE_STACK_PTR, NATIVE_BASE_PTR, C_ARG_OPNDS, C_RET_REG, C_RET_OPND, }; -pub const SCRATCH_OPND: Opnd = Opnd::Reg(Assembler::SCRATCH_REG); pub static JIT_PRESERVED_REGS: &[Opnd] = &[CFP, SP, EC]; @@ -1173,6 +1172,10 @@ pub struct Assembler { /// Names of 
labels pub(super) label_names: Vec, + /// If true, `push_insn` is allowed to use scratch registers. + /// On `compile`, it also disables the backend's use of them. + pub(super) accept_scratch_reg: bool, + /// If Some, the next ccall should verify its leafness leaf_ccall_stack_size: Option } @@ -1181,12 +1184,12 @@ impl Assembler { /// Create an Assembler pub fn new() -> Self { - Self::new_with_label_names(Vec::default(), 0) + Self::new_with_label_names(Vec::default(), 0, false) } /// Create an Assembler with parameters that are populated by another Assembler instance. /// This API is used for copying an Assembler for the next compiler pass. - pub fn new_with_label_names(label_names: Vec, num_vregs: usize) -> Self { + pub fn new_with_label_names(label_names: Vec, num_vregs: usize, accept_scratch_reg: bool) -> Self { let mut live_ranges = Vec::with_capacity(ASSEMBLER_INSNS_CAPACITY); live_ranges.resize(num_vregs, LiveRange { start: None, end: None }); @@ -1194,6 +1197,7 @@ impl Assembler insns: Vec::with_capacity(ASSEMBLER_INSNS_CAPACITY), live_ranges, label_names, + accept_scratch_reg, leaf_ccall_stack_size: None, } } @@ -1255,6 +1259,14 @@ impl Assembler } } + // If this Assembler should not accept scratch registers, assert no use of them. + if !self.accept_scratch_reg { + let opnd_iter = insn.opnd_iter(); + for opnd in opnd_iter { + assert!(!Self::has_scratch_reg(*opnd), "should not use scratch register: {opnd:?}"); + } + } + self.insns.push(insn); } @@ -1268,9 +1280,9 @@ impl Assembler Target::Label(label) } - // Shuffle register moves, sometimes adding extra moves using SCRATCH_REG, + // Shuffle register moves, sometimes adding extra moves using scratch_reg, // so that they will not rewrite each other before they are used. - pub fn resolve_parallel_moves(old_moves: &[(Reg, Opnd)]) -> Vec<(Reg, Opnd)> { + pub fn resolve_parallel_moves(old_moves: &[(Reg, Opnd)], scratch_reg: Option) -> Option> { // Return the index of a move whose destination is not used as a source if any. fn find_safe_move(moves: &[(Reg, Opnd)]) -> Option { moves.iter().enumerate().find(|&(_, &(dest_reg, _))| { @@ -1289,19 +1301,21 @@ impl Assembler new_moves.push(old_moves.remove(index)); } - // No safe move. Load the source of one move into SCRATCH_REG, and - // then load SCRATCH_REG into the destination when it's safe. + // No safe move. Load the source of one move into scratch_reg, and + // then load scratch_reg into the destination when it's safe. if !old_moves.is_empty() { - // Make sure it's safe to use SCRATCH_REG - assert!(old_moves.iter().all(|&(_, opnd)| opnd != SCRATCH_OPND)); + // If scratch_reg is None, return None and leave it to *_split_with_scratch_regs to resolve it. + let scratch_reg = scratch_reg?.unwrap_reg(); + // Make sure it's safe to use scratch_reg + assert!(old_moves.iter().all(|&(_, opnd)| opnd != Opnd::Reg(scratch_reg))); - // Move SCRATCH <- opnd, and delay reg <- SCRATCH + // Move scratch_reg <- opnd, and delay reg <- scratch_reg let (reg, opnd) = old_moves.remove(0); - new_moves.push((Assembler::SCRATCH_REG, opnd)); - old_moves.push((reg, SCRATCH_OPND)); + new_moves.push((scratch_reg, opnd)); + old_moves.push((reg, Opnd::Reg(scratch_reg))); } } - new_moves + Some(new_moves) } /// Sets the out field on the various instructions that require allocated @@ -1345,7 +1359,7 @@ impl Assembler // live_ranges is indexed by original `index` given by the iterator. 
let live_ranges: Vec = take(&mut self.live_ranges); let mut iterator = self.insns.into_iter().enumerate().peekable(); - let mut asm = Assembler::new_with_label_names(take(&mut self.label_names), live_ranges.len()); + let mut asm = Assembler::new_with_label_names(take(&mut self.label_names), live_ranges.len(), self.accept_scratch_reg); while let Some((index, mut insn)) = iterator.next() { let before_ccall = match (&insn, iterator.peek().map(|(_, insn)| insn)) { @@ -1510,9 +1524,14 @@ impl Assembler let is_ccall = matches!(insn, Insn::CCall { .. }); match insn { Insn::ParallelMov { moves } => { - // Now that register allocation is done, it's ready to resolve parallel moves. - for (reg, opnd) in Self::resolve_parallel_moves(&moves) { - asm.load_into(Opnd::Reg(reg), opnd); + // For trampolines that use scratch registers, attempt to lower ParallelMov without scratch_reg. + if let Some(moves) = Self::resolve_parallel_moves(&moves, None) { + for (reg, opnd) in moves { + asm.load_into(Opnd::Reg(reg), opnd); + } + } else { + // If it needs a scratch_reg, leave it to *_split_with_scratch_regs to handle it. + asm.push_insn(Insn::ParallelMov { moves }); } } Insn::CCall { opnds, fptr, start_marker, end_marker, out } => { @@ -1586,7 +1605,7 @@ impl Assembler for (idx, target) in targets { // Compile a side exit. Note that this is past the split pass and alloc_regs(), - // so you can't use a VReg or an instruction that needs to be split. + // so you can't use an instruction that returns a VReg. if let Target::SideExit { pc, stack, locals, reason, label } = target { asm_comment!(self, "Exit: {reason}"); let side_exit_label = if let Some(label) = label { @@ -1609,35 +1628,24 @@ impl Assembler } asm_comment!(self, "save cfp->pc"); - self.load_into(SCRATCH_OPND, Opnd::const_ptr(pc)); - self.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_PC), SCRATCH_OPND); + self.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_PC), Opnd::const_ptr(pc)); asm_comment!(self, "save cfp->sp"); - self.lea_into(SCRATCH_OPND, Opnd::mem(64, SP, stack.len() as i32 * SIZEOF_VALUE_I32)); - let cfp_sp = Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP); - self.store(cfp_sp, SCRATCH_OPND); + self.lea_into(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SP), Opnd::mem(64, SP, stack.len() as i32 * SIZEOF_VALUE_I32)); // Using C_RET_OPND as an additional scratch register, which is no longer used if get_option!(stats) { asm_comment!(self, "increment a side exit counter"); - self.load_into(SCRATCH_OPND, Opnd::const_ptr(exit_counter_ptr(reason))); - self.incr_counter_with_reg(Opnd::mem(64, SCRATCH_OPND, 0), 1.into(), C_RET_OPND); + self.incr_counter(Opnd::const_ptr(exit_counter_ptr(reason)), 1.into()); if let SideExitReason::UnhandledYARVInsn(opcode) = reason { asm_comment!(self, "increment an unhandled YARV insn counter"); - self.load_into(SCRATCH_OPND, Opnd::const_ptr(exit_counter_ptr_for_opcode(opcode))); - self.incr_counter_with_reg(Opnd::mem(64, SCRATCH_OPND, 0), 1.into(), C_RET_OPND); + self.incr_counter(Opnd::const_ptr(exit_counter_ptr_for_opcode(opcode)), 1.into()); } } if get_option!(trace_side_exits) { - // Use `load_into` with `C_ARG_OPNDS` instead of `opnds` argument for ccall, since `compile_side_exits` - // is after the split pass, which would allow use of `opnds`. 
- self.load_into(C_ARG_OPNDS[0], Opnd::const_ptr(pc as *const u8)); - self.ccall( - rb_zjit_record_exit_stack as *const u8, - vec![] - ); + asm_ccall!(self, rb_zjit_record_exit_stack, Opnd::const_ptr(pc as *const u8)); } asm_comment!(self, "exit to the interpreter"); @@ -1823,19 +1831,6 @@ impl Assembler { self.push_insn(Insn::IncrCounter { mem, value }); } - /// incr_counter() but uses a specific register to split Insn::Lea - pub fn incr_counter_with_reg(&mut self, mem: Opnd, value: Opnd, reg: Opnd) { - assert!(matches!(reg, Opnd::Reg(_)), "incr_counter_with_reg should take a register, got: {reg:?}"); - let counter_opnd = if cfg!(target_arch = "aarch64") { // See arm64_split() - assert_ne!(reg, SCRATCH_OPND, "SCRATCH_REG should be reserved for IncrCounter"); - self.lea_into(reg, mem); - reg - } else { // x86_emit() expects Opnd::Mem - mem - }; - self.incr_counter(counter_opnd, value); - } - pub fn jbe(&mut self, target: Target) { self.push_insn(Insn::Jbe(target)); } @@ -1898,7 +1893,7 @@ impl Assembler { } pub fn lea_into(&mut self, out: Opnd, opnd: Opnd) { - assert!(matches!(out, Opnd::Reg(_)), "Destination of lea_into must be a register, got: {out:?}"); + assert!(matches!(out, Opnd::Reg(_) | Opnd::Mem(_)), "Destination of lea_into must be a register or memory, got: {out:?}"); self.push_insn(Insn::Lea { opnd, out }); } diff --git a/zjit/src/backend/x86_64/mod.rs b/zjit/src/backend/x86_64/mod.rs index 2edd15380871e1..b6c4658463048a 100644 --- a/zjit/src/backend/x86_64/mod.rs +++ b/zjit/src/backend/x86_64/mod.rs @@ -83,7 +83,7 @@ impl From<&Opnd> for X86Opnd { /// List of registers that can be used for register allocation. /// This has the same number of registers for x86_64 and arm64. -/// SCRATCH_REG is excluded. +/// SCRATCH_OPND is excluded. pub const ALLOC_REGS: &[Reg] = &[ RDI_REG, RSI_REG, @@ -95,14 +95,26 @@ pub const ALLOC_REGS: &[Reg] = &[ RAX_REG, ]; -impl Assembler -{ - /// Special scratch registers for intermediate processing. - /// This register is call-clobbered (so we don't have to save it before using it). - /// Avoid using if you can since this is used to lower [Insn] internally and - /// so conflicts are possible. - pub const SCRATCH_REG: Reg = R11_REG; - const SCRATCH0: X86Opnd = X86Opnd::Reg(Assembler::SCRATCH_REG); +/// Special scratch register for intermediate processing. It should be used only by +/// [`Assembler::x86_split_with_scratch_reg`] or [`Assembler::new_with_scratch_reg`]. +const SCRATCH_OPND: Opnd = Opnd::Reg(R11_REG); + +impl Assembler { + /// Return an Assembler with scratch registers disabled in the backend, and a scratch register. + pub fn new_with_scratch_reg() -> (Self, Opnd) { + (Self::new_with_label_names(Vec::default(), 0, true), SCRATCH_OPND) + } + + /// Return true if opnd contains a scratch reg + pub fn has_scratch_reg(opnd: Opnd) -> bool { + match opnd { + Opnd::Reg(_) => opnd == SCRATCH_OPND, + Opnd::Mem(Mem { base: MemBase::Reg(reg_no), .. 
}) => { + reg_no == SCRATCH_OPND.unwrap_reg().reg_no + } + _ => false, + } + } /// Get the list of registers from which we can allocate on this platform pub fn get_alloc_regs() -> Vec { @@ -127,7 +139,7 @@ impl Assembler { let live_ranges: Vec = take(&mut self.live_ranges); let mut iterator = self.insns.into_iter().enumerate().peekable(); - let mut asm = Assembler::new_with_label_names(take(&mut self.label_names), live_ranges.len()); + let mut asm = Assembler::new_with_label_names(take(&mut self.label_names), live_ranges.len(), self.accept_scratch_reg); while let Some((index, mut insn)) = iterator.next() { let is_load = matches!(insn, Insn::Load { .. } | Insn::LoadInto { .. }); @@ -374,36 +386,165 @@ impl Assembler asm } - /// Emit platform-specific machine code - pub fn x86_emit(&mut self, cb: &mut CodeBlock) -> Option> { + /// Split instructions using scratch registers. To maximize the use of the register pool + /// for VRegs, most splits should happen in [`Self::x86_split`]. However, some instructions + /// need to be split with registers after `alloc_regs`, e.g. for `compile_side_exits`, so + /// this splits them and uses scratch registers for it. + pub fn x86_split_with_scratch_reg(mut self) -> Assembler { /// For some instructions, we want to be able to lower a 64-bit operand /// without requiring more registers to be available in the register - /// allocator. So we just use the SCRATCH0 register temporarily to hold + /// allocator. So we just use the SCRATCH_OPND register temporarily to hold /// the value before we immediately use it. - fn emit_64bit_immediate(cb: &mut CodeBlock, opnd: &Opnd) -> X86Opnd { + fn split_64bit_immediate(asm: &mut Assembler, opnd: Opnd) -> Opnd { match opnd { Opnd::Imm(value) => { // 32-bit values will be sign-extended - if imm_num_bits(*value) > 32 { - mov(cb, Assembler::SCRATCH0, opnd.into()); - Assembler::SCRATCH0 + if imm_num_bits(value) > 32 { + asm.mov(SCRATCH_OPND, opnd); + SCRATCH_OPND } else { - opnd.into() + opnd } }, Opnd::UImm(value) => { // 32-bit values will be sign-extended - if imm_num_bits(*value as i64) > 32 { - mov(cb, Assembler::SCRATCH0, opnd.into()); - Assembler::SCRATCH0 + if imm_num_bits(value as i64) > 32 { + asm.mov(SCRATCH_OPND, opnd); + SCRATCH_OPND } else { - imm_opnd(*value as i64) + Opnd::Imm(value as i64) } }, - _ => opnd.into() + _ => opnd } } + let mut iterator = self.insns.into_iter().enumerate().peekable(); + let mut asm = Assembler::new_with_label_names(take(&mut self.label_names), self.live_ranges.len(), true); + + while let Some((_, mut insn)) = iterator.next() { + match &mut insn { + Insn::Add { right, .. } | + Insn::Sub { right, .. } | + Insn::Mul { right, .. } | + Insn::And { right, .. } | + Insn::Or { right, .. } | + Insn::Xor { right, .. } | + Insn::Test { right, .. } => { + *right = split_64bit_immediate(&mut asm, *right); + asm.push_insn(insn); + } + Insn::Cmp { left, right } => { + let num_bits = match right { + Opnd::Imm(value) => Some(imm_num_bits(*value)), + Opnd::UImm(value) => Some(uimm_num_bits(*value)), + _ => None + }; + + // If the immediate is less than 64 bits (like 32, 16, 8), and the operand + // sizes match, then we can represent it as an immediate in the instruction + // without moving it to a register first. + // IOW, 64 bit immediates must always be moved to a register + // before comparisons, where other sizes may be encoded + // directly in the instruction. 
+ let use_imm = num_bits.is_some() && left.num_bits() == num_bits && num_bits.unwrap() < 64; + if !use_imm { + *right = split_64bit_immediate(&mut asm, *right); + } + asm.push_insn(insn); + } + // For compile_side_exits, support splitting simple C arguments here + Insn::CCall { opnds, .. } if !opnds.is_empty() => { + for (i, opnd) in opnds.iter().enumerate() { + asm.load_into(C_ARG_OPNDS[i], *opnd); + } + *opnds = vec![]; + asm.push_insn(insn); + } + &mut Insn::Lea { opnd, out } => { + match (opnd, out) { + // Split here for compile_side_exits + (Opnd::Mem(_), Opnd::Mem(_)) => { + asm.lea_into(SCRATCH_OPND, opnd); + asm.store(out, SCRATCH_OPND); + } + _ => { + asm.push_insn(insn); + } + } + } + Insn::LeaJumpTarget { target, out } => { + if let Target::Label(_) = target { + asm.push_insn(Insn::LeaJumpTarget { out: SCRATCH_OPND, target: target.clone() }); + asm.mov(*out, SCRATCH_OPND); + } + } + // Convert Opnd::const_ptr into Opnd::Mem. This split is done here to give + // a register for compile_side_exits. + &mut Insn::IncrCounter { mem, value } => { + assert!(matches!(mem, Opnd::UImm(_))); + asm.load_into(SCRATCH_OPND, mem); + asm.incr_counter(Opnd::mem(64, SCRATCH_OPND, 0), value); + } + // Resolve ParallelMov that couldn't be handled without a scratch register. + Insn::ParallelMov { moves } => { + for (reg, opnd) in Self::resolve_parallel_moves(&moves, Some(SCRATCH_OPND)).unwrap() { + asm.load_into(Opnd::Reg(reg), opnd); + } + } + // Handle various operand combinations for spills on compile_side_exits. + &mut Insn::Store { dest, src } => { + let Opnd::Mem(Mem { num_bits, .. }) = dest else { + panic!("Unexpected Insn::Store destination in x86_split_with_scratch_reg: {dest:?}"); + }; + + let src = match src { + Opnd::Reg(_) => src, + Opnd::Mem(_) => { + asm.mov(SCRATCH_OPND, src); + SCRATCH_OPND + } + Opnd::Imm(imm) => { + // For 64 bit destinations, 32-bit values will be sign-extended + if num_bits == 64 && imm_num_bits(imm) > 32 { + asm.mov(SCRATCH_OPND, src); + SCRATCH_OPND + } else if uimm_num_bits(imm as u64) <= num_bits { + // If the bit string is short enough for the destination, use the unsigned representation. + // Note that 64-bit and negative values are ruled out. + Opnd::UImm(imm as u64) + } else { + src + } + } + Opnd::UImm(imm) => { + // For 64 bit destinations, 32-bit values will be sign-extended + if num_bits == 64 && imm_num_bits(imm as i64) > 32 { + asm.mov(SCRATCH_OPND, src); + SCRATCH_OPND + } else { + src.into() + } + } + Opnd::Value(_) => { + asm.load_into(SCRATCH_OPND, src); + SCRATCH_OPND + } + src @ (Opnd::None | Opnd::VReg { .. }) => panic!("Unexpected source operand during x86_split_with_scratch_reg: {src:?}"), + }; + asm.store(dest, src); + } + _ => { + asm.push_insn(insn); + } + } + } + + asm + } + + /// Emit platform-specific machine code + pub fn x86_emit(&mut self, cb: &mut CodeBlock) -> Option> { fn emit_csel( cb: &mut CodeBlock, truthy: Opnd, @@ -506,33 +647,27 @@ impl Assembler } Insn::Add { left, right, .. } => { - let opnd1 = emit_64bit_immediate(cb, right); - add(cb, left.into(), opnd1); + add(cb, left.into(), right.into()); }, Insn::Sub { left, right, .. } => { - let opnd1 = emit_64bit_immediate(cb, right); - sub(cb, left.into(), opnd1); + sub(cb, left.into(), right.into()); }, Insn::Mul { left, right, .. } => { - let opnd1 = emit_64bit_immediate(cb, right); - imul(cb, left.into(), opnd1); + imul(cb, left.into(), right.into()); }, Insn::And { left, right, .. 
} => { - let opnd1 = emit_64bit_immediate(cb, right); - and(cb, left.into(), opnd1); + and(cb, left.into(), right.into()); }, Insn::Or { left, right, .. } => { - let opnd1 = emit_64bit_immediate(cb, right); - or(cb, left.into(), opnd1); + or(cb, left.into(), right.into()); }, Insn::Xor { left, right, .. } => { - let opnd1 = emit_64bit_immediate(cb, right); - xor(cb, left.into(), opnd1); + xor(cb, left.into(), right.into()); }, Insn::Not { opnd, .. } => { @@ -551,64 +686,6 @@ impl Assembler shr(cb, opnd.into(), shift.into()) }, - store_insn @ Insn::Store { dest, src } => { - let &Opnd::Mem(Mem { num_bits, base: MemBase::Reg(base_reg_no), disp: _ }) = dest else { - panic!("Unexpected Insn::Store destination in x64_emit: {dest:?}"); - }; - - // This kind of tricky clobber can only happen for explicit use of SCRATCH_REG, - // so we panic to get the author to change their code. - #[track_caller] - fn assert_no_clobber(store_insn: &Insn, user_use: u8, backend_use: Reg) { - assert_ne!( - backend_use.reg_no, - user_use, - "Emitting {store_insn:?} would clobber {user_use:?}, in conflict with its semantics" - ); - } - - let scratch = X86Opnd::Reg(Self::SCRATCH_REG); - let src = match src { - Opnd::Reg(_) => src.into(), - &Opnd::Mem(_) => { - assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH_REG); - mov(cb, scratch, src.into()); - scratch - } - &Opnd::Imm(imm) => { - // For 64 bit destinations, 32-bit values will be sign-extended - if num_bits == 64 && imm_num_bits(imm) > 32 { - assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH_REG); - mov(cb, scratch, src.into()); - scratch - } else if uimm_num_bits(imm as u64) <= num_bits { - // If the bit string is short enough for the destination, use the unsigned representation. - // Note that 64-bit and negative values are ruled out. - uimm_opnd(imm as u64) - } else { - src.into() - } - } - &Opnd::UImm(imm) => { - // For 64 bit destinations, 32-bit values will be sign-extended - if num_bits == 64 && imm_num_bits(imm as i64) > 32 { - assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH_REG); - mov(cb, scratch, src.into()); - scratch - } else { - src.into() - } - } - &Opnd::Value(value) => { - assert_no_clobber(store_insn, base_reg_no, Self::SCRATCH_REG); - emit_load_gc_value(cb, &mut gc_offsets, scratch, value); - scratch - } - src @ (Opnd::None | Opnd::VReg { .. }) => panic!("Unexpected source operand during x86_emit: {src:?}") - }; - mov(cb, dest.into(), src); - } - // This assumes only load instructions can contain references to GC'd Value operands Insn::Load { opnd, out } | Insn::LoadInto { dest: out, opnd } => { @@ -626,6 +703,7 @@ impl Assembler Insn::ParallelMov { .. 
} => unreachable!("{insn:?} should have been lowered at alloc_regs()"), + Insn::Store { dest, src } | Insn::Mov { dest, src } => { mov(cb, dest.into(), src.into()); }, @@ -638,13 +716,12 @@ impl Assembler // Load address of jump target Insn::LeaJumpTarget { target, out } => { if let Target::Label(label) = target { + let out = *out; // Set output to the raw address of the label - cb.label_ref(*label, 7, |cb, src_addr, dst_addr| { + cb.label_ref(*label, 7, move |cb, src_addr, dst_addr| { let disp = dst_addr - src_addr; - lea(cb, Self::SCRATCH0, mem_opnd(8, RIP, disp.try_into().unwrap())); + lea(cb, out.into(), mem_opnd(8, RIP, disp.try_into().unwrap())); }); - - mov(cb, out.into(), Self::SCRATCH0); } else { // Set output to the jump target's raw address let target_code = target.unwrap_code_ptr(); @@ -700,30 +777,12 @@ impl Assembler // Compare Insn::Cmp { left, right } => { - let num_bits = match right { - Opnd::Imm(value) => Some(imm_num_bits(*value)), - Opnd::UImm(value) => Some(uimm_num_bits(*value)), - _ => None - }; - - // If the immediate is less than 64 bits (like 32, 16, 8), and the operand - // sizes match, then we can represent it as an immediate in the instruction - // without moving it to a register first. - // IOW, 64 bit immediates must always be moved to a register - // before comparisons, where other sizes may be encoded - // directly in the instruction. - if num_bits.is_some() && left.num_bits() == num_bits && num_bits.unwrap() < 64 { - cmp(cb, left.into(), right.into()); - } else { - let emitted = emit_64bit_immediate(cb, right); - cmp(cb, left.into(), emitted); - } + cmp(cb, left.into(), right.into()); } // Test and set flags Insn::Test { left, right } => { - let emitted = emit_64bit_immediate(cb, right); - test(cb, left.into(), emitted); + test(cb, left.into(), right.into()); } Insn::JmpOpnd(opnd) => { @@ -893,9 +952,16 @@ impl Assembler /// Optimize and compile the stored instructions pub fn compile_with_regs(self, cb: &mut CodeBlock, regs: Vec) -> Result<(CodePtr, Vec), CompileError> { + // The backend is allowed to use scratch registers only if it has not accepted them so far. + let use_scratch_regs = !self.accept_scratch_reg; + let asm = self.x86_split(); let mut asm = asm.alloc_regs(regs)?; + // We put compile_side_exits after alloc_regs to avoid extending live ranges for VRegs spilled on side exits. 
asm.compile_side_exits(); + if use_scratch_regs { + asm = asm.x86_split_with_scratch_reg(); + } // Create label instances in the code block for (idx, name) in asm.label_names.iter().enumerate() { @@ -1527,6 +1593,7 @@ mod tests { assert!(imitation_heap_value.heap_object_p()); asm.store(Opnd::mem(VALUE_BITS, SP, 0), imitation_heap_value.into()); + asm = asm.x86_split_with_scratch_reg(); let gc_offsets = asm.x86_emit(&mut cb).unwrap(); assert_eq!(1, gc_offsets.len(), "VALUE source operand should be reported as gc offset"); diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index 262a0361b4a74e..c5bdbcfe0a5a83 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -18,7 +18,7 @@ use crate::state::ZJITState; use crate::stats::{send_fallback_counter, exit_counter_for_compile_error, incr_counter, incr_counter_by, send_fallback_counter_for_method_type, send_without_block_fallback_counter_for_method_type, send_fallback_counter_ptr_for_opcode, CompileError}; use crate::stats::{counter_ptr, with_time_stat, Counter, Counter::{compile_time_ns, exit_compile_error}}; use crate::{asm::CodeBlock, cruby::*, options::debug, virtualmem::CodePtr}; -use crate::backend::lir::{self, asm_comment, asm_ccall, Assembler, Opnd, Target, CFP, C_ARG_OPNDS, C_RET_OPND, EC, NATIVE_STACK_PTR, NATIVE_BASE_PTR, SCRATCH_OPND, SP}; +use crate::backend::lir::{self, asm_comment, asm_ccall, Assembler, Opnd, Target, CFP, C_ARG_OPNDS, C_RET_OPND, EC, NATIVE_STACK_PTR, NATIVE_BASE_PTR, SP}; use crate::hir::{iseq_to_hir, BlockId, BranchEdge, Invariant, RangeType, SideExitReason::{self, *}, SpecialBackrefSymbol, SpecialObjectType}; use crate::hir::{Const, FrameState, Function, Insn, InsnId, SendFallbackReason}; use crate::hir_type::{types, Type}; @@ -1592,9 +1592,7 @@ fn gen_guard_bit_equals(jit: &mut JITState, asm: &mut Assembler, val: lir::Opnd, /// Generate code that records unoptimized C functions if --zjit-stats is enabled fn gen_incr_counter_ptr(asm: &mut Assembler, counter_ptr: *mut u64) { if get_option!(stats) { - let ptr_reg = asm.load(Opnd::const_ptr(counter_ptr as *const u8)); - let counter_opnd = Opnd::mem(64, ptr_reg, 0); - asm.incr_counter(counter_opnd, Opnd::UImm(1)); + asm.incr_counter(Opnd::const_ptr(counter_ptr as *const u8), Opnd::UImm(1)); } } @@ -1963,12 +1961,12 @@ fn function_stub_hit_body(cb: &mut CodeBlock, iseq_call: &IseqCallRef) -> Result /// Compile a stub for an ISEQ called by SendWithoutBlockDirect fn gen_function_stub(cb: &mut CodeBlock, iseq_call: IseqCallRef) -> Result { - let mut asm = Assembler::new(); + let (mut asm, scratch_reg) = Assembler::new_with_scratch_reg(); asm_comment!(asm, "Stub: {}", iseq_get_location(iseq_call.iseq.get(), 0)); // Call function_stub_hit using the shared trampoline. See `gen_function_stub_hit_trampoline`. // Use load_into instead of mov, which is split on arm64, to avoid clobbering ALLOC_REGS. 
- asm.load_into(SCRATCH_OPND, Opnd::const_ptr(Rc::into_raw(iseq_call))); + asm.load_into(scratch_reg, Opnd::const_ptr(Rc::into_raw(iseq_call))); asm.jmp(ZJITState::get_function_stub_hit_trampoline().into()); asm.compile(cb).map(|(code_ptr, gc_offsets)| { @@ -1979,7 +1977,7 @@ fn gen_function_stub(cb: &mut CodeBlock, iseq_call: IseqCallRef) -> Result Result { - let mut asm = Assembler::new(); + let (mut asm, scratch_reg) = Assembler::new_with_scratch_reg(); asm_comment!(asm, "function_stub_hit trampoline"); // Maintain alignment for x86_64, and set up a frame for arm64 properly @@ -1992,8 +1990,8 @@ pub fn gen_function_stub_hit_trampoline(cb: &mut CodeBlock) -> Result Result>(); + let insns = std::mem::take(&mut fun.blocks[tmp_block.0].insns); fun.blocks[block.0].insns.extend(insns); fun.make_equal_to(send_insn_id, replacement); fun.remove_block(tmp_block); @@ -2453,7 +2453,7 @@ impl Function { if let Some(replacement) = (props.inline)(fun, tmp_block, recv, &args, state) { // Copy contents of tmp_block to block assert_ne!(block, tmp_block); - let insns = fun.blocks[tmp_block.0].insns.drain(..).collect::>(); + let insns = std::mem::take(&mut fun.blocks[tmp_block.0].insns); fun.blocks[block.0].insns.extend(insns); fun.make_equal_to(send_insn_id, replacement); fun.remove_block(tmp_block); @@ -4957,7 +4957,7 @@ mod tests { } #[track_caller] - fn assert_contains_opcode(method: &str, opcode: u32) { + pub fn assert_contains_opcode(method: &str, opcode: u32) { let iseq = crate::cruby::with_rubyvm(|| get_method_iseq("self", method)); unsafe { crate::cruby::rb_zjit_profile_disable(iseq) }; assert!(iseq_contains_opcode(iseq, opcode), "iseq {method} does not contain {}", insn_name(opcode as usize)); @@ -7999,6 +7999,7 @@ mod opt_tests { use super::*; use crate::{hir_strings, options::*}; use insta::assert_snapshot; + use super::tests::assert_contains_opcode; #[track_caller] fn hir_string(method: &str) -> String { @@ -12817,4 +12818,145 @@ mod opt_tests { Return v25 "); } + + #[test] + fn test_optimize_array_aset() { + eval(" + def test(arr) + arr[1] = 10 + end + test([]) + "); + assert_snapshot!(hir_string("test"), @r" + fn test@:3: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + v2:BasicObject = GetLocal l0, SP@4 + Jump bb2(v1, v2) + bb1(v5:BasicObject, v6:BasicObject): + EntryPoint JIT(0) + Jump bb2(v5, v6) + bb2(v8:BasicObject, v9:BasicObject): + v14:Fixnum[1] = Const Value(1) + v15:Fixnum[10] = Const Value(10) + PatchPoint MethodRedefined(Array@0x1000, []=@0x1008, cme:0x1010) + PatchPoint NoSingletonClass(Array@0x1000) + v28:ArrayExact = GuardType v9, ArrayExact + v29:BasicObject = CCallVariadic []=@0x1038, v28, v14, v15 + CheckInterrupts + Return v15 + "); + } + + #[test] + fn test_optimize_array_ltlt() { + eval(" + def test(arr) + arr << 1 + end + test([]) + "); + assert_snapshot!(hir_string("test"), @r" + fn test@:3: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + v2:BasicObject = GetLocal l0, SP@4 + Jump bb2(v1, v2) + bb1(v5:BasicObject, v6:BasicObject): + EntryPoint JIT(0) + Jump bb2(v5, v6) + bb2(v8:BasicObject, v9:BasicObject): + v13:Fixnum[1] = Const Value(1) + PatchPoint MethodRedefined(Array@0x1000, <<@0x1008, cme:0x1010) + PatchPoint NoSingletonClass(Array@0x1000) + v26:ArrayExact = GuardType v9, ArrayExact + v27:BasicObject = CCallWithFrame <<@0x1038, v26, v13 + CheckInterrupts + Return v27 + "); + } + + #[test] + fn test_optimize_array_length() { + eval(" + def test(arr) = arr.length + test([]) + "); + assert_contains_opcode("test", YARVINSN_opt_length); 
+ assert_snapshot!(hir_string("test"), @r" + fn test@:2: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + v2:BasicObject = GetLocal l0, SP@4 + Jump bb2(v1, v2) + bb1(v5:BasicObject, v6:BasicObject): + EntryPoint JIT(0) + Jump bb2(v5, v6) + bb2(v8:BasicObject, v9:BasicObject): + PatchPoint MethodRedefined(Array@0x1000, length@0x1008, cme:0x1010) + PatchPoint NoSingletonClass(Array@0x1000) + v25:ArrayExact = GuardType v9, ArrayExact + v26:Fixnum = CCall length@0x1038, v25 + CheckInterrupts + Return v26 + "); + } + + #[test] + fn test_optimize_array_size() { + eval(" + def test(arr) = arr.size + test([]) + "); + assert_contains_opcode("test", YARVINSN_opt_size); + assert_snapshot!(hir_string("test"), @r" + fn test@:2: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + v2:BasicObject = GetLocal l0, SP@4 + Jump bb2(v1, v2) + bb1(v5:BasicObject, v6:BasicObject): + EntryPoint JIT(0) + Jump bb2(v5, v6) + bb2(v8:BasicObject, v9:BasicObject): + PatchPoint MethodRedefined(Array@0x1000, size@0x1008, cme:0x1010) + PatchPoint NoSingletonClass(Array@0x1000) + v25:ArrayExact = GuardType v9, ArrayExact + v26:Fixnum = CCall size@0x1038, v25 + CheckInterrupts + Return v26 + "); + } + + #[test] + fn test_optimize_regexpmatch2() { + eval(r#" + def test(s) = s =~ /a/ + test("foo") + "#); + assert_contains_opcode("test", YARVINSN_opt_regexpmatch2); + assert_snapshot!(hir_string("test"), @r" + fn test@:2: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + v2:BasicObject = GetLocal l0, SP@4 + Jump bb2(v1, v2) + bb1(v5:BasicObject, v6:BasicObject): + EntryPoint JIT(0) + Jump bb2(v5, v6) + bb2(v8:BasicObject, v9:BasicObject): + v13:RegexpExact[VALUE(0x1000)] = Const Value(VALUE(0x1000)) + PatchPoint MethodRedefined(String@0x1008, =~@0x1010, cme:0x1018) + PatchPoint NoSingletonClass(String@0x1008) + v26:StringExact = GuardType v9, StringExact + v27:BasicObject = CCallWithFrame =~@0x1040, v26, v13 + CheckInterrupts + Return v27 + "); + } } diff --git a/zjit/src/profile.rs b/zjit/src/profile.rs index 67f2fdc7403822..e7db47142bcf65 100644 --- a/zjit/src/profile.rs +++ b/zjit/src/profile.rs @@ -74,9 +74,14 @@ fn profile_insn(bare_opcode: ruby_vminsn_type, ec: EcPtr) { YARVINSN_opt_or => profile_operands(profiler, profile, 2), YARVINSN_opt_empty_p => profile_operands(profiler, profile, 1), YARVINSN_opt_aref => profile_operands(profiler, profile, 2), + YARVINSN_opt_ltlt => profile_operands(profiler, profile, 2), + YARVINSN_opt_aset => profile_operands(profiler, profile, 3), YARVINSN_opt_not => profile_operands(profiler, profile, 1), YARVINSN_getinstancevariable => profile_self(profiler, profile), + YARVINSN_opt_regexpmatch2 => profile_operands(profiler, profile, 2), YARVINSN_objtostring => profile_operands(profiler, profile, 1), + YARVINSN_opt_length => profile_operands(profiler, profile, 1), + YARVINSN_opt_size => profile_operands(profiler, profile, 1), YARVINSN_opt_send_without_block => { let cd: *const rb_call_data = profiler.insn_opnd(0).as_ptr(); let argc = unsafe { vm_ci_argc((*cd).ci) };
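
For readers following the resolve_parallel_moves(old_moves, scratch_reg) change in zjit/src/backend/lir.rs: the function now returns None when the remaining moves form a cycle and no scratch register was supplied, deferring cycle-breaking to the new *_split_with_scratch_reg passes. Below is a minimal, self-contained sketch of that strategy, using plain u8 register ids in place of LIR Reg/Opnd; the resolve function here is illustrative only, not the ZJIT API.

// Sketch of parallel-move resolution with an optional scratch register.
// A move (dst, src) is safe to emit once no pending move still reads dst.
fn resolve(mut moves: Vec<(u8, u8)>, scratch: Option<u8>) -> Option<Vec<(u8, u8)>> {
    let mut ordered = Vec::new();
    while !moves.is_empty() {
        if let Some(i) = (0..moves.len())
            .find(|&i| moves.iter().all(|&(_, src)| src != moves[i].0))
        {
            // Safe move: its destination is no longer needed as a source.
            ordered.push(moves.remove(i));
        } else {
            // Every remaining destination is still a pending source: a cycle.
            // Break it by parking one source in the scratch register.
            let scratch = scratch?; // no scratch register available: give up
            debug_assert!(moves.iter().all(|&(_, src)| src != scratch));
            let (dst, src) = moves.remove(0);
            ordered.push((scratch, src)); // scratch <- src now
            moves.push((dst, scratch));   // dst <- scratch once dst is free
        }
    }
    Some(ordered)
}

fn main() {
    // Swapping r0 and r1 is a cycle: it fails without a scratch register and
    // resolves to three moves with one (r9 plays the scratch role here).
    assert_eq!(resolve(vec![(0, 1), (1, 0)], None), None);
    assert_eq!(resolve(vec![(0, 1), (1, 0)], Some(9)),
               Some(vec![(9, 1), (1, 0), (0, 9)]));
}

This mirrors how alloc_regs() first tries resolve_parallel_moves(&moves, None) and, when that comes back None, leaves the ParallelMov in place for the scratch-register split pass to lower.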
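
Similarly, switching LabelRef::encode from a plain fn pointer to Box<dyn Fn(&mut CodeBlock, i64, i64)> is what lets the x86_64 LeaJumpTarget lowering capture out and lea straight into the destination once the label address is known. A reduced sketch of that deferred-fixup pattern follows, with a toy CodeBlock whose fields and methods are stand-ins rather than the real zjit/src/asm API.

// A fn pointer cannot capture `out`, but a boxed closure can, so the label
// fixup can write directly into whatever destination was chosen at emit time.
struct CodeBlock {
    bytes: Vec<u8>,
    fixups: Vec<(usize, Box<dyn Fn(&mut CodeBlock, i64, i64)>)>,
}

impl CodeBlock {
    fn label_ref(&mut self, pos: usize, encode: impl Fn(&mut CodeBlock, i64, i64) + 'static) {
        let encode: Box<dyn Fn(&mut CodeBlock, i64, i64)> = Box::new(encode);
        self.fixups.push((pos, encode));
    }

    fn link_labels(&mut self, label_addr: i64) {
        // Take the fixups out so each closure can be called with `&mut self`.
        for (pos, encode) in std::mem::take(&mut self.fixups) {
            encode(self, pos as i64, label_addr);
        }
    }
}

fn main() {
    let mut cb = CodeBlock { bytes: vec![0; 16], fixups: Vec::new() };
    let out = 3usize; // captured state, e.g. the chosen output slot
    cb.label_ref(8, move |cb, end_addr, dst_addr| {
        cb.bytes[out] = (dst_addr - end_addr) as u8; // encode a displacement
    });
    cb.link_labels(12);
    assert_eq!(cb.bytes[3], 4);
}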