diff --git a/.github/actions/setup/directories/action.yml b/.github/actions/setup/directories/action.yml
index 0120f1213082c3..c0fc75ad7d9b2f 100644
--- a/.github/actions/setup/directories/action.yml
+++ b/.github/actions/setup/directories/action.yml
@@ -175,7 +175,7 @@ runs:
         echo final='rmdir ${{ inputs.builddir }}' >> $GITHUB_OUTPUT
 
     - name: clean
-      uses: gacts/run-and-post-run@d803f6920adc9a47eeac4cb6c93dbc2e2890c684 # v1.4.2
+      uses: gacts/run-and-post-run@81b6ce503cde93862cec047c54652e45c5dca991 # v1.4.3
       with:
         working-directory:
         post: |
diff --git a/doc/stringio/getbyte.rdoc b/doc/stringio/getbyte.rdoc
new file mode 100644
index 00000000000000..48c334b5252a58
--- /dev/null
+++ b/doc/stringio/getbyte.rdoc
@@ -0,0 +1,29 @@
+Reads and returns the next integer byte (not character) from the stream:
+
+  s = 'foo'
+  s.bytes       # => [102, 111, 111]
+  strio = StringIO.new(s)
+  strio.getbyte # => 102
+  strio.getbyte # => 111
+  strio.getbyte # => 111
+
+Returns +nil+ if at end-of-stream:
+
+  strio.eof?    # => true
+  strio.getbyte # => nil
+
+Returns a byte, not a character:
+
+  s = 'тест'
+  s.bytes       # => [209, 130, 208, 181, 209, 129, 209, 130]
+  strio = StringIO.new(s)
+  strio.getbyte # => 209
+  strio.getbyte # => 130
+
+  s = 'こんにちは'
+  s.bytes       # => [227, 129, 147, 227, 130, 147, 227, 129, 171, 227, 129, 161, 227, 129, 175]
+  strio = StringIO.new(s)
+  strio.getbyte # => 227
+  strio.getbyte # => 129
+
+Related: StringIO#getc.
diff --git a/doc/stringio/gets.rdoc b/doc/stringio/gets.rdoc
new file mode 100644
index 00000000000000..892c3feb53a9bf
--- /dev/null
+++ b/doc/stringio/gets.rdoc
@@ -0,0 +1,98 @@
+Reads and returns a line from the stream;
+returns +nil+ if at end-of-stream.
+
+Side effects:
+
+- Increments stream position by the number of bytes read.
+- Assigns the return value to global variable <tt>$_</tt>.
+
+With no arguments given, reads a line using the default record separator
+(global variable <tt>$/</tt>, whose initial value is <tt>"\n"</tt>):
+
+  strio = StringIO.new(TEXT)
+  strio.pos  # => 0
+  strio.gets # => "First line\n"
+  strio.pos  # => 11
+  $_         # => "First line\n"
+  strio.gets # => "Second line\n"
+  strio.read # => "\nFourth line\nFifth line\n"
+  strio.eof? # => true
+  strio.gets # => nil
+
+  strio = StringIO.new('тест')  # Four 2-byte characters.
+  strio.pos  # => 0
+  strio.gets # => "тест"
+  strio.pos  # => 8
+
+<b>Argument +sep+</b>
+
+With only string argument +sep+ given, reads a line using that string as the record separator:
+
+  strio = StringIO.new(TEXT)
+  strio.gets(' ') # => "First "
+  strio.gets(' ') # => "line\nSecond "
+  strio.gets(' ') # => "line\n\nFourth "
+
+<b>Argument +limit+</b>
+
+With only integer argument +limit+ given,
+reads a line using the default record separator;
+limits the size (in characters) of each line to the given limit:
+
+  strio = StringIO.new(TEXT)
+  strio.gets(10) # => "First line"
+  strio.gets(10) # => "\n"
+  strio.gets(10) # => "Second lin"
+  strio.gets(10) # => "e\n"
+
+<b>Arguments +sep+ and +limit+</b>
+
+With arguments +sep+ and +limit+ both given, honors both:
+
+  strio = StringIO.new(TEXT)
+  strio.gets(' ', 10) # => "First "
+  strio.gets(' ', 10) # => "line\nSecon"
+  strio.gets(' ', 10) # => "d "
+
+<b>Position</b>
+
+As stated above, method +gets+ reads and returns the next line in the stream.
+
+In the examples above each +strio+ object starts with its position at beginning-of-stream;
+but in other cases the position may be anywhere:
+
+  strio = StringIO.new(TEXT)
+  strio.pos = 12
+  strio.gets # => "econd line\n"
+
+The position need not be at a character boundary:
+
+  strio = StringIO.new('тест')  # Four 2-byte characters.
+  strio.pos = 2                 # At beginning of second character.
+  strio.gets # => "ест"
+  strio.pos = 3                 # In middle of second character.
+  strio.gets # => "\xB5ст"
+
+<b>Special Record Separators</b>
+
+Like some methods in class IO, method +gets+ honors two special record separators;
+see {Special Line Separators}[https://docs.ruby-lang.org/en/master/IO.html#class-IO-label-Special+Line+Separator+Values]:
+
+  strio = StringIO.new(TEXT)
+  strio.gets('')  # Read "paragraph" (up to empty line).
+  # => "First line\nSecond line\n\n"
+
+  strio = StringIO.new(TEXT)
+  strio.gets(nil) # "Slurp": read all.
+  # => "First line\nSecond line\n\nFourth line\nFifth line\n"
+
+<b>Keyword Argument +chomp+</b>
+
+With keyword argument +chomp+ given as +true+ (the default is +false+),
+removes the trailing newline (if any) from the returned line:
+
+  strio = StringIO.new(TEXT)
+  strio.gets              # => "First line\n"
+  strio.gets(chomp: true) # => "Second line"
+
+Related: StringIO#each_line.
diff --git a/ext/io/wait/io-wait.gemspec b/ext/io/wait/io-wait.gemspec
index 1554dcdb304bef..44e6b65142e2d3 100644
--- a/ext/io/wait/io-wait.gemspec
+++ b/ext/io/wait/io-wait.gemspec
@@ -15,20 +15,20 @@ Gem::Specification.new do |spec|
   spec.metadata["homepage_uri"] = spec.homepage
   spec.metadata["source_code_uri"] = spec.homepage
 
-  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
-    `git ls-files -z`.split("\x0").reject do |f|
-      File.identical?(f, __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features|rakelib)/|\.(?:git|travis|circleci)|appveyor|Rakefile)})
-    end
-  end
+  jruby = true if Gem::Platform.new('java') =~ spec.platform or RUBY_ENGINE == 'jruby'
+  dir, gemspec = File.split(__FILE__)
+  excludes = [
+    *%w[:^/.git* :^/Gemfile* :^/Rakefile* :^/bin/ :^/test/ :^/rakelib/ :^*.java],
+    *(jruby ? %w[:^/ext/io] : %w[:^/ext/java]),
+    ":(exclude,literal,top)#{gemspec}"
+  ]
+  files = IO.popen(%w[git ls-files -z --] + excludes, chdir: dir, &:read).split("\x0")
+
+  spec.files = files
   spec.bindir        = "exe"
   spec.executables   = []
   spec.require_paths = ["lib"]
 
-  jruby = true if Gem::Platform.new('java') =~ spec.platform or RUBY_ENGINE == 'jruby'
-  spec.files.delete_if do |f|
-    f.end_with?(".java") or
-      f.start_with?("ext/") && (jruby ^ f.start_with?("ext/java/"))
-  end
   if jruby
     spec.platform = 'java'
     spec.files << "lib/io/wait.jar"
diff --git a/ext/stringio/stringio.c b/ext/stringio/stringio.c
index cf3e06a71f130e..d66768a2c50279 100644
--- a/ext/stringio/stringio.c
+++ b/ext/stringio/stringio.c
@@ -990,10 +990,10 @@ strio_getc(VALUE self)
 
 /*
  * call-seq:
- *   getbyte -> byte or nil
+ *   getbyte -> integer or nil
+ *
+ * :include: stringio/getbyte.rdoc
  *
- * Reads and returns the next 8-bit byte from the stream;
- * see {Byte IO}[rdoc-ref:IO@Byte+IO].
  */
 static VALUE
 strio_getbyte(VALUE self)
@@ -1428,9 +1428,8 @@ strio_getline(struct getline_arg *arg, struct StringIO *ptr)
  *   gets(limit, chomp: false) -> string or nil
  *   gets(sep, limit, chomp: false) -> string or nil
  *
- * Reads and returns a line from the stream;
- * assigns the return value to <tt>$_</tt>;
- * see {Line IO}[rdoc-ref:IO@Line+IO].
+ * :include: stringio/gets.rdoc
+ *
  */
 static VALUE
 strio_gets(int argc, VALUE *argv, VALUE self)
diff --git a/lib/rubygems/package/tar_header.rb b/lib/rubygems/package/tar_header.rb
index dd20d65080ff00..0ebcbd789d4038 100644
--- a/lib/rubygems/package/tar_header.rb
+++ b/lib/rubygems/package/tar_header.rb
@@ -56,7 +56,7 @@ class Gem::Package::TarHeader
   ##
   # Pack format for a tar header
 
-  PACK_FORMAT = ("a100" + # name
+  PACK_FORMAT = "a100" + # name
                 "a8"   + # mode
                 "a8"   + # uid
                 "a8"   + # gid
@@ -71,12 +71,12 @@ class Gem::Package::TarHeader
                 "a32"  + # gname
                 "a8"   + # devmajor
                 "a8"   + # devminor
-                "a155").freeze # prefix
+                "a155"   # prefix
 
   ##
   # Unpack format for a tar header
 
-  UNPACK_FORMAT = ("A100" + # name
+  UNPACK_FORMAT = "A100" + # name
                   "A8"   + # mode
                   "A8"   + # uid
                   "A8"   + # gid
@@ -91,7 +91,7 @@ class Gem::Package::TarHeader
                   "A32"  + # gname
                   "A8"   + # devmajor
                   "A8"   + # devminor
-                  "A155").freeze # prefix
+                  "A155"   # prefix
 
   attr_reader(*FIELDS)
 
diff --git a/lib/uri/mailto.rb b/lib/uri/mailto.rb
index f747b79ec753b3..cb8024f301fc47 100644
--- a/lib/uri/mailto.rb
+++ b/lib/uri/mailto.rb
@@ -52,11 +52,7 @@ class MailTo < Generic
     HEADER_REGEXP  = /\A(?<hfield>(?:%\h\h|[!$'-.0-;@-Z_a-z~])*=(?:%\h\h|[!$'-.0-;@-Z_a-z~])*)(?:&\g<hfield>)*\z/
     # practical regexp for email address
     # https://html.spec.whatwg.org/multipage/input.html#valid-e-mail-address
-    EMAIL_REGEXP = %r[\A#{
-      atext = %q[(?:[a-zA-Z0-9!\#$%&'*+\/=?^_`{|}~-]+)]
-    }(?:\.#{atext})*@#{
-      label = %q[(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)]
-    }(?:\.#{label})*\z]
+    EMAIL_REGEXP = /\A[a-zA-Z0-9.!\#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*\z/
     # :startdoc:
 
     #
diff --git a/test/rubygems/test_gem_package_tar_header.rb b/test/rubygems/test_gem_package_tar_header.rb
index 34f92967e9905b..a3f95bb7704f91 100644
--- a/test/rubygems/test_gem_package_tar_header.rb
+++ b/test/rubygems/test_gem_package_tar_header.rb
@@ -26,25 +26,6 @@ def setup
     @tar_header = Gem::Package::TarHeader.new header
   end
 
-  def test_decode_in_ractor
-    new_header = Ractor.new(@tar_header.to_s) do |str|
-      Gem::Package::TarHeader.from StringIO.new str
-    end.value
-
-    assert_headers_equal @tar_header, new_header
-  end if defined?(Ractor) && Ractor.instance_methods.include?(:value)
-
-  def test_encode_in_ractor
-    header_bytes = @tar_header.to_s
-
-    new_header = Ractor.new(header_bytes) do |str|
-      header = Gem::Package::TarHeader.from StringIO.new str
-      header.to_s
-    end.value
-
-    assert_headers_equal header_bytes, new_header
-  end if defined?(Ractor) && Ractor.instance_methods.include?(:value)
-
   def test_self_from
     io = TempIO.new @tar_header.to_s
 
diff --git a/test/uri/test_mailto.rb b/test/uri/test_mailto.rb
index 59bb5ded09b9a9..6cd33529784796 100644
--- a/test/uri/test_mailto.rb
+++ b/test/uri/test_mailto.rb
@@ -145,29 +145,23 @@ def test_check_to
     u.to = 'a@valid.com'
     assert_equal(u.to, 'a@valid.com')
 
-    # Invalid emails
-    assert_raise(URI::InvalidComponentError) do
-      u.to = '#1@mail.com'
-    end
+    # Intentionally allowed violations of RFC 5322
+    u.to = 'a..a@valid.com'
+    assert_equal(u.to, 'a..a@valid.com')
 
-    assert_raise(URI::InvalidComponentError) do
-      u.to = '@invalid.email'
-    end
+    u.to = 'hello.@valid.com'
+    assert_equal(u.to, 'hello.@valid.com')
 
-    assert_raise(URI::InvalidComponentError) do
-      u.to = '.hello@invalid.email'
-    end
-
-    assert_raise(URI::InvalidComponentError) do
-      u.to = 'hello.@invalid.email'
-    end
+    u.to = '.hello@valid.com'
+    assert_equal(u.to, '.hello@valid.com')
 
+    # Invalid emails
     assert_raise(URI::InvalidComponentError) do
-      u.to = 'n.@invalid.email'
+      u.to = '#1@mail.com'
     end
 
     assert_raise(URI::InvalidComponentError) do
-      u.to = 'n..t@invalid.email'
+      u.to = '@invalid.email'
     end
 
     # Invalid host emails
diff --git a/zjit/src/asm/x86_64/mod.rs b/zjit/src/asm/x86_64/mod.rs
index 9d3bf18dcdab41..cfedca4540361d 100644
--- a/zjit/src/asm/x86_64/mod.rs
+++ b/zjit/src/asm/x86_64/mod.rs
@@ -779,13 +779,6 @@ pub fn imul(cb: &mut CodeBlock, opnd0: X86Opnd, opnd1: X86Opnd) {
             write_rm(cb, false, true, opnd0, opnd1, None, &[0x0F, 0xAF]);
         }
 
-        // Flip the operands to handle this case. This instruction has weird encoding restrictions.
-        (X86Opnd::Mem(_), X86Opnd::Reg(_)) => {
-            //REX.W + 0F AF /rIMUL r64, r/m64
-            // Quadword register := Quadword register * r/m64.
-            write_rm(cb, false, true, opnd1, opnd0, None, &[0x0F, 0xAF]);
-        }
-
         _ => unreachable!()
     }
 }
diff --git a/zjit/src/asm/x86_64/tests.rs b/zjit/src/asm/x86_64/tests.rs
index 0f867259466de9..d574bdb0341066 100644
--- a/zjit/src/asm/x86_64/tests.rs
+++ b/zjit/src/asm/x86_64/tests.rs
@@ -228,22 +228,28 @@ fn test_cqo() {
 fn test_imul() {
     let cb1 = compile(|cb| imul(cb, RAX, RBX));
     let cb2 = compile(|cb| imul(cb, RDX, mem_opnd(64, RAX, 0)));
-    // Operands flipped for encoding since multiplication is commutative
-    let cb3 = compile(|cb| imul(cb, mem_opnd(64, RAX, 0), RDX));
 
-    assert_disasm_snapshot!(disasms!(cb1, cb2, cb3), @r"
+    assert_disasm_snapshot!(disasms!(cb1, cb2), @r"
     0x0: imul rax, rbx
     0x0: imul rdx, qword ptr [rax]
-    0x0: imul rdx, qword ptr [rax]
     ");
 
-    assert_snapshot!(hexdumps!(cb1, cb2, cb3), @r"
+    assert_snapshot!(hexdumps!(cb1, cb2), @r"
     480fafc3
     480faf10
-    480faf10
     ");
 }
 
+#[test]
+#[should_panic]
+fn test_imul_mem_reg() {
+    // imul doesn't have (Mem, Reg) encoding. Since multiplication is commutative, imul() could
+    // swap operands. However, x86_scratch_split may need to move the result to the output operand,
+    // which can be complicated if the assembler may sometimes change the result operand.
+    // So x86_scratch_split should be responsible for that swap, not the assembler.
+    compile(|cb| imul(cb, mem_opnd(64, RAX, 0), RDX));
+}
+
 #[test]
 fn test_jge_label() {
     let cb = compile(|cb| {
diff --git a/zjit/src/backend/arm64/mod.rs b/zjit/src/backend/arm64/mod.rs
index d762b14c911503..acf0576f9c80be 100644
--- a/zjit/src/backend/arm64/mod.rs
+++ b/zjit/src/backend/arm64/mod.rs
@@ -79,6 +79,9 @@ impl From<Opnd> for A64Opnd {
             Opnd::Mem(Mem { base: MemBase::VReg(_), .. }) => {
                 panic!("attempted to lower an Opnd::Mem with a MemBase::VReg base")
             },
+            Opnd::Mem(Mem { base: MemBase::Stack { .. }, .. }) => {
+                panic!("attempted to lower an Opnd::Mem with a MemBase::Stack base")
+            },
             Opnd::VReg { .. } => panic!("attempted to lower an Opnd::VReg"),
             Opnd::Value(_) => panic!("attempted to lower an Opnd::Value"),
             Opnd::None => panic!(
@@ -203,6 +206,7 @@ pub const ALLOC_REGS: &[Reg] = &[
 /// [`Assembler::arm64_scratch_split`] or [`Assembler::new_with_scratch_reg`].
 const SCRATCH0_OPND: Opnd = Opnd::Reg(X15_REG);
 const SCRATCH1_OPND: Opnd = Opnd::Reg(X17_REG);
+const SCRATCH2_OPND: Opnd = Opnd::Reg(X14_REG);
 
 impl Assembler {
     /// Special register for intermediate processing in arm64_emit. It should be used only by arm64_emit.
@@ -690,22 +694,129 @@ impl Assembler {
     /// need to be split with registers after `alloc_regs`, e.g. for `compile_exits`, so this
     /// splits them and uses scratch registers for it.
     fn arm64_scratch_split(self) -> Assembler {
-        let mut asm = Assembler::new_with_asm(&self);
+        /// If opnd is Opnd::Mem with a too large disp, make the disp smaller using lea.
+        fn split_large_disp(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd {
+            match opnd {
+                Opnd::Mem(Mem { num_bits, disp, .. }) if !mem_disp_fits_bits(disp) => {
+                    asm.lea_into(scratch_opnd, opnd);
+                    Opnd::mem(num_bits, scratch_opnd, 0)
+                }
+                _ => opnd,
+            }
+        }
+
+        /// If opnd is Opnd::Mem with MemBase::Stack, lower it to Opnd::Mem with MemBase::Reg, and split a large disp.
+        fn split_stack_membase(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd, stack_state: &StackState) -> Opnd {
+            let opnd = split_only_stack_membase(asm, opnd, scratch_opnd, stack_state);
+            split_large_disp(asm, opnd, scratch_opnd)
+        }
+
+        /// split_stack_membase but without split_large_disp. This should be used only by lea.
+        fn split_only_stack_membase(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd, stack_state: &StackState) -> Opnd {
+            if let Opnd::Mem(Mem { base: stack_membase @ MemBase::Stack { .. }, disp: opnd_disp, num_bits: opnd_num_bits }) = opnd {
+                let base = Opnd::Mem(stack_state.stack_membase_to_mem(stack_membase));
+                let base = split_large_disp(asm, base, scratch_opnd);
+                asm.load_into(scratch_opnd, base);
+                Opnd::Mem(Mem { base: MemBase::Reg(scratch_opnd.unwrap_reg().reg_no), disp: opnd_disp, num_bits: opnd_num_bits })
+            } else {
+                opnd
+            }
+        }
+
+        /// If opnd is Opnd::Mem, lower it to scratch_opnd. You should use this when `opnd` is read by the instruction, not written.
+        fn split_memory_read(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd {
+            if let Opnd::Mem(_) = opnd {
+                let opnd = split_large_disp(asm, opnd, scratch_opnd);
+                let scratch_opnd = opnd.num_bits().map(|num_bits| scratch_opnd.with_num_bits(num_bits)).unwrap_or(scratch_opnd);
+                asm.load_into(scratch_opnd, opnd);
+                scratch_opnd
+            } else {
+                opnd
+            }
+        }
+
+        /// If opnd is Opnd::Mem, replace *opnd with scratch_opnd. Return Some(Opnd::Mem) if it needs to be written back from scratch_opnd.
+        fn split_memory_write(opnd: &mut Opnd, scratch_opnd: Opnd) -> Option<Opnd> {
+            if let Opnd::Mem(_) = opnd {
+                let mem_opnd = opnd.clone();
+                *opnd = opnd.num_bits().map(|num_bits| scratch_opnd.with_num_bits(num_bits)).unwrap_or(scratch_opnd);
+                Some(mem_opnd)
+            } else {
+                None
+            }
+        }
+
+        // Prepare StackState to lower MemBase::Stack
+        let stack_state = StackState::new(self.stack_base_idx);
+
+        let mut asm_local = Assembler::new_with_asm(&self);
+        let asm = &mut asm_local;
         asm.accept_scratch_reg = true;
         let mut iterator = self.insns.into_iter().enumerate().peekable();
 
         while let Some((_, mut insn)) = iterator.next() {
             match &mut insn {
-                &mut Insn::Mul { out, .. } => {
+                Insn::Add { left, right, out } |
+                Insn::Sub { left, right, out } |
+                Insn::And { left, right, out } |
+                Insn::Or { left, right, out } |
+                Insn::Xor { left, right, out } |
+                Insn::CSelZ  { truthy: left, falsy: right, out } |
+                Insn::CSelNZ { truthy: left, falsy: right, out } |
+                Insn::CSelE  { truthy: left, falsy: right, out } |
+                Insn::CSelNE { truthy: left, falsy: right, out } |
+                Insn::CSelL  { truthy: left, falsy: right, out } |
+                Insn::CSelLE { truthy: left, falsy: right, out } |
+                Insn::CSelG  { truthy: left, falsy: right, out } |
+                Insn::CSelGE { truthy: left, falsy: right, out } => {
+                    *left = split_memory_read(asm, *left, SCRATCH0_OPND);
+                    *right = split_memory_read(asm, *right, SCRATCH1_OPND);
+                    let mem_out = split_memory_write(out, SCRATCH0_OPND);
+
+                    asm.push_insn(insn);
+
+                    if let Some(mem_out) = mem_out {
+                        let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND);
+                        asm.store(mem_out, SCRATCH0_OPND);
+                    }
+                }
+                Insn::Mul { left, right, out } => {
+                    *left = split_memory_read(asm, *left, SCRATCH0_OPND);
+                    *right = split_memory_read(asm, *right, SCRATCH1_OPND);
+                    let mem_out = split_memory_write(out, SCRATCH0_OPND);
+                    let reg_out = out.clone();
+
                     asm.push_insn(insn);
 
+                    if let Some(mem_out) = mem_out {
+                        let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND);
+                        asm.store(mem_out, SCRATCH0_OPND);
+                    };
+
                     // If the next instruction is JoMul
                     if matches!(iterator.peek(), Some((_, Insn::JoMul(_)))) {
                         // Produce a register that is all zeros or all ones
                         // Based on the sign bit of the 64-bit mul result
-                        asm.push_insn(Insn::RShift { out: SCRATCH0_OPND, opnd: out, shift: Opnd::UImm(63) });
+                        asm.push_insn(Insn::RShift { out: SCRATCH0_OPND, opnd: reg_out, shift: Opnd::UImm(63) });
+                    }
+                }
+                Insn::RShift { opnd, out, .. } => {
+                    *opnd = split_memory_read(asm, *opnd, SCRATCH0_OPND);
+                    let mem_out = split_memory_write(out, SCRATCH0_OPND);
+
+                    asm.push_insn(insn);
+
+                    if let Some(mem_out) = mem_out {
+                        let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND);
+                        asm.store(mem_out, SCRATCH0_OPND);
                     }
                 }
+                Insn::Cmp { left, right } |
+                Insn::Test { left, right } => {
+                    *left = split_memory_read(asm, *left, SCRATCH0_OPND);
+                    *right = split_memory_read(asm, *right, SCRATCH1_OPND);
+                    asm.push_insn(insn);
+                }
                 // For compile_exits, support splitting simple C arguments here
                 Insn::CCall { opnds, .. } if !opnds.is_empty() => {
                     for (i, opnd) in opnds.iter().enumerate() {
@@ -714,16 +825,32 @@ impl Assembler {
                     *opnds = vec![];
                     asm.push_insn(insn);
                 }
-                &mut Insn::Lea { opnd, out } => {
-                    match (opnd, out) {
-                        // Split here for compile_exits
-                        (Opnd::Mem(_), Opnd::Mem(_)) => {
-                            asm.lea_into(SCRATCH0_OPND, opnd);
-                            asm.store(out, SCRATCH0_OPND);
-                        }
-                        _ => {
-                            asm.push_insn(insn);
+                Insn::Lea { opnd, out } => {
+                    *opnd = split_only_stack_membase(asm, *opnd, SCRATCH0_OPND, &stack_state);
+                    let mem_out = split_memory_write(out, SCRATCH0_OPND);
+
+                    asm.push_insn(insn);
+
+                    if let Some(mem_out) = mem_out {
+                        let mem_out = split_large_disp(asm, mem_out, SCRATCH1_OPND);
+                        asm.store(mem_out, SCRATCH0_OPND);
+                    }
+                }
+                Insn::Load { opnd, out } |
+                Insn::LoadInto { opnd, dest: out } => {
+                    *opnd = split_stack_membase(asm, *opnd, SCRATCH0_OPND, &stack_state);
+                    *out = split_stack_membase(asm, *out, SCRATCH1_OPND, &stack_state);
+
+                    if let Opnd::Mem(_) = out {
+                        // If NATIVE_STACK_PTR is used as a source for Store, it's handled as xzr, storing zero.
+                        // To save the content of NATIVE_STACK_PTR, we need to load it into another register first.
+                        if *opnd == NATIVE_STACK_PTR {
+                            asm.load_into(SCRATCH0_OPND, NATIVE_STACK_PTR);
+                            *opnd = SCRATCH0_OPND;
                         }
+                        asm.store(*out, *opnd);
+                    } else {
+                        asm.push_insn(insn);
                     }
                 }
                 &mut Insn::IncrCounter { mem, value } => {
@@ -741,31 +868,24 @@ impl Assembler {
                     asm.cmp(SCRATCH1_OPND, 0.into());
                     asm.jne(label);
                 }
-                &mut Insn::Store { dest, src } => {
-                    let Opnd::Mem(Mem { num_bits: dest_num_bits, disp: dest_disp, .. }) = dest else {
-                        panic!("Insn::Store destination must be Opnd::Mem: {dest:?}, {src:?}");
-                    };
-
-                    // Split dest using a scratch register if necessary.
-                    let dest = if mem_disp_fits_bits(dest_disp) {
-                        dest
-                    } else {
-                        asm.lea_into(SCRATCH0_OPND, dest);
-                        Opnd::mem(dest_num_bits, SCRATCH0_OPND, 0)
-                    };
-
-                    asm.store(dest, src);
+                Insn::Store { dest, .. } => {
+                    *dest = split_stack_membase(asm, *dest, SCRATCH0_OPND, &stack_state);
+                    asm.push_insn(insn);
                 }
-                &mut Insn::Mov { dest, src } => {
+                Insn::Mov { dest, src } => {
+                    *src = split_stack_membase(asm, *src, SCRATCH0_OPND, &stack_state);
+                    *dest = split_large_disp(asm, *dest, SCRATCH1_OPND);
                     match dest {
-                        Opnd::Reg(_) => asm.load_into(dest, src),
-                        Opnd::Mem(_) => asm.store(dest, src),
+                        Opnd::Reg(_) => asm.load_into(*dest, *src),
+                        Opnd::Mem(_) => asm.store(*dest, *src),
                         _ => asm.push_insn(insn),
                     }
                 }
                 // Resolve ParallelMov that couldn't be handled without a scratch register.
                 Insn::ParallelMov { moves } => {
                     for (dst, src) in Self::resolve_parallel_moves(moves, Some(SCRATCH0_OPND)).unwrap() {
+                        let src = split_stack_membase(asm, src, SCRATCH1_OPND, &stack_state);
+                        let dst = split_large_disp(asm, dst, SCRATCH2_OPND);
                         match dst {
                             Opnd::Reg(_) => asm.load_into(dst, src),
                             Opnd::Mem(_) => asm.store(dst, src),
@@ -779,7 +899,7 @@ impl Assembler {
             }
         }
 
-        asm
+        asm_local
     }
 
     /// Emit platform-specific machine code
@@ -1157,10 +1277,11 @@ impl Assembler {
                                 load_effective_address(cb, Self::EMIT_OPND, src_base_reg_no, src_disp);
                                 A64Opnd::new_mem(dest.rm_num_bits(), Self::EMIT_OPND, 0)
                             };
+                            let dst = A64Opnd::Reg(Self::EMIT_REG.with_num_bits(src_num_bits));
                             match src_num_bits {
-                                64 | 32 => ldur(cb, Self::EMIT_OPND, src_mem),
-                                16 => ldurh(cb, Self::EMIT_OPND, src_mem),
-                                8 => ldurb(cb, Self::EMIT_OPND, src_mem),
+                                64 | 32 => ldur(cb, dst, src_mem),
+                                16 => ldurh(cb, dst, src_mem),
+                                8 => ldurb(cb, dst, src_mem),
                                 num_bits => panic!("unexpected num_bits: {num_bits}")
                             };
                             Self::EMIT_REG
diff --git a/zjit/src/backend/lir.rs b/zjit/src/backend/lir.rs
index 584251de802bf2..66e89a1304d715 100644
--- a/zjit/src/backend/lir.rs
+++ b/zjit/src/backend/lir.rs
@@ -28,8 +28,12 @@ pub static JIT_PRESERVED_REGS: &[Opnd] = &[CFP, SP, EC];
 #[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum MemBase
 {
+    /// Register: Every Opnd::Mem should have MemBase::Reg as of emit.
     Reg(u8),
+    /// Virtual register: Lowered to MemBase::Reg or MemBase::Stack in alloc_regs.
     VReg(usize),
+    /// Stack slot: Lowered to MemBase::Reg in scratch_split.
+    Stack { stack_idx: usize, num_bits: u8 },
 }
 
 // Memory location
@@ -55,6 +59,8 @@ impl fmt::Display for Mem {
         match self.base {
             MemBase::Reg(reg_no) => write!(f, "{}", mem_base_reg(reg_no))?,
             MemBase::VReg(idx) => write!(f, "v{idx}")?,
+            MemBase::Stack { stack_idx, num_bits } if num_bits == 64 => write!(f, "Stack[{stack_idx}]")?,
+            MemBase::Stack { stack_idx, num_bits } => write!(f, "Stack{num_bits}[{stack_idx}]")?,
         }
         if self.disp != 0 {
             let sign = if self.disp > 0 { '+' } else { '-' };
@@ -1143,6 +1149,81 @@ impl LiveRange {
     }
 }
 
+/// StackState manages which stack slots are used by which VReg
+pub struct StackState {
+    /// The maximum number of spilled VRegs at a time
+    stack_size: usize,
+    /// Map from index at the C stack for spilled VRegs to Some(vreg_idx) if allocated
+    stack_slots: Vec<Option<usize>>,
+    /// Copy of Assembler::stack_base_idx. Used for calculating stack slot offsets.
+    stack_base_idx: usize,
+}
+
+impl StackState {
+    /// Initialize a stack allocator
+    pub(super) fn new(stack_base_idx: usize) -> Self {
+        StackState {
+            stack_size: 0,
+            stack_slots: vec![],
+            stack_base_idx,
+        }
+    }
+
+    /// Allocate a stack slot for a given vreg_idx
+    fn alloc_stack(&mut self, vreg_idx: usize) -> Opnd {
+        for stack_idx in 0..self.stack_size {
+            if self.stack_slots[stack_idx].is_none() {
+                self.stack_slots[stack_idx] = Some(vreg_idx);
+                return Opnd::mem(64, NATIVE_BASE_PTR, self.stack_idx_to_disp(stack_idx));
+            }
+        }
+        // Every stack slot is in use. Allocate a new stack slot.
+        self.stack_size += 1;
+        self.stack_slots.push(Some(vreg_idx));
+        Opnd::mem(64, NATIVE_BASE_PTR, self.stack_idx_to_disp(self.stack_slots.len() - 1))
+    }
+
+    /// Deallocate a stack slot for a given disp
+    fn dealloc_stack(&mut self, disp: i32) {
+        let stack_idx = self.disp_to_stack_idx(disp);
+        if self.stack_slots[stack_idx].is_some() {
+            self.stack_slots[stack_idx] = None;
+        }
+    }
+
+    /// Convert the `disp` of a stack slot operand to the stack index
+    fn disp_to_stack_idx(&self, disp: i32) -> usize {
+        (-disp / SIZEOF_VALUE_I32) as usize - self.stack_base_idx - 1
+    }
+
+    /// Convert a stack index to the `disp` of the stack slot
+    fn stack_idx_to_disp(&self, stack_idx: usize) -> i32 {
+        (self.stack_base_idx + stack_idx + 1) as i32 * -SIZEOF_VALUE_I32
+    }
+
+    /// Convert Mem to MemBase::Stack
+    fn mem_to_stack_membase(&self, mem: Mem) -> MemBase {
+        match mem {
+            Mem { base: MemBase::Reg(reg_no), disp, num_bits } if NATIVE_BASE_PTR.unwrap_reg().reg_no == reg_no => {
+                let stack_idx = self.disp_to_stack_idx(disp);
+                MemBase::Stack { stack_idx, num_bits }
+            }
+            _ => unreachable!(),
+        }
+    }
+
+    /// Convert MemBase::Stack to Mem
+    pub(super) fn stack_membase_to_mem(&self, membase: MemBase) -> Mem {
+        match membase {
+            MemBase::Stack { stack_idx, num_bits } => {
+                let disp = self.stack_idx_to_disp(stack_idx);
+                Mem { base: MemBase::Reg(NATIVE_BASE_PTR.unwrap_reg().reg_no), disp, num_bits }
+            }
+            _ => unreachable!(),
+        }
+    }
+}
+
 /// RegisterPool manages which registers are used by which VReg
 struct RegisterPool {
     /// List of registers that can be allocated
@@ -1155,45 +1236,54 @@ struct RegisterPool {
     /// The number of live registers.
     /// Provides a quick way to query `pool.filter(|r| r.is_some()).count()`
     live_regs: usize,
+
+    /// Fallback to let StackState allocate stack slots when RegisterPool runs out of registers.
+    stack_state: StackState,
 }
 
 impl RegisterPool {
     /// Initialize a register pool
-    fn new(regs: Vec<Reg>) -> Self {
+    fn new(regs: Vec<Reg>, stack_base_idx: usize) -> Self {
         let pool = vec![None; regs.len()];
         RegisterPool {
             regs,
             pool,
             live_regs: 0,
+            stack_state: StackState::new(stack_base_idx),
         }
     }
 
     /// Mutate the pool to indicate that the register at the index
     /// has been allocated and is live.
-    fn alloc_reg(&mut self, vreg_idx: usize) -> Option<Reg> {
+    fn alloc_opnd(&mut self, vreg_idx: usize) -> Opnd {
         for (reg_idx, reg) in self.regs.iter().enumerate() {
             if self.pool[reg_idx].is_none() {
                 self.pool[reg_idx] = Some(vreg_idx);
                 self.live_regs += 1;
-                return Some(*reg);
+                return Opnd::Reg(*reg);
             }
         }
-        None
+        self.stack_state.alloc_stack(vreg_idx)
     }
 
     /// Allocate a specific register
-    fn take_reg(&mut self, reg: &Reg, vreg_idx: usize) -> Reg {
+    fn take_reg(&mut self, reg: &Reg, vreg_idx: usize) -> Opnd {
         let reg_idx = self.regs.iter().position(|elem| elem.reg_no == reg.reg_no)
             .unwrap_or_else(|| panic!("Unable to find register: {}", reg.reg_no));
         assert_eq!(self.pool[reg_idx], None, "register already allocated for VReg({:?})", self.pool[reg_idx]);
         self.pool[reg_idx] = Some(vreg_idx);
         self.live_regs += 1;
-        *reg
+        Opnd::Reg(*reg)
     }
 
     // Mutate the pool to indicate that the given register is being returned
     // as it is no longer used by the instruction that previously held it.
-    fn dealloc_reg(&mut self, reg: &Reg) {
+    fn dealloc_opnd(&mut self, opnd: &Opnd) {
+        if let Opnd::Mem(Mem { disp, .. }) = *opnd {
+            return self.stack_state.dealloc_stack(disp);
+        }
+
+        let reg = opnd.unwrap_reg();
         let reg_idx = self.regs.iter().position(|elem| elem.reg_no == reg.reg_no)
             .unwrap_or_else(|| panic!("Unable to find register: {}", reg.reg_no));
         if self.pool[reg_idx].is_some() {
@@ -1428,61 +1518,40 @@ impl Assembler
     /// registers because their output is used as the operand on a subsequent
     /// instruction. This is our implementation of the linear scan algorithm.
     pub(super) fn alloc_regs(mut self, regs: Vec<Reg>) -> Result<Assembler, CompileError> {
-        // Dump live registers for register spill debugging.
-        fn dump_live_regs(insns: Vec<Insn>, live_ranges: Vec<LiveRange>, num_regs: usize, spill_index: usize) {
-            // Convert live_ranges to live_regs: the number of live registers at each index
-            let mut live_regs: Vec<usize> = vec![];
-            for insn_idx in 0..insns.len() {
-                let live_count = live_ranges.iter().filter(|range|
-                    match (range.start, range.end) {
-                        (Some(start), Some(end)) => start <= insn_idx && insn_idx <= end,
-                        _ => false,
-                    }
-                ).count();
-                live_regs.push(live_count);
-            }
-
-            // Dump insns along with live registers
-            for (insn_idx, insn) in insns.iter().enumerate() {
-                eprint!("{:3} ", if spill_index == insn_idx { "==>" } else { "" });
-                for reg in 0..=num_regs {
-                    eprint!("{:1}", if reg < live_regs[insn_idx] { "|" } else { "" });
-                }
-                eprintln!(" [{:3}] {:?}", insn_idx, insn);
-            }
-        }
-
         // First, create the pool of registers.
-        let mut pool = RegisterPool::new(regs.clone());
+        let mut pool = RegisterPool::new(regs.clone(), self.stack_base_idx);
 
-        // Mapping between VReg and allocated VReg for each VReg index.
-        // None if no register has been allocated for the VReg.
-        let mut reg_mapping: Vec<Option<Reg>> = vec![None; self.live_ranges.len()];
+        // Mapping between VReg and register or stack slot for each VReg index.
+        // None if no register or stack slot has been allocated for the VReg.
+        let mut vreg_opnd: Vec<Option<Opnd>> = vec![None; self.live_ranges.len()];
 
         // List of registers saved before a C call, paired with the VReg index.
         let mut saved_regs: Vec<(Reg, usize)> = vec![];
 
+        // Remember the indexes of Insn::FrameSetup to update the stack size later
+        let mut frame_setup_idxs: Vec<usize> = vec![];
+
         // live_ranges is indexed by original `index` given by the iterator.
         let mut asm = Assembler::new_with_asm(&self);
         let live_ranges: Vec<LiveRange> = take(&mut self.live_ranges);
         let mut iterator = self.insns.into_iter().enumerate().peekable();
 
         while let Some((index, mut insn)) = iterator.next() {
+            // Remember the index of FrameSetup to bump slot_count when we know the max number of spilled VRegs.
+            if let Insn::FrameSetup { .. } = insn {
+                frame_setup_idxs.push(asm.insns.len());
+            }
+
             let before_ccall = match (&insn, iterator.peek().map(|(_, insn)| insn)) {
                 (Insn::ParallelMov { .. }, Some(Insn::CCall { .. })) |
                 (Insn::CCall { .. }, _) if !pool.is_empty() => {
                     // If C_RET_REG is in use, move it to another register.
                     // This must happen before last-use registers are deallocated.
                     if let Some(vreg_idx) = pool.vreg_for(&C_RET_REG) {
-                        let new_reg = if let Some(new_reg) = pool.alloc_reg(vreg_idx) {
-                            new_reg
-                        } else {
-                            debug!("spilling VReg is not implemented yet, can't evacuate C_RET_REG on CCall");
-                            return Err(CompileError::RegisterSpillOnCCall);
-                        };
-                        asm.mov(Opnd::Reg(new_reg), C_RET_OPND);
-                        pool.dealloc_reg(&C_RET_REG);
-                        reg_mapping[vreg_idx] = Some(new_reg);
+                        let new_opnd = pool.alloc_opnd(vreg_idx);
+                        asm.mov(new_opnd, C_RET_OPND);
+                        pool.dealloc_opnd(&Opnd::Reg(C_RET_REG));
+                        vreg_opnd[vreg_idx] = Some(new_opnd);
                     }
 
                     true
@@ -1501,8 +1570,8 @@ impl Assembler
                         // uses this operand. If it is, we can return the allocated
                         // register to the pool.
                         if live_ranges[idx].end() == index {
-                            if let Some(reg) = reg_mapping[idx] {
-                                pool.dealloc_reg(&reg);
+                            if let Some(opnd) = vreg_opnd[idx] {
+                                pool.dealloc_opnd(&opnd);
                             } else {
                                 unreachable!("no register allocated for insn {:?}", insn);
                             }
@@ -1520,7 +1589,7 @@ impl Assembler
                 // Save live registers
                 for &(reg, _) in saved_regs.iter() {
                     asm.cpush(Opnd::Reg(reg));
-                    pool.dealloc_reg(&reg);
+                    pool.dealloc_opnd(&Opnd::Reg(reg));
                 }
                 // On x86_64, maintain 16-byte stack alignment
                 if cfg!(target_arch = "x86_64") && saved_regs.len() % 2 == 1 {
@@ -1560,7 +1629,7 @@ impl Assembler
 
                     if let Some(Opnd::VReg{ idx, .. }) = opnd_iter.next() {
                         if live_ranges[*idx].end() == index {
-                            if let Some(reg) = reg_mapping[*idx] {
+                            if let Some(Opnd::Reg(reg)) = vreg_opnd[*idx] {
                                 out_reg = Some(pool.take_reg(&reg, vreg_idx));
                             }
                         }
@@ -1569,23 +1638,7 @@ impl Assembler
 
                 // Allocate a new register for this instruction if one is not
                 // already allocated.
-                if out_reg.is_none() {
-                    out_reg = match pool.alloc_reg(vreg_idx) {
-                        Some(reg) => Some(reg),
-                        None => {
-                            if get_option!(debug) {
-                                let mut insns = asm.insns;
-                                insns.push(insn);
-                                for (_, insn) in iterator.by_ref() {
-                                    insns.push(insn);
-                                }
-                                dump_live_regs(insns, live_ranges, regs.len(), index);
-                            }
-                            debug!("Register spill not supported");
-                            return Err(CompileError::RegisterSpillOnAlloc);
-                        }
-                    };
-                }
+                let out_opnd = out_reg.unwrap_or_else(|| pool.alloc_opnd(vreg_idx));
 
                 // Set the output operand on the instruction
                 let out_num_bits = Opnd::match_num_bits_iter(insn.opnd_iter());
@@ -1594,9 +1647,9 @@ impl Assembler
                 // output operand on this instruction because the live range
                 // extends beyond the index of the instruction.
                 let out = insn.out_opnd_mut().unwrap();
-                let reg = out_reg.unwrap().with_num_bits(out_num_bits);
-                reg_mapping[out.vreg_idx()] = Some(reg);
-                *out = Opnd::Reg(reg);
+                let out_opnd = out_opnd.with_num_bits(out_num_bits);
+                vreg_opnd[out.vreg_idx()] = Some(out_opnd);
+                *out = out_opnd;
             }
 
             // Replace VReg and Param operands by their corresponding register
@@ -1604,11 +1657,15 @@ impl Assembler
             while let Some(opnd) = opnd_iter.next() {
                 match *opnd {
                     Opnd::VReg { idx, num_bits } => {
-                        *opnd = Opnd::Reg(reg_mapping[idx].unwrap()).with_num_bits(num_bits);
+                        *opnd = vreg_opnd[idx].unwrap().with_num_bits(num_bits);
                     },
                     Opnd::Mem(Mem { base: MemBase::VReg(idx), disp, num_bits }) => {
-                        let base = MemBase::Reg(reg_mapping[idx].unwrap().reg_no);
-                        *opnd = Opnd::Mem(Mem { base, disp, num_bits });
+                        *opnd = match vreg_opnd[idx].unwrap() {
+                            Opnd::Reg(reg) => Opnd::Mem(Mem { base: MemBase::Reg(reg.reg_no), disp, num_bits }),
+                            // If the base is spilled, lower it to MemBase::Stack, which scratch_split will lower to MemBase::Reg.
+                            Opnd::Mem(mem) => Opnd::Mem(Mem { base: pool.stack_state.mem_to_stack_membase(mem), disp, num_bits }),
+                            _ => unreachable!(),
+                        }
                     }
                     _ => {},
                 }
@@ -1618,8 +1675,8 @@ impl Assembler
             // register
             if let Some(idx) = vreg_idx {
                 if live_ranges[idx].end() == index {
-                    if let Some(reg) = reg_mapping[idx] {
-                        pool.dealloc_reg(&reg);
+                    if let Some(opnd) = vreg_opnd[idx] {
+                        pool.dealloc_opnd(&opnd);
                     } else {
                         unreachable!("no register allocated for insn {:?}", insn);
                     }
@@ -1671,6 +1728,16 @@ impl Assembler
             }
         }
 
+        // Extend the stack space for spilled operands
+        for frame_setup_idx in frame_setup_idxs {
+            match &mut asm.insns[frame_setup_idx] {
+                Insn::FrameSetup { slot_count, .. } => {
+                    *slot_count += pool.stack_state.stack_size;
+                }
+                _ => unreachable!(),
+            }
+        }
+
         assert!(pool.is_empty(), "Expected all registers to be returned to the pool");
         Ok(asm)
     }
diff --git a/zjit/src/backend/x86_64/mod.rs b/zjit/src/backend/x86_64/mod.rs
index 14c0df8dd02e04..1d5d90a856c92e 100644
--- a/zjit/src/backend/x86_64/mod.rs
+++ b/zjit/src/backend/x86_64/mod.rs
@@ -1,4 +1,4 @@
-use std::mem::take;
+use std::mem::{self, take};
 
 use crate::asm::*;
 use crate::asm::x86_64::*;
@@ -97,13 +97,13 @@ pub const ALLOC_REGS: &[Reg] = &[
     RCX_REG,
     R8_REG,
     R9_REG,
-    R10_REG,
     RAX_REG,
 ];
 
 /// Special scratch register for intermediate processing. It should be used only by
 /// [`Assembler::x86_scratch_split`] or [`Assembler::new_with_scratch_reg`].
 const SCRATCH0_OPND: Opnd = Opnd::Reg(R11_REG);
+const SCRATCH1_OPND: Opnd = Opnd::Reg(R10_REG);
 
 impl Assembler {
     /// Return an Assembler with scratch registers disabled in the backend, and a scratch register.
@@ -395,13 +395,13 @@ impl Assembler {
         /// without requiring more registers to be available in the register
         /// allocator. So we just use the SCRATCH0_OPND register temporarily to hold
         /// the value before we immediately use it.
-        fn split_64bit_immediate(asm: &mut Assembler, opnd: Opnd) -> Opnd {
+        fn split_64bit_immediate(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd) -> Opnd {
             match opnd {
                 Opnd::Imm(value) => {
                     // 32-bit values will be sign-extended
                     if imm_num_bits(value) > 32 {
-                        asm.mov(SCRATCH0_OPND, opnd);
-                        SCRATCH0_OPND
+                        asm.mov(scratch_opnd, opnd);
+                        scratch_opnd
                     } else {
                         opnd
                     }
@@ -409,8 +409,8 @@ impl Assembler {
                 Opnd::UImm(value) => {
                     // 32-bit values will be sign-extended
                     if imm_num_bits(value as i64) > 32 {
-                        asm.mov(SCRATCH0_OPND, opnd);
-                        SCRATCH0_OPND
+                        asm.mov(scratch_opnd, opnd);
+                        scratch_opnd
                     } else {
                         Opnd::Imm(value as i64)
                     }
@@ -419,23 +419,102 @@ impl Assembler {
             }
         }
 
-        let mut asm = Assembler::new_with_asm(&self);
+        /// If a given operand is Opnd::Mem and it uses MemBase::Stack, load the base from its stack slot into the scratch register and return the Mem rebased on it (MemBase::Reg). Otherwise return opnd as is.
+        fn split_stack_membase(asm: &mut Assembler, opnd: Opnd, scratch_opnd: Opnd, stack_state: &StackState) -> Opnd {
+            if let Opnd::Mem(Mem { base: stack_membase @ MemBase::Stack { .. }, disp, num_bits }) = opnd {
+                let base = Opnd::Mem(stack_state.stack_membase_to_mem(stack_membase));
+                asm.load_into(scratch_opnd, base);
+                Opnd::Mem(Mem { base: MemBase::Reg(scratch_opnd.unwrap_reg().reg_no), disp, num_bits })
+            } else {
+                opnd
+            }
+        }
+
+        /// If opnd is Opnd::Mem, replace *opnd with scratch_opnd (sized to opnd's num_bits, if any). Return Some(original Opnd::Mem) so the caller can write scratch_opnd back to it, or None if nothing was replaced.
+        fn split_memory_write(opnd: &mut Opnd, scratch_opnd: Opnd) -> Option<Opnd> {
+            if let Opnd::Mem(_) = opnd {
+                let mem_opnd = opnd.clone();
+                *opnd = opnd.num_bits().map(|num_bits| scratch_opnd.with_num_bits(num_bits)).unwrap_or(scratch_opnd);
+                Some(mem_opnd)
+            } else {
+                None
+            }
+        }
+
+        /// If both opnd and other are Opnd::Mem, load opnd into scratch_opnd (sized by opnd.rm_num_bits()) and return that scratch operand. Otherwise return opnd as is.
+        fn split_if_both_memory(asm: &mut Assembler, opnd: Opnd, other: Opnd, scratch_opnd: Opnd) -> Opnd {
+            if let (Opnd::Mem(_), Opnd::Mem(_)) = (opnd, other) {
+                asm.load_into(scratch_opnd.with_num_bits(opnd.rm_num_bits()), opnd);
+                scratch_opnd.with_num_bits(opnd.rm_num_bits())
+            } else {
+                opnd
+            }
+        }
+
+        /// Move src to dst, going through scratch_opnd (two moves) if it's a Mem-to-Mem move. Emit nothing if dst == src.
+        fn asm_mov(asm: &mut Assembler, dst: Opnd, src: Opnd, scratch_opnd: Opnd) {
+            if dst != src {
+                if let (Opnd::Mem(_), Opnd::Mem(_)) = (dst, src) {
+                    asm.mov(scratch_opnd, src);
+                    asm.mov(dst, scratch_opnd);
+                } else {
+                    asm.mov(dst, src);
+                }
+            }
+        }
+
+        // Prepare StackState to lower MemBase::Stack
+        let stack_state = StackState::new(self.stack_base_idx);
+
+        let mut asm_local = Assembler::new_with_asm(&self);
+        let asm = &mut asm_local;
         asm.accept_scratch_reg = true;
         let mut iterator = self.insns.into_iter().enumerate().peekable();
 
         while let Some((_, mut insn)) = iterator.next() {
             match &mut insn {
-                Insn::Add { right, .. } |
-                Insn::Sub { right, .. } |
-                Insn::Mul { right, .. } |
-                Insn::And { right, .. } |
-                Insn::Or { right, .. } |
-                Insn::Xor { right, .. } |
-                Insn::Test { right, .. } => {
-                    *right = split_64bit_immediate(&mut asm, *right);
+                Insn::Add { left, right, out } |
+                Insn::Sub { left, right, out } |
+                Insn::And { left, right, out } |
+                Insn::Or  { left, right, out } |
+                Insn::Xor { left, right, out } => {
+                    *left = split_stack_membase(asm, *left, SCRATCH0_OPND, &stack_state);
+                    *left = split_if_both_memory(asm, *left, *right, SCRATCH0_OPND);
+                    *right = split_stack_membase(asm, *right, SCRATCH1_OPND, &stack_state);
+                    *right = split_64bit_immediate(asm, *right, SCRATCH1_OPND);
+
+                    let (out, left) = (*out, *left);
+                    asm.push_insn(insn);
+                    asm_mov(asm, out, left, SCRATCH0_OPND);
+                }
+                Insn::Mul { left, right, out } => {
+                    *left = split_stack_membase(asm, *left, SCRATCH0_OPND, &stack_state);
+                    *left = split_if_both_memory(asm, *left, *right, SCRATCH0_OPND);
+                    *right = split_stack_membase(asm, *right, SCRATCH1_OPND, &stack_state);
+                    *right = split_64bit_immediate(asm, *right, SCRATCH1_OPND);
+
+                    // imul doesn't have (Mem, Reg) encoding. Swap left and right in that case.
+                    if let (Opnd::Mem(_), Opnd::Reg(_)) = (&left, &right) {
+                        mem::swap(left, right);
+                    }
+
+                    let (out, left) = (*out, *left);
                     asm.push_insn(insn);
+                    asm_mov(asm, out, left, SCRATCH0_OPND);
                 }
+                &mut Insn::Not { opnd, out } |
+                &mut Insn::LShift { opnd, out, .. } |
+                &mut Insn::RShift { opnd, out, .. } |
+                &mut Insn::URShift { opnd, out, .. } => {
+                    asm.push_insn(insn);
+                    asm_mov(asm, out, opnd, SCRATCH0_OPND);
+                }
+                Insn::Test { left, right } |
                 Insn::Cmp { left, right } => {
+                    *left = split_stack_membase(asm, *left, SCRATCH1_OPND, &stack_state);
+                    *right = split_stack_membase(asm, *right, SCRATCH0_OPND, &stack_state);
+                    *right = split_if_both_memory(asm, *right, *left, SCRATCH0_OPND);
+
                     let num_bits = match right {
                         Opnd::Imm(value) => Some(imm_num_bits(*value)),
                         Opnd::UImm(value) => Some(uimm_num_bits(*value)),
@@ -450,7 +529,7 @@ impl Assembler {
                     // directly in the instruction.
                     let use_imm = num_bits.is_some() && left.num_bits() == num_bits && num_bits.unwrap() < 64;
                     if !use_imm {
-                        *right = split_64bit_immediate(&mut asm, *right);
+                        *right = split_64bit_immediate(asm, *right, SCRATCH0_OPND);
                     }
                     asm.push_insn(insn);
                 }
@@ -462,16 +541,19 @@ impl Assembler {
                     *opnds = vec![];
                     asm.push_insn(insn);
                 }
-                &mut Insn::Lea { opnd, out } => {
-                    match (opnd, out) {
-                        // Split here for compile_exits
-                        (Opnd::Mem(_), Opnd::Mem(_)) => {
-                            asm.lea_into(SCRATCH0_OPND, opnd);
-                            asm.store(out, SCRATCH0_OPND);
-                        }
-                        _ => {
-                            asm.push_insn(insn);
-                        }
+                Insn::CSelZ { out, .. } |
+                Insn::CSelNZ { out, .. } |
+                Insn::CSelE { out, .. } |
+                Insn::CSelNE { out, .. } |
+                Insn::CSelL { out, .. } |
+                Insn::CSelLE { out, .. } |
+                Insn::CSelG { out, .. } |
+                Insn::CSelGE { out, .. } |
+                Insn::Lea { out, .. } => {
+                    let mem_out = split_memory_write(out, SCRATCH0_OPND);
+                    asm.push_insn(insn);
+                    if let Some(mem_out) = mem_out {
+                        asm.store(mem_out, SCRATCH0_OPND);
                     }
                 }
                 Insn::LeaJumpTarget { target, out } => {
@@ -480,6 +562,15 @@ impl Assembler {
                         asm.mov(*out, SCRATCH0_OPND);
                     }
                 }
+                Insn::Load { out, opnd } |
+                Insn::LoadInto { dest: out, opnd } => {
+                    *opnd = split_stack_membase(asm, *opnd, SCRATCH0_OPND, &stack_state);
+                    let mem_out = split_memory_write(out, SCRATCH0_OPND);
+                    asm.push_insn(insn);
+                    if let Some(mem_out) = mem_out {
+                        asm.store(mem_out, SCRATCH0_OPND.with_num_bits(mem_out.rm_num_bits()));
+                    }
+                }
                 // Convert Opnd::const_ptr into Opnd::Mem. This split is done here to give
                 // a register for compile_exits.
                 &mut Insn::IncrCounter { mem, value } => {
@@ -487,17 +578,19 @@ impl Assembler {
                     asm.load_into(SCRATCH0_OPND, mem);
                     asm.incr_counter(Opnd::mem(64, SCRATCH0_OPND, 0), value);
                 }
+                &mut Insn::Mov { dest, src } => {
+                    asm_mov(asm, dest, src, SCRATCH0_OPND);
+                }
                 // Resolve ParallelMov that couldn't be handled without a scratch register.
                 Insn::ParallelMov { moves } => {
                     for (dst, src) in Self::resolve_parallel_moves(&moves, Some(SCRATCH0_OPND)).unwrap() {
-                        asm.mov(dst, src)
+                        asm_mov(asm, dst, src, SCRATCH0_OPND);
                     }
                 }
                 // Handle various operand combinations for spills on compile_exits.
                 &mut Insn::Store { dest, src } => {
-                    let Opnd::Mem(Mem { num_bits, .. }) = dest else {
-                        panic!("Unexpected Insn::Store destination in x86_scratch_split: {dest:?}");
-                    };
+                    let num_bits = dest.rm_num_bits();
+                    let dest = split_stack_membase(asm, dest, SCRATCH1_OPND, &stack_state);
 
                     let src = match src {
                         Opnd::Reg(_) => src,
@@ -541,7 +634,7 @@ impl Assembler {
             }
         }
 
-        asm
+        asm_local
     }
 
     /// Emit platform-specific machine code