diff --git a/depend b/depend index 63e73a56390056..1f9f0c31eba695 100644 --- a/depend +++ b/depend @@ -7382,8 +7382,10 @@ jit.$(OBJEXT): $(CCAN_DIR)/str/str.h jit.$(OBJEXT): $(hdrdir)/ruby/ruby.h jit.$(OBJEXT): $(top_srcdir)/internal/array.h jit.$(OBJEXT): $(top_srcdir)/internal/basic_operators.h +jit.$(OBJEXT): $(top_srcdir)/internal/bits.h jit.$(OBJEXT): $(top_srcdir)/internal/class.h jit.$(OBJEXT): $(top_srcdir)/internal/compilers.h +jit.$(OBJEXT): $(top_srcdir)/internal/fixnum.h jit.$(OBJEXT): $(top_srcdir)/internal/gc.h jit.$(OBJEXT): $(top_srcdir)/internal/imemo.h jit.$(OBJEXT): $(top_srcdir)/internal/namespace.h diff --git a/doc/contributing/concurrency_guide.md b/doc/contributing/concurrency_guide.md new file mode 100644 index 00000000000000..1fb58f7203ad8f --- /dev/null +++ b/doc/contributing/concurrency_guide.md @@ -0,0 +1,154 @@ +# Concurrency Guide + +This is a guide to thinking about concurrency in the cruby source code, whether that's contributing to Ruby +by writing C or contributing to one of the JITs. This does not touch on native extensions, only the core +language. It will go over: + +* What needs synchronizing? +* How to use the VM lock, and what you can and can't do when you've acquired this lock. +* What you can and can't do when you've acquired other native locks. +* The difference between the VM lock and the GVL. +* What a VM barrier is and when to use it. +* The lock ordering of some important locks. +* How ruby interrupt handling works. +* The timer thread and what it's responsible for. + +## What needs synchronizing? + +Before ractors, only one ruby thread could run at once. That didn't mean you could forget about concurrency issues, though. The timer thread +is a native thread that interacts with other ruby threads and changes some VM internals, so any change that both the timer +thread and a ruby thread can make in parallel needs to be synchronized. + +When you add ractors to the mix, it gets more complicated. However, ractors allow you to forget about synchronization for non-shareable objects, because +they aren't used across ractors: only one ruby thread can touch such an object at a time. Shareable objects are deeply frozen, so there isn't any +mutation on the objects themselves. However, something like reading/writing constants across ractors does need to be synchronized, because ruby threads need to +see a consistent view of the VM. If publishing an update takes two steps, or even two separate instructions as it does here, synchronization is required. + +Most synchronization is to protect VM internals. These internals include structures for the thread scheduler on each ractor, the global ractor scheduler, the +coordination between ruby threads and ractors, global tables (for `fstrings`, encodings, symbols and global vars), etc. Anything that can be mutated by one ractor +while being read or mutated by another ractor at the same time requires proper synchronization. + +## The VM Lock + +There's only one VM lock, and it is for critical sections that can only be entered by one ractor at a time. +Without ractors, the VM lock is useless. Note that it does not stop all ractors from running, as ractors can run +without trying to acquire this lock. If you're updating global (shared) data between ractors and aren't using +atomics, you need to use a lock, and this is a convenient one to use. Unlike other locks, you can allocate ruby-managed +memory with it held.
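+To make this concrete, here is a minimal, hypothetical sketch (not code from the tree) of updating a
+ractor-shared table under the VM lock. It assumes the `RB_VM_LOCK_ENTER()`/`RB_VM_LOCK_LEAVE()` macros
+from `vm_sync.h`; the lock API has shifted over time, so check `vm_sync.h` for the current spelling.
+`example_table` and the helper function are made up:
+
+```c
+#include "ruby/ruby.h"   /* VALUE, rb_ary_new_from_args */
+#include "ruby/st.h"     /* st_table, st_insert */
+#include "vm_sync.h"     /* RB_VM_LOCK_ENTER / RB_VM_LOCK_LEAVE */
+
+/* Hypothetical global table shared across ractors. */
+static st_table *example_table;
+
+static void
+example_register(VALUE key, VALUE val)
+{
+    RB_VM_LOCK_ENTER();
+    {
+        /* Allowed here (and only because no other native lock is held):
+         * creating objects and allocating ruby-managed memory. */
+        VALUE pair = rb_ary_new_from_args(2, key, val);
+        st_insert(example_table, (st_data_t)key, (st_data_t)pair);
+    }
+    RB_VM_LOCK_LEAVE();
+}
+```
+
+The rules below spell out exactly what is and isn't allowed inside such a critical section.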
When you take the VM lock, there are things you can and can't do during your critical section: + +You can (as long as no other native locks were taken before the VM lock): + +* Create ruby objects, call `ruby_xmalloc`, etc. + +You can't: + +* Context switch to another ruby thread or ractor. This is important, as many things can cause ruby-level context switches, including: + + * Calling any ruby method through, for example, `rb_funcall`. If you execute ruby code, a context switch could happen. + This also applies to ruby methods defined in C, as they can be redefined in Ruby. Things that call ruby methods, such as + `rb_obj_respond_to`, are also disallowed. + + * Calling `rb_raise`. This will call `initialize` on the new exception object. With the VM lock + held, nothing you call should be able to raise an exception. `NoMemoryError` is allowed, however. + + * Calling `rb_nogvl` or a ruby-level mechanism that can context switch, like `rb_mutex_lock`. + + * Entering any blocking operation managed by ruby. This will context switch to another ruby thread using `rb_nogvl` or + something equivalent. A blocking operation is one that blocks the thread's progress, such as `sleep` or `IO#read`. + +Internally, the VM lock is the `vm->ractor.sync.lock`. + +You need to be on a ruby thread to take the VM lock. You also can't take it inside any functions that could be called during sweeping, as MMTk sweeps +on another thread and you need a valid `ec` to grab the lock. For this same reason (among others), you can't take it from the timer thread either. + +## Other Locks + +All native locks that aren't the VM lock share a stricter set of rules for what's allowed during the critical section. By native locks, we mean +anything that uses `rb_native_mutex_lock`. Some important locks include the `interrupt_lock`, the ractor scheduling lock (protects global scheduling data structures), +the thread scheduling lock (local to each ractor, protects per-ractor scheduling data structures) and the ractor lock (local to each ractor, protects ractor data structures). + +When you acquire one of these locks: + +You can: + +* Allocate memory through non-ruby allocation such as raw `malloc` or the standard library. But be careful: some functions, like `strdup`, use +ruby allocation through macros! + +* Use `ccan` lists, as they don't allocate. + +* Do the usual things like set variables or struct fields, manipulate linked lists, signal condition variables, etc. + +You can't: + +* Allocate ruby-managed memory. This includes creating ruby objects or using `ruby_xmalloc` or `st_insert`. The reason this +is disallowed: if that allocation causes a GC, then all other ruby threads must join a VM barrier as soon as possible +(when they next check interrupts or acquire the VM lock) so that no other ractors are running during GC. If a ruby thread +is waiting (blocked) on this same native lock, it can't join the barrier, and a deadlock occurs because the barrier will never finish. + +* Raise exceptions. You also can't use `EC_JUMP_TAG` if it jumps out of the critical section. + +* Context switch. See the `VM Lock` section for more info.
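+Putting these rules together, a typical critical section under a native lock is short and allocation-free.
+In this sketch, the lock, the list and `struct example_waiter` are all hypothetical;
+`rb_native_mutex_lock`/`rb_native_mutex_unlock` and the `ccan_list` helpers are the real primitives
+(check `thread_native.h` and `ccan/list/list.h` for the exact names):
+
+```c
+#include "ruby/thread_native.h"  /* rb_nativethread_lock_t, rb_native_mutex_* */
+#include "ccan/list/list.h"      /* ccan_list_head, ccan_list_add_tail */
+
+struct example_waiter {
+    struct ccan_list_node node;
+    int fd;
+};
+
+/* Hypothetical bookkeeping guarded by a native lock. */
+static rb_nativethread_lock_t example_lock;
+static struct ccan_list_head example_waiters = CCAN_LIST_HEAD_INIT(example_waiters);
+
+static void
+example_push_waiter(struct example_waiter *w)
+{
+    rb_native_mutex_lock(&example_lock);
+    {
+        /* Fine: plain field writes and ccan list manipulation; neither allocates. */
+        ccan_list_add_tail(&example_waiters, &w->node);
+        /* Not fine: rb_ary_new(), ruby_xmalloc(), st_insert(), rb_raise(), or
+         * anything else that can allocate ruby-managed memory or jump out. */
+    }
+    rb_native_mutex_unlock(&example_lock);
+}
+```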
## Difference Between VM Lock and GVL + +The VM lock is a particular lock in the source code, and there is only one of it. The GVL, on the other hand, is more of a combination of locks. +It is "acquired" when a ruby thread is about to run or is running. Since many ruby threads can run at the same time if they're in different ractors, +there are many GVLs (1 per `SNT` + 1 for the main ractor). It can no longer be thought of as a "Global VM Lock" like it once was before ractors. + +## VM Barriers + +Sometimes, taking the VM lock isn't enough and you need a guarantee that all ractors have stopped. This happens when running `GC`, for instance. +To get a barrier, you take the VM lock and call `rb_vm_barrier()`. For the duration that the VM lock is held, no other ractors will be running. It's not used +often, as taking a barrier slows ractor performance down considerably, but it's useful to know about and is sometimes the only solution. + +## Lock Orderings + +It's a good idea not to hold more than two locks at once on the same thread. Holding multiple locks can introduce deadlocks, so do it with care. When locking +multiple locks at once, follow an ordering that is consistent across the program; otherwise you can introduce deadlocks. Here are the orderings of some important locks: + +* VM lock before ractor_sched_lock +* thread_sched_lock before ractor_sched_lock +* interrupt_lock before timer_th.waiting_lock +* timer_th.waiting_lock before ractor_sched_lock + +These orderings are subject to change, so check the source if you're not sure. On top of this: + +* The VM lock can be taken around a call to a `ubf` (unblock) function in some circumstances. This happens during VM shutdown, for example. +See the "Ruby Interrupt Handling" section for more details. + +## Ruby Interrupt Handling + +When the VM runs ruby code, ruby's threads intermittently check ruby-level interrupts. These software interrupts +are used for various things in ruby, and they can be set by other ruby threads or the timer thread. + +* Ruby threads check when they should give up their timeslice. The native thread switches to another ruby thread when their time is up. +* The timer thread sends a "trap" interrupt to the main thread if any ruby-level signal handlers are pending. +* Ruby threads can have other ruby threads run tasks for them by sending them an interrupt. For instance, ractors send +the main thread an interrupt when they need to `require` a file so that it's done on the main thread. They wait for the +main thread's result. +* During VM shutdown, a "terminate" interrupt is sent to all ractor main threads to stop them as soon as possible. +* When calling `Thread#raise`, the caller sends an interrupt to that thread telling it which exception to raise. +* Unlocking a mutex sends the next waiter (if any) an interrupt telling it to grab the lock. +* Signalling or broadcasting on a condition variable tells the waiter(s) to wake up. + +This isn't a complete list. + +When you send an interrupt to a ruby thread, that thread may be blocked. For example, it could be in the middle of a `TCPSocket#read` call. If so, +the receiving thread's `ubf` (unblock function) gets called from the thread (ruby thread or timer thread) that sent the interrupt. +Each ruby thread has a `ubf` that is set when it enters a blocking operation and is unset after returning from it. By default, this `ubf` function sends +`SIGVTALRM` to the receiving thread to try to unblock it from the kernel so it can check its interrupts. There are other `ubfs` that +aren't associated with a syscall, such as when calling `Ractor#join` or `sleep`. All `ubfs` are called with the `interrupt_lock` held, +so take that into account when using locks inside `ubfs`. + +Remember, `ubfs` can be called from the timer thread, so you cannot assume an `ec` inside them. The `ec` (execution context) is only set on ruby threads.
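+For a sense of what "checking interrupts" looks like on the C side, long-running VM code polls with the
+`RUBY_VM_CHECK_INTS()` macro from `vm_core.h`. The loop and its unit of work below are hypothetical:
+
+```c
+#include "vm_core.h"  /* rb_execution_context_t, RUBY_VM_CHECK_INTS */
+
+/* Hypothetical unit of work. */
+static void
+example_do_step(long i)
+{
+    (void)i;
+}
+
+/* Without the periodic check, this ruby thread could never give up its
+ * timeslice, run a pending trap handler or honor Thread#raise until the
+ * whole loop finished. */
+static void
+example_long_loop(rb_execution_context_t *ec, long n)
+{
+    for (long i = 0; i < n; i++) {
+        example_do_step(i);
+        RUBY_VM_CHECK_INTS(ec);  /* may context switch or raise here */
+    }
+}
+```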
## The Timer Thread + +The timer thread has a few responsibilities: + +* Send interrupts to ruby threads that have run for their whole timeslice. +* Wake up M:N ruby threads (threads in non-main ractors) blocked on IO or after a specified timeout. This +uses `kqueue` or `epoll`, depending on the OS, to receive IO events on behalf of the threads. +* Keep sending the `SIGVTALRM` signal if a thread is still blocked on a syscall after the first `ubf` call. +* Signal native threads (`SNT`s) waiting on a ractor if there are ractors waiting in the global run queue. +* Create more `SNT`s if some are blocked, like on IO or on `Ractor#join`. diff --git a/insns.def b/insns.def index 8225d1cceaf97e..ce358da28575ed 100644 --- a/insns.def +++ b/insns.def @@ -846,6 +846,7 @@ send (CALL_DATA cd, ISEQ blockiseq) (...) (VALUE val) +// attr bool zjit_profile = true; // attr rb_snum_t sp_inc = sp_inc_of_sendish(cd->ci); // attr rb_snum_t comptime_sp_inc = sp_inc_of_sendish(ci); { diff --git a/jit.c b/jit.c index 0b491f0481d875..b7cb05d1c34efd 100644 --- a/jit.c +++ b/jit.c @@ -14,6 +14,7 @@ #include "iseq.h" #include "internal/gc.h" #include "vm_sync.h" +#include "internal/fixnum.h" // Field offsets for the RObject struct enum robject_offsets { @@ -720,3 +721,9 @@ rb_jit_icache_invalidate(void *start, void *end) #error No instruction cache clear available with this compiler on Aarch64! #endif } + +VALUE +rb_jit_fix_mod_fix(VALUE recv, VALUE obj) +{ + return rb_fix_mod_fix(recv, obj); +} diff --git a/yjit.c b/yjit.c index 598fe5716704d0..d0ab367b1c7bb1 100644 --- a/yjit.c +++ b/yjit.c @@ -332,12 +332,6 @@ rb_yjit_fix_div_fix(VALUE recv, VALUE obj) return rb_fix_div_fix(recv, obj); } -VALUE -rb_yjit_fix_mod_fix(VALUE recv, VALUE obj) -{ - return rb_fix_mod_fix(recv, obj); -} - // Return non-zero when `obj` is an array and its last item is a // `ruby2_keywords` hash. We don't support this kind of splat.
size_t diff --git a/yjit/bindgen/src/main.rs b/yjit/bindgen/src/main.rs index 0d4d57e0695941..2b4f48d73ec4bd 100644 --- a/yjit/bindgen/src/main.rs +++ b/yjit/bindgen/src/main.rs @@ -367,7 +367,7 @@ fn main() { .allowlist_function("rb_yarv_ary_entry_internal") .allowlist_function("rb_yjit_ruby2_keywords_splat_p") .allowlist_function("rb_yjit_fix_div_fix") - .allowlist_function("rb_yjit_fix_mod_fix") + .allowlist_function("rb_jit_fix_mod_fix") .allowlist_function("rb_FL_TEST") .allowlist_function("rb_FL_TEST_RAW") .allowlist_function("rb_RB_TYPE_P") diff --git a/yjit/src/cruby.rs b/yjit/src/cruby.rs index 36baecd5358031..0d9e3b74dad874 100644 --- a/yjit/src/cruby.rs +++ b/yjit/src/cruby.rs @@ -199,7 +199,7 @@ pub use rb_get_call_data_ci as get_call_data_ci; pub use rb_yarv_str_eql_internal as rb_str_eql_internal; pub use rb_yarv_ary_entry_internal as rb_ary_entry_internal; pub use rb_yjit_fix_div_fix as rb_fix_div_fix; -pub use rb_yjit_fix_mod_fix as rb_fix_mod_fix; +pub use rb_jit_fix_mod_fix as rb_fix_mod_fix; pub use rb_FL_TEST as FL_TEST; pub use rb_FL_TEST_RAW as FL_TEST_RAW; pub use rb_RB_TYPE_P as RB_TYPE_P; diff --git a/yjit/src/cruby_bindings.inc.rs b/yjit/src/cruby_bindings.inc.rs index 0a14a699284268..74661e7ade9bf8 100644 --- a/yjit/src/cruby_bindings.inc.rs +++ b/yjit/src/cruby_bindings.inc.rs @@ -1142,7 +1142,6 @@ extern "C" { pub fn rb_ary_unshift_m(argc: ::std::os::raw::c_int, argv: *mut VALUE, ary: VALUE) -> VALUE; pub fn rb_yjit_rb_ary_subseq_length(ary: VALUE, beg: ::std::os::raw::c_long) -> VALUE; pub fn rb_yjit_fix_div_fix(recv: VALUE, obj: VALUE) -> VALUE; - pub fn rb_yjit_fix_mod_fix(recv: VALUE, obj: VALUE) -> VALUE; pub fn rb_yjit_ruby2_keywords_splat_p(obj: VALUE) -> usize; pub fn rb_yjit_splat_varg_checks( sp: *mut VALUE, @@ -1275,4 +1274,5 @@ extern "C" { start: *mut ::std::os::raw::c_void, end: *mut ::std::os::raw::c_void, ); + pub fn rb_jit_fix_mod_fix(recv: VALUE, obj: VALUE) -> VALUE; } diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index 1f04e61dbc9757..f7b335f1bfce89 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -398,6 +398,7 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio Insn::FixnumGe { left, right } => gen_fixnum_ge(asm, opnd!(left), opnd!(right)), Insn::FixnumAnd { left, right } => gen_fixnum_and(asm, opnd!(left), opnd!(right)), Insn::FixnumOr { left, right } => gen_fixnum_or(asm, opnd!(left), opnd!(right)), + &Insn::FixnumMod { left, right, state } => gen_fixnum_mod(jit, asm, opnd!(left), opnd!(right), &function.frame_state(state)), Insn::IsNil { val } => gen_isnil(asm, opnd!(val)), &Insn::IsMethodCfunc { val, cd, cfunc, state: _ } => gen_is_method_cfunc(jit, asm, opnd!(val), cd, cfunc), &Insn::IsBitEqual { left, right } => gen_is_bit_equal(asm, opnd!(left), opnd!(right)), @@ -411,7 +412,8 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio // Give up CCallWithFrame for 7+ args since asm.ccall() doesn't support it. Insn::CCallWithFrame { cd, state, args, .. } if args.len() > C_ARG_OPNDS.len() => gen_send_without_block(jit, asm, *cd, &function.frame_state(*state), SendFallbackReason::CCallWithFrameTooManyArgs), - Insn::CCallWithFrame { cfunc, args, cme, state, .. } => gen_ccall_with_frame(jit, asm, *cfunc, opnds!(args), *cme, &function.frame_state(*state)), + Insn::CCallWithFrame { cfunc, args, cme, state, blockiseq, .. 
} => + gen_ccall_with_frame(jit, asm, *cfunc, opnds!(args), *cme, *blockiseq, &function.frame_state(*state)), Insn::CCallVariadic { cfunc, recv, args, name: _, cme, state, return_type: _, elidable: _ } => { gen_ccall_variadic(jit, asm, *cfunc, opnd!(recv), opnds!(args), *cme, &function.frame_state(*state)) } @@ -446,7 +448,6 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio &Insn::LoadIvarExtended { self_val, id, index } => gen_load_ivar_extended(asm, opnd!(self_val), id, index), &Insn::ArrayMax { state, .. } | &Insn::FixnumDiv { state, .. } - | &Insn::FixnumMod { state, .. } | &Insn::Throw { state, .. } => return Err(state), }; @@ -673,20 +674,36 @@ fn gen_patch_point(jit: &mut JITState, asm: &mut Assembler, invariant: &Invarian } /// Generate code for a C function call that pushes a frame -fn gen_ccall_with_frame(jit: &mut JITState, asm: &mut Assembler, cfunc: *const u8, args: Vec, cme: *const rb_callable_method_entry_t, state: &FrameState) -> lir::Opnd { +fn gen_ccall_with_frame( + jit: &mut JITState, + asm: &mut Assembler, + cfunc: *const u8, + args: Vec, + cme: *const rb_callable_method_entry_t, + blockiseq: Option, + state: &FrameState, +) -> lir::Opnd { gen_incr_counter(asm, Counter::non_variadic_cfunc_optimized_send_count); - gen_prepare_non_leaf_call(jit, asm, state); + let caller_stack_size = state.stack_size() - args.len(); + + // Can't use gen_prepare_non_leaf_call() because we need to adjust the SP + // to account for the receiver and arguments (and block arguments if any) + gen_prepare_call_with_gc(asm, state, false); + gen_save_sp(asm, caller_stack_size); + gen_spill_stack(jit, asm, state); + gen_spill_locals(jit, asm, state); gen_push_frame(asm, args.len(), state, ControlFrame { recv: args[0], iseq: None, cme, frame_type: VM_FRAME_MAGIC_CFUNC | VM_FRAME_FLAG_CFRAME | VM_ENV_FLAG_LOCAL, + block_iseq: blockiseq, }); asm_comment!(asm, "switch to new SP register"); - let sp_offset = (state.stack().len() - args.len() + VM_ENV_DATA_SIZE.as_usize()) * SIZEOF_VALUE; + let sp_offset = (caller_stack_size + VM_ENV_DATA_SIZE.as_usize()) * SIZEOF_VALUE; let new_sp = asm.add(SP, sp_offset.into()); asm.mov(SP, new_sp); @@ -738,6 +755,7 @@ fn gen_ccall_variadic( iseq: None, cme, frame_type: VM_FRAME_MAGIC_CFUNC | VM_FRAME_FLAG_CFRAME | VM_ENV_FLAG_LOCAL, + block_iseq: None, }); asm_comment!(asm, "switch to new SP register"); @@ -1130,6 +1148,7 @@ fn gen_send_without_block_direct( iseq: Some(iseq), cme, frame_type: VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL, + block_iseq: None, }); asm_comment!(asm, "switch to new SP register"); @@ -1441,6 +1460,13 @@ fn gen_fixnum_or(asm: &mut Assembler, left: lir::Opnd, right: lir::Opnd) -> lir: asm.or(left, right) } +fn gen_fixnum_mod(jit: &mut JITState, asm: &mut Assembler, left: lir::Opnd, right: lir::Opnd, state: &FrameState) -> lir::Opnd { + // Check for left % 0, which raises ZeroDivisionError + asm.cmp(right, Opnd::from(VALUE::fixnum_from_usize(0))); + asm.je(side_exit(jit, state, FixnumModByZero)); + asm_ccall!(asm, rb_fix_mod_fix, left, right) +} + // Compile val == nil fn gen_isnil(asm: &mut Assembler, val: lir::Opnd) -> lir::Opnd { asm.cmp(val, Qnil.into()); @@ -1719,6 +1745,7 @@ struct ControlFrame { iseq: Option, cme: *const rb_callable_method_entry_t, frame_type: u32, + block_iseq: Option, } /// Compile an interpreter frame @@ -1735,9 +1762,20 @@ fn gen_push_frame(asm: &mut Assembler, argc: usize, state: &FrameState, frame: C }; let ep_offset = state.stack().len() as i32 + local_size - argc as i32 + 
VM_ENV_DATA_SIZE as i32 - 1; asm.store(Opnd::mem(64, SP, (ep_offset - 2) * SIZEOF_VALUE_I32), VALUE::from(frame.cme).into()); + + let block_handler_opnd = if let Some(block_iseq) = frame.block_iseq { + // Change cfp->block_code in the current frame. See vm_caller_setup_arg_block(). + // VM_CFP_TO_CAPTURED_BLOCK does &cfp->self, rb_captured_block->code.iseq aliases + // with cfp->block_code. + asm.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_BLOCK_CODE), VALUE::from(block_iseq).into()); + let cfp_self_addr = asm.lea(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)); + asm.or(cfp_self_addr, Opnd::Imm(1)) + } else { + VM_BLOCK_HANDLER_NONE.into() + }; + // ep[-1]: block_handler or prev EP - // block_handler is not supported for now - asm.store(Opnd::mem(64, SP, (ep_offset - 1) * SIZEOF_VALUE_I32), VM_BLOCK_HANDLER_NONE.into()); + asm.store(Opnd::mem(64, SP, (ep_offset - 1) * SIZEOF_VALUE_I32), block_handler_opnd); // ep[0]: ENV_FLAGS asm.store(Opnd::mem(64, SP, ep_offset * SIZEOF_VALUE_I32), frame.frame_type.into()); diff --git a/zjit/src/cruby.rs b/zjit/src/cruby.rs index 5f4eac1db5ed9e..a84e408861fc54 100644 --- a/zjit/src/cruby.rs +++ b/zjit/src/cruby.rs @@ -134,6 +134,7 @@ unsafe extern "C" { pub fn rb_str_setbyte(str: VALUE, index: VALUE, value: VALUE) -> VALUE; pub fn rb_str_getbyte(str: VALUE, index: VALUE) -> VALUE; pub fn rb_vm_splat_array(flag: VALUE, ary: VALUE) -> VALUE; + pub fn rb_jit_fix_mod_fix(x: VALUE, y: VALUE) -> VALUE; pub fn rb_vm_concat_array(ary1: VALUE, ary2st: VALUE) -> VALUE; pub fn rb_vm_get_special_object(reg_ep: *const VALUE, value_type: vm_special_object_type) -> VALUE; pub fn rb_vm_concat_to_array(ary1: VALUE, ary2st: VALUE) -> VALUE; @@ -219,6 +220,7 @@ pub use rb_vm_ci_kwarg as vm_ci_kwarg; pub use rb_METHOD_ENTRY_VISI as METHOD_ENTRY_VISI; pub use rb_RCLASS_ORIGIN as RCLASS_ORIGIN; pub use rb_vm_get_special_object as vm_get_special_object; +pub use rb_jit_fix_mod_fix as rb_fix_mod_fix; /// Helper so we can get a Rust string for insn_name() pub fn insn_name(opcode: usize) -> String { diff --git a/zjit/src/cruby_bindings.inc.rs b/zjit/src/cruby_bindings.inc.rs index c67e229a8009e7..af604661b299b3 100644 --- a/zjit/src/cruby_bindings.inc.rs +++ b/zjit/src/cruby_bindings.inc.rs @@ -680,32 +680,33 @@ pub const YARVINSN_trace_setlocal_WC_1: ruby_vminsn_type = 215; pub const YARVINSN_trace_putobject_INT2FIX_0_: ruby_vminsn_type = 216; pub const YARVINSN_trace_putobject_INT2FIX_1_: ruby_vminsn_type = 217; pub const YARVINSN_zjit_getinstancevariable: ruby_vminsn_type = 218; -pub const YARVINSN_zjit_opt_send_without_block: ruby_vminsn_type = 219; -pub const YARVINSN_zjit_objtostring: ruby_vminsn_type = 220; -pub const YARVINSN_zjit_opt_nil_p: ruby_vminsn_type = 221; -pub const YARVINSN_zjit_opt_plus: ruby_vminsn_type = 222; -pub const YARVINSN_zjit_opt_minus: ruby_vminsn_type = 223; -pub const YARVINSN_zjit_opt_mult: ruby_vminsn_type = 224; -pub const YARVINSN_zjit_opt_div: ruby_vminsn_type = 225; -pub const YARVINSN_zjit_opt_mod: ruby_vminsn_type = 226; -pub const YARVINSN_zjit_opt_eq: ruby_vminsn_type = 227; -pub const YARVINSN_zjit_opt_neq: ruby_vminsn_type = 228; -pub const YARVINSN_zjit_opt_lt: ruby_vminsn_type = 229; -pub const YARVINSN_zjit_opt_le: ruby_vminsn_type = 230; -pub const YARVINSN_zjit_opt_gt: ruby_vminsn_type = 231; -pub const YARVINSN_zjit_opt_ge: ruby_vminsn_type = 232; -pub const YARVINSN_zjit_opt_ltlt: ruby_vminsn_type = 233; -pub const YARVINSN_zjit_opt_and: ruby_vminsn_type = 234; -pub const YARVINSN_zjit_opt_or: ruby_vminsn_type = 235; -pub const 
YARVINSN_zjit_opt_aref: ruby_vminsn_type = 236; -pub const YARVINSN_zjit_opt_aset: ruby_vminsn_type = 237; -pub const YARVINSN_zjit_opt_length: ruby_vminsn_type = 238; -pub const YARVINSN_zjit_opt_size: ruby_vminsn_type = 239; -pub const YARVINSN_zjit_opt_empty_p: ruby_vminsn_type = 240; -pub const YARVINSN_zjit_opt_succ: ruby_vminsn_type = 241; -pub const YARVINSN_zjit_opt_not: ruby_vminsn_type = 242; -pub const YARVINSN_zjit_opt_regexpmatch2: ruby_vminsn_type = 243; -pub const VM_INSTRUCTION_SIZE: ruby_vminsn_type = 244; +pub const YARVINSN_zjit_send: ruby_vminsn_type = 219; +pub const YARVINSN_zjit_opt_send_without_block: ruby_vminsn_type = 220; +pub const YARVINSN_zjit_objtostring: ruby_vminsn_type = 221; +pub const YARVINSN_zjit_opt_nil_p: ruby_vminsn_type = 222; +pub const YARVINSN_zjit_opt_plus: ruby_vminsn_type = 223; +pub const YARVINSN_zjit_opt_minus: ruby_vminsn_type = 224; +pub const YARVINSN_zjit_opt_mult: ruby_vminsn_type = 225; +pub const YARVINSN_zjit_opt_div: ruby_vminsn_type = 226; +pub const YARVINSN_zjit_opt_mod: ruby_vminsn_type = 227; +pub const YARVINSN_zjit_opt_eq: ruby_vminsn_type = 228; +pub const YARVINSN_zjit_opt_neq: ruby_vminsn_type = 229; +pub const YARVINSN_zjit_opt_lt: ruby_vminsn_type = 230; +pub const YARVINSN_zjit_opt_le: ruby_vminsn_type = 231; +pub const YARVINSN_zjit_opt_gt: ruby_vminsn_type = 232; +pub const YARVINSN_zjit_opt_ge: ruby_vminsn_type = 233; +pub const YARVINSN_zjit_opt_ltlt: ruby_vminsn_type = 234; +pub const YARVINSN_zjit_opt_and: ruby_vminsn_type = 235; +pub const YARVINSN_zjit_opt_or: ruby_vminsn_type = 236; +pub const YARVINSN_zjit_opt_aref: ruby_vminsn_type = 237; +pub const YARVINSN_zjit_opt_aset: ruby_vminsn_type = 238; +pub const YARVINSN_zjit_opt_length: ruby_vminsn_type = 239; +pub const YARVINSN_zjit_opt_size: ruby_vminsn_type = 240; +pub const YARVINSN_zjit_opt_empty_p: ruby_vminsn_type = 241; +pub const YARVINSN_zjit_opt_succ: ruby_vminsn_type = 242; +pub const YARVINSN_zjit_opt_not: ruby_vminsn_type = 243; +pub const YARVINSN_zjit_opt_regexpmatch2: ruby_vminsn_type = 244; +pub const VM_INSTRUCTION_SIZE: ruby_vminsn_type = 245; pub type ruby_vminsn_type = u32; pub type rb_iseq_callback = ::std::option::Option< unsafe extern "C" fn(arg1: *const rb_iseq_t, arg2: *mut ::std::os::raw::c_void), diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs index 370ed568579e0c..7083a082fba1a8 100644 --- a/zjit/src/hir.rs +++ b/zjit/src/hir.rs @@ -468,6 +468,7 @@ pub enum SideExitReason { BlockParamProxyModified, BlockParamProxyNotIseqOrIfunc, StackOverflow, + FixnumModByZero, } #[derive(Debug, Clone, Copy)] @@ -668,6 +669,7 @@ pub enum Insn { state: InsnId, return_type: Type, elidable: bool, + blockiseq: Option, }, /// Call a variadic C function with signature: func(int argc, VALUE *argv, VALUE recv) @@ -1063,11 +1065,14 @@ impl<'a> std::fmt::Display for InsnPrinter<'a> { } Ok(()) }, - Insn::CCallWithFrame { cfunc, args, name, .. } => { + Insn::CCallWithFrame { cfunc, args, name, blockiseq, .. } => { write!(f, "CCallWithFrame {}@{:p}", name.contents_lossy(), self.ptr_map.map_ptr(cfunc))?; for arg in args { write!(f, ", {arg}")?; } + if let Some(blockiseq) = blockiseq { + write!(f, ", block={:p}", self.ptr_map.map_ptr(blockiseq))?; + } Ok(()) }, Insn::CCallVariadic { cfunc, recv, args, name, .. 
} => { @@ -1598,7 +1603,17 @@ impl Function { &ObjectAlloc { val, state } => ObjectAlloc { val: find!(val), state }, &ObjectAllocClass { class, state } => ObjectAllocClass { class, state: find!(state) }, &CCall { cfunc, ref args, name, return_type, elidable } => CCall { cfunc, args: find_vec!(args), name, return_type, elidable }, - &CCallWithFrame { cd, cfunc, ref args, cme, name, state, return_type, elidable } => CCallWithFrame { cd, cfunc, args: find_vec!(args), cme, name, state: find!(state), return_type, elidable }, + &CCallWithFrame { cd, cfunc, ref args, cme, name, state, return_type, elidable, blockiseq } => CCallWithFrame { + cd, + cfunc, + args: find_vec!(args), + cme, + name, + state: find!(state), + return_type, + elidable, + blockiseq, + }, &CCallVariadic { cfunc, recv, ref args, cme, name, state, return_type, elidable } => CCallVariadic { cfunc, recv: find!(recv), args: find_vec!(args), cme, name, state, return_type, elidable }, @@ -2134,7 +2149,7 @@ impl Function { } } // This doesn't actually optimize Send yet, just replaces the fallback reason to be more precise. - // TODO: Optimize Send + // The actual optimization is done in reduce_send_to_ccall. Insn::Send { recv, cd, state, .. } => { let frame_state = self.frame_state(state); let klass = if let Some(klass) = self.type_of(recv).runtime_exact_ruby_class() { @@ -2338,8 +2353,111 @@ impl Function { fun.push_insn(block, Insn::PatchPoint { invariant: Invariant::MethodRedefined { klass: recv_class, method: method_id, cme: method }, state }); } - // Try to reduce one SendWithoutBlock to a CCall - fn reduce_to_ccall( + // Try to reduce a Send insn to a CCallWithFrame + fn reduce_send_to_ccall( + fun: &mut Function, + block: BlockId, + self_type: Type, + send: Insn, + send_insn_id: InsnId, + ) -> Result<(), ()> { + let Insn::Send { mut recv, cd, blockiseq, mut args, state, .. } = send else { + return Err(()); + }; + + let call_info = unsafe { (*cd).ci }; + let argc = unsafe { vm_ci_argc(call_info) }; + let method_id = unsafe { rb_vm_ci_mid(call_info) }; + + // If we have info about the class of the receiver + let (recv_class, profiled_type) = if let Some(class) = self_type.runtime_exact_ruby_class() { + (class, None) + } else { + let iseq_insn_idx = fun.frame_state(state).insn_idx; + let Some(recv_type) = fun.profiled_type_of_at(recv, iseq_insn_idx) else { return Err(()) }; + (recv_type.class(), Some(recv_type)) + }; + + // Do method lookup + let method: *const rb_callable_method_entry_struct = unsafe { rb_callable_method_entry(recv_class, method_id) }; + if method.is_null() { + return Err(()); + } + + // Filter for C methods + let def_type = unsafe { get_cme_def_type(method) }; + if def_type != VM_METHOD_TYPE_CFUNC { + return Err(()); + } + + // Find the `argc` (arity) of the C method, which describes the parameters it expects + let cfunc = unsafe { get_cme_def_body_cfunc(method) }; + let cfunc_argc = unsafe { get_mct_argc(cfunc) }; + match cfunc_argc { + 0.. => { + // (self, arg0, arg1, ..., argc) form + // + // Bail on argc mismatch + if argc != cfunc_argc as u32 { + return Err(()); + } + + let ci_flags = unsafe { vm_ci_flag(call_info) }; + + // When seeing &block argument, fall back to dynamic dispatch for now + // TODO: Support block forwarding + if ci_flags & VM_CALL_ARGS_BLOCKARG != 0 { + return Err(()); + } + + // Commit to the replacement. Put PatchPoint. 
+ gen_patch_points_for_optimized_ccall(fun, block, recv_class, method_id, method, state); + if recv_class.instance_can_have_singleton_class() { + fun.push_insn(block, Insn::PatchPoint { invariant: Invariant::NoSingletonClass { klass: recv_class }, state }); + } + + if let Some(profiled_type) = profiled_type { + // Guard receiver class + recv = fun.push_insn(block, Insn::GuardType { val: recv, guard_type: Type::from_profiled_type(profiled_type), state }); + fun.insn_types[recv.0] = fun.infer_type(recv); + } + + let blockiseq = if blockiseq.is_null() { None } else { Some(blockiseq) }; + + // Emit a call + let cfunc = unsafe { get_mct_func(cfunc) }.cast(); + let mut cfunc_args = vec![recv]; + cfunc_args.append(&mut args); + + let ccall = fun.push_insn(block, Insn::CCallWithFrame { + cd, + cfunc, + args: cfunc_args, + cme: method, + name: method_id, + state, + return_type: types::BasicObject, + elidable: false, + blockiseq, + }); + fun.make_equal_to(send_insn_id, ccall); + return Ok(()); + } + // Variadic method + -1 => { + // func(int argc, VALUE *argv, VALUE recv) + return Err(()); + } + -2 => { + // (self, args_ruby_array) + return Err(()); + } + _ => unreachable!("unknown cfunc kind: argc={argc}") + } + } + + // Try to reduce a SendWithoutBlock insn to a CCall/CCallWithFrame + fn reduce_send_without_block_to_ccall( fun: &mut Function, block: BlockId, self_type: Type, @@ -2440,7 +2558,17 @@ impl Function { if get_option!(stats) { count_not_inlined_cfunc(fun, block, method); } - let ccall = fun.push_insn(block, Insn::CCallWithFrame { cd, cfunc, args: cfunc_args, cme: method, name: method_id, state, return_type, elidable }); + let ccall = fun.push_insn(block, Insn::CCallWithFrame { + cd, + cfunc, + args: cfunc_args, + cme: method, + name: method_id, + state, + return_type, + elidable, + blockiseq: None, + }); fun.make_equal_to(send_insn_id, ccall); } @@ -2555,11 +2683,21 @@ impl Function { let old_insns = std::mem::take(&mut self.blocks[block.0].insns); assert!(self.blocks[block.0].insns.is_empty()); for insn_id in old_insns { - if let send @ Insn::SendWithoutBlock { recv, .. } = self.find(insn_id) { - let recv_type = self.type_of(recv); - if reduce_to_ccall(self, block, recv_type, send, insn_id).is_ok() { - continue; + let send = self.find(insn_id); + match send { + send @ Insn::SendWithoutBlock { recv, .. } => { + let recv_type = self.type_of(recv); + if reduce_send_without_block_to_ccall(self, block, recv_type, send, insn_id).is_ok() { + continue; + } + } + send @ Insn::Send { recv, .. 
} => { + let recv_type = self.type_of(recv); + if reduce_send_to_ccall(self, block, recv_type, send, insn_id).is_ok() { + continue; + } } + _ => {} } self.push_insn_id(block, insn_id); } @@ -12583,6 +12721,108 @@ mod opt_tests { "); } + #[test] + fn test_optimize_send_with_block() { + eval(r#" + def test = [1, 2, 3].map { |x| x * 2 } + test; test + "#); + assert_snapshot!(hir_string("test"), @r" + fn test@:2: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + Jump bb2(v1) + bb1(v4:BasicObject): + EntryPoint JIT(0) + Jump bb2(v4) + bb2(v6:BasicObject): + v10:ArrayExact[VALUE(0x1000)] = Const Value(VALUE(0x1000)) + v12:ArrayExact = ArrayDup v10 + PatchPoint MethodRedefined(Array@0x1008, map@0x1010, cme:0x1018) + PatchPoint NoSingletonClass(Array@0x1008) + v23:BasicObject = CCallWithFrame map@0x1040, v12, block=0x1048 + CheckInterrupts + Return v23 + "); + } + + #[test] + fn test_do_not_optimize_send_variadic_with_block() { + eval(r#" + def test = [1, 2, 3].index { |x| x == 2 } + test; test + "#); + assert_snapshot!(hir_string("test"), @r" + fn test@:2: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + Jump bb2(v1) + bb1(v4:BasicObject): + EntryPoint JIT(0) + Jump bb2(v4) + bb2(v6:BasicObject): + v10:ArrayExact[VALUE(0x1000)] = Const Value(VALUE(0x1000)) + v12:ArrayExact = ArrayDup v10 + v14:BasicObject = Send v12, 0x1008, :index + CheckInterrupts + Return v14 + "); + } + + #[test] + fn test_do_not_optimize_send_with_block_forwarding() { + eval(r#" + def test(&block) = [].map(&block) + test; test + "#); + assert_snapshot!(hir_string("test"), @r" + fn test@:2: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + v2:BasicObject = GetLocal l0, SP@4 + Jump bb2(v1, v2) + bb1(v5:BasicObject, v6:BasicObject): + EntryPoint JIT(0) + Jump bb2(v5, v6) + bb2(v8:BasicObject, v9:BasicObject): + v14:ArrayExact = NewArray + GuardBlockParamProxy l0 + v17:HeapObject[BlockParamProxy] = Const Value(VALUE(0x1000)) + v19:BasicObject = Send v14, 0x1008, :map, v17 + CheckInterrupts + Return v19 + "); + } + + #[test] + fn test_do_not_optimize_send_to_iseq_method_with_block() { + eval(r#" + def foo + yield 1 + end + + def test = foo {} + test; test + "#); + assert_snapshot!(hir_string("test"), @r" + fn test@:6: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + Jump bb2(v1) + bb1(v4:BasicObject): + EntryPoint JIT(0) + Jump bb2(v4) + bb2(v6:BasicObject): + v11:BasicObject = Send v6, 0x1000, :foo + CheckInterrupts + Return v11 + "); + } + #[test] fn test_inline_attr_reader_constant() { eval(" diff --git a/zjit/src/profile.rs b/zjit/src/profile.rs index e935ec9731f383..a6c837df5a48ff 100644 --- a/zjit/src/profile.rs +++ b/zjit/src/profile.rs @@ -83,7 +83,7 @@ fn profile_insn(bare_opcode: ruby_vminsn_type, ec: EcPtr) { YARVINSN_opt_length => profile_operands(profiler, profile, 1), YARVINSN_opt_size => profile_operands(profiler, profile, 1), YARVINSN_opt_succ => profile_operands(profiler, profile, 1), - YARVINSN_opt_send_without_block => { + YARVINSN_opt_send_without_block | YARVINSN_send => { let cd: *const rb_call_data = profiler.insn_opnd(0).as_ptr(); let argc = unsafe { vm_ci_argc((*cd).ci) }; // Profile all the arguments and self (+1). diff --git a/zjit/src/stats.rs b/zjit/src/stats.rs index 50f6e61f5c242e..33f29fb3aaed22 100644 --- a/zjit/src/stats.rs +++ b/zjit/src/stats.rs @@ -137,6 +137,7 @@ make_counters! 
{ exit_fixnum_add_overflow, exit_fixnum_sub_overflow, exit_fixnum_mult_overflow, + exit_fixnum_mod_by_zero, exit_guard_type_failure, exit_guard_type_not_failure, exit_guard_bit_equals_failure, @@ -332,6 +333,7 @@ pub fn side_exit_counter(reason: crate::hir::SideExitReason) -> Counter { FixnumAddOverflow => exit_fixnum_add_overflow, FixnumSubOverflow => exit_fixnum_sub_overflow, FixnumMultOverflow => exit_fixnum_mult_overflow, + FixnumModByZero => exit_fixnum_mod_by_zero, GuardType(_) => exit_guard_type_failure, GuardTypeNot(_) => exit_guard_type_not_failure, GuardBitEquals(_) => exit_guard_bit_equals_failure,