diff --git a/docs/envvars.rst b/docs/envvars.rst
index 1e040b4c3e..89c6d2abe5 100644
--- a/docs/envvars.rst
+++ b/docs/envvars.rst
@@ -458,6 +458,12 @@ JAX Triton Extensions
    :Default: ``0``
    :Description: Raise a ``RuntimeError`` when the installed JAX is too old to safely run ``TritonAutotunedKernelCall`` (`jax-ml/jax#35218 <https://github.com/jax-ml/jax/pull/35218>`_) instead of silently falling back to non-autotuned dispatch. Useful for CI or debugging to ensure Triton autotuning is active. When set to ``0`` (default), old JAX versions silently fall back to single-config (non-autotuned) kernel dispatch for compatibility.
 
+.. envvar:: NVTE_TRITON_PERMUTATION_BLOCK_SIZES
+
+   :Type: comma-separated list of ``int`` (e.g. ``"128"`` or ``"64,128,256"``)
+   :Default: ``"64,128,256,512,1024,2048,4096"`` (the full sweep)
+   :Description: Override the ``BLOCK_SIZE`` configs evaluated by ``triton.autotune`` for the MoE permutation kernels in ``transformer_engine/common/triton/permutation.py`` (``_permute_kernel``, ``_unpermute_kernel``, ``_unpermute_bwd_with_merging_probs_kernel``, ``_sort_chunks_by_map_kernel``). The default 7-config sweep yields the best runtime on production shapes but costs ~1-5 s of MLIR→LLVM→PTX→cubin compile per config-per-kernel on a cold start (≈2-5 min total per backend, serialized on a single GPU). Set to a single value (e.g. ``"128"``) to skip autotuning entirely for tests / CI where correctness -- not throughput -- is the goal. Must be a comma-separated list of positive ints; malformed values raise ``ValueError`` at kernel-registration time. **Do NOT set this in production runs** -- you will lose autotuned performance.
+
 Examples
 --------
 
diff --git a/qa/L0_jax_distributed_unittest/test.sh b/qa/L0_jax_distributed_unittest/test.sh
index 3f25816600..34dd3d8df4 100644
--- a/qa/L0_jax_distributed_unittest/test.sh
+++ b/qa/L0_jax_distributed_unittest/test.sh
@@ -37,6 +37,33 @@ wait
 TE_PATH=$TE_PATH bash $TE_PATH/examples/jax/collective_gemm/run_test_cgemm.sh || test_fail "run_test_cgemm.sh"
 wait
 
+# MoE custom_vjp distributed (Level 2 smoke + Level 3 perf). Single-host
+# multi-GPU; requires >=4 visible GPUs.
+#
+# Flags required for this file (mirrored in tests/jax/run_distributed_moe_vjp.sh):
+#
+# * ``-p no:typeguard`` — jaxtyping's pytest plugin auto-loads typeguard,
+#   whose @typechecked import hook materialises JAX tracers via isinstance()
+#   checks during shard_map tracing. We disable it only here (other jax tests
+#   need it for type-hint validation).
+# * ``XLA_PYTHON_CLIENT_PREALLOCATE=false`` + ``MEM_FRACTION=0.5`` —
+#   prevents NCCL OOM during EP all-to-all communicator setup (default 90%
+#   preallocation leaves no room).
+# * ``CUDA_LAUNCH_BLOCKING=1`` — workaround for an async-dispatch hang
+#   between Triton custom_calls with ``input_output_aliases`` and the
+#   downstream NCCL ragged_all_to_all in this test's bwd path. Without it,
+#   MainThread parks in _pjit_call_impl_python and one GPU pins at 100%
+#   forever. With it, the smoke suite passes in <1 min. See
+#   ``tests/jax/test_distributed_moe_vjp.py`` module docstring for the
+#   bisection record and TODO for the proper fix.
+XLA_PYTHON_CLIENT_PREALLOCATE=false XLA_PYTHON_CLIENT_MEM_FRACTION=0.5 \
+    CUDA_LAUNCH_BLOCKING=1 \
+    python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v -s \
+    -p no:typeguard \
+    --junitxml=$XML_LOG_DIR/pytest_test_distributed_moe_vjp.xml \
+    $TE_PATH/tests/jax/test_distributed_moe_vjp.py || test_fail "test_distributed_moe_vjp.py"
+wait
+
 if [ $RET -ne 0 ]; then
     echo "Error: some sub-tests failed: $FAILED_CASES"
     exit 1
diff --git a/tests/jax/conftest.py b/tests/jax/conftest.py
index db30f0ed39..4627b9ff5b 100644
--- a/tests/jax/conftest.py
+++ b/tests/jax/conftest.py
@@ -86,6 +86,20 @@ def pytest_sessionfinish(self, session, exitstatus):
         print("=" * 80)
 
 
+def pytest_addoption(parser):
+    """CLI options for multiprocess JAX tests.
+
+    Mirrors examples/jax/encoder/conftest.py so multiprocess tests in
+    tests/jax/ can be launched one-process-per-GPU via a sibling shell
+    script. Required by tests/jax/test_multiprocess_moe_vjp.py to work
+    around the JAX/XLA + lazy Triton kernel load + active NCCL deadlock
+    documented in past_JAX_XLA_deadlock.txt and nvbug/5564750. Harmless
+    for other tests; defaults to 0 (= "not a multiprocess launch").
+    """
+    parser.addoption("--num-process", action="store", default=0)
+    parser.addoption("--process-id", action="store", default=0)
+
+
 def pytest_configure(config):
     config.addinivalue_line(
         "markers",
diff --git a/tests/jax/repro_distributed_triton_hang.py b/tests/jax/repro_distributed_triton_hang.py
new file mode 100644
index 0000000000..c21033203d
--- /dev/null
+++ b/tests/jax/repro_distributed_triton_hang.py
@@ -0,0 +1,364 @@
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Distributed (2x2 ep,fsdp) bisection repro for the TRITON moe hang.
+
+Run on a >=4-GPU node:
+
+    TRITON_PRINT_AUTOTUNING=1 \
+    JAX_LOG_COMPILES=1 \
+    python tests/jax/repro_distributed_triton_hang.py 2>&1 \
+        | tee dist_triton_repro.log
+
+Each phase logs ``[t=...]`` BEFORE executing, so the last printed line
+identifies the hang's call site. Phase ordering:
+
+  1. mesh / axis_rules sanity check (no kernels)
+  2. PURE_JAX block forward (should pass -- confirms wiring)
+  3. TRITON dispatch helpers in isolation under shard_map
+       3a. make_row_id_map only
+       3b. permute_with_mask_map only
+       3c. ragged_all_to_all only
+       3d. sort_chunks_by_map only
+       3e. unpermute_with_mask_map only
+  4. TRITON full forward, eager
+  5. TRITON full forward, jit'd
+  6. TRITON full forward, jit'd  with rerun (cache hit -- should be fast)
+
+A hang in 3a/3b/3e isolates the bug to a single triton kernel; in 4/5
+it's a higher-level orchestration (shard_map spec mismatch, recv buffer
+sizing, etc.).
+"""
+
+import os
+import sys
+import time
+
+import jax
+import jax.numpy as jnp
+from jax.experimental import mesh_utils
+from jax.experimental.shard_map import shard_map
+from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
+from flax.linen import partitioning as nn_partitioning
+
+
+def _log(msg: str) -> None:
+    sys.stdout.write(f"[t={time.monotonic():.2f}s] {msg}\n")
+    sys.stdout.flush()
+
+
+def main():
+    EP_AXIS, FSDP_AXIS = "ep", "fsdp"
+    EP, FSDP = 2, 2
+
+    if jax.device_count() < EP * FSDP:
+        _log(f"FATAL: need >={EP*FSDP} devices, have {jax.device_count()}")
+        return
+
+    devices = mesh_utils.create_device_mesh((EP, FSDP))
+    mesh = Mesh(devices, axis_names=(EP_AXIS, FSDP_AXIS))
+    _log(f"mesh built: {mesh}")
+
+    DTYPE = jnp.bfloat16
+    # Shapes are configurable via env vars so this script can run against
+    # either the original 8x32x64 "medium" repro shape or against the
+    # ``test_distributed_moe_vjp.py`` smoke shape (4x16x32) that exposes
+    # the May-2026 in-process triton hang.
+    #
+    # Defaults match the smoke test exactly so a one-line ``python3
+    # tests/jax/repro_distributed_triton_hang.py`` invocation reproduces
+    # the hang. Set REPRO_SHAPE=medium to revert to the original.
+    _shape = os.environ.get("REPRO_SHAPE", "smoke").lower()
+    if _shape == "medium":
+        BATCH = EP * FSDP * 2  # 8 -- two micro-batches per device
+        SEQ, HIDDEN, INTER = 32, 64, 128
+        E, K = 8, 2
+    elif _shape == "smoke":
+        BATCH = EP * FSDP  # 4 -- one micro-batch per device (smoke test)
+        SEQ, HIDDEN, INTER = 16, 32, 64
+        E, K = 4, 2
+    else:
+        raise ValueError(
+            f"REPRO_SHAPE={_shape!r}; expected one of 'smoke' (smoke-test"
+            " parity, exposes hang) or 'medium' (legacy)."
+        )
+    _log(
+        f"shape config: REPRO_SHAPE={_shape} BATCH={BATCH} SEQ={SEQ}"
+        f" HIDDEN={HIDDEN} INTER={INTER} E={E} K={K}"
+    )
+    LOGICAL_AXIS_RULES = (
+        ("exp", EP_AXIS),
+        ("embed", FSDP_AXIS),
+        ("mlp", None),
+        ("batch", (EP_AXIS, FSDP_AXIS)),
+    )
+
+    from transformer_engine.jax.flax import _MoEBlock as MoEBlock
+    from transformer_engine.jax.moe import PermutationBackend
+    from transformer_engine.jax.sharding import MeshResource, global_shard_guard
+    from transformer_engine.jax.triton_extensions.permutation import (
+        make_row_id_map,
+        permute_with_mask_map,
+        unpermute_with_mask_map,
+        sort_chunks_by_map,
+        make_chunk_sort_map,
+    )
+
+    _log("imports done")
+
+    def _make_block(backend):
+        return MoEBlock(
+            num_experts=E,
+            num_experts_per_tok=K,
+            intermediate_size=INTER,
+            permutation_backend=backend,
+            data_parallelism_axes=(FSDP_AXIS,),
+            dtype=DTYPE,
+            _align_size=0,
+        )
+
+    x = jax.random.normal(jax.random.PRNGKey(0), (BATCH, SEQ, HIDDEN), dtype=DTYPE)
+
+    # -----------------------------------------------------------------
+    # Phase 1: open mesh / axis_rules context. No kernels.
+    # -----------------------------------------------------------------
+    _log("phase 1: open mesh + axis_rules + MeshResource (no kernels)")
+    with mesh, global_shard_guard(
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        pass
+    _log("phase 1: done")
+
+    # -----------------------------------------------------------------
+    # Phase 2: PURE_JAX block forward (sanity).
+    # -----------------------------------------------------------------
+    _log("phase 2: PURE_JAX block forward (jit'd)")
+    block_pj = _make_block(PermutationBackend.PURE_JAX)
+    with mesh, global_shard_guard(
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x_sh = jax.lax.with_sharding_constraint(
+            x, NamedSharding(mesh, P((EP_AXIS, FSDP_AXIS), None, None))
+        )
+        variables = jax.jit(block_pj.init)(jax.random.PRNGKey(1), x_sh)
+        out_pj, _ = jax.jit(block_pj.apply)(variables, x_sh)
+        out_pj.block_until_ready()
+    _log(f"phase 2: done -- out_pj.shape={out_pj.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 3: TRITON dispatch primitives in isolation under shard_map.
+    # Shapes per shard: each shard owns BATCH/(EP*FSDP)=2 batches of SEQ
+    # tokens, so num_tokens_per_shard = 2*32 = 64, num_out_tokens = 128.
+    # -----------------------------------------------------------------
+    T_per_shard = (BATCH // (EP * FSDP)) * SEQ  # 2*32 = 64
+    NUM_OUT_PER_SHARD = T_per_shard * K  # 128
+    _log(f"phase 3 prep: per-shard T={T_per_shard} num_out={NUM_OUT_PER_SHARD} H={HIDDEN} E={E}")
+
+    rng = jax.random.PRNGKey(42)
+    rng_r, rng_x = jax.random.split(rng)
+    # Build a fake routing map sharded over batch.
+    routing_map_full = jax.random.bernoulli(rng_r, p=K / E, shape=(BATCH * SEQ, E))
+    x_2d_full = jax.random.normal(rng_x, (BATCH * SEQ, HIDDEN), dtype=DTYPE)
+
+    spec_batch = P((EP_AXIS, FSDP_AXIS), None)
+
+    with mesh, global_shard_guard(
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        routing_map_full = jax.lax.with_sharding_constraint(
+            routing_map_full, NamedSharding(mesh, spec_batch)
+        )
+        x_2d_full = jax.lax.with_sharding_constraint(x_2d_full, NamedSharding(mesh, spec_batch))
+
+        # --- 3a: make_row_id_map per shard ---
+        _log("phase 3a: make_row_id_map under shard_map (jit'd)")
+
+        @jax.jit
+        def _fn_3a(rmap):
+            def body(rmap_local):
+                return make_row_id_map(rmap_local, T_per_shard, E)
+
+            return shard_map(
+                body,
+                mesh=mesh,
+                in_specs=(spec_batch,),
+                out_specs=spec_batch,
+                check_rep=False,
+            )(rmap)
+
+        row_id_map_full = _fn_3a(routing_map_full)
+        row_id_map_full.block_until_ready()
+        _log(f"phase 3a: done -- row_id_map_full.shape={row_id_map_full.shape}")
+
+        # --- 3b: permute_with_mask_map per shard ---
+        _log("phase 3b: permute_with_mask_map under shard_map (jit'd)")
+
+        @jax.jit
+        def _fn_3b(x2d, rmap_ids):
+            def body(x2d_l, rmap_ids_l):
+                sorted_x, _ = permute_with_mask_map(
+                    x2d_l,
+                    rmap_ids_l,
+                    None,
+                    T_per_shard,
+                    E,
+                    NUM_OUT_PER_SHARD,
+                    HIDDEN,
+                )
+                return sorted_x
+
+            return shard_map(
+                body,
+                mesh=mesh,
+                in_specs=(spec_batch, spec_batch),
+                out_specs=spec_batch,
+                check_rep=False,
+            )(x2d, rmap_ids)
+
+        sorted_x_full = _fn_3b(x_2d_full, row_id_map_full)
+        sorted_x_full.block_until_ready()
+        _log(f"phase 3b: done -- sorted_x_full.shape={sorted_x_full.shape}")
+
+        # --- 3c: ragged_all_to_all on a tiny payload ---
+        _log("phase 3c: ragged_all_to_all under shard_map (jit'd)")
+        recv_rows = NUM_OUT_PER_SHARD * EP  # worst case
+        send_sizes = jnp.full((EP,), NUM_OUT_PER_SHARD // EP, dtype=jnp.int32)
+        send_offsets = jnp.cumsum(
+            jnp.concatenate([jnp.array([0], dtype=jnp.int32), send_sizes[:-1]])
+        )
+        recv_sizes = send_sizes.copy()
+        recv_offsets = jnp.cumsum(
+            jnp.concatenate([jnp.array([0], dtype=jnp.int32), recv_sizes[:-1]])
+        )
+
+        @jax.jit
+        def _fn_3c(sx):
+            def body(sx_l):
+                recv = jnp.zeros((recv_rows, HIDDEN), dtype=sx_l.dtype)
+                return jax.lax.ragged_all_to_all(
+                    sx_l,
+                    recv,
+                    send_offsets,
+                    send_sizes,
+                    recv_offsets,
+                    recv_sizes,
+                    axis_name=EP_AXIS,
+                )
+
+            return shard_map(
+                body,
+                mesh=mesh,
+                in_specs=spec_batch,
+                out_specs=P((EP_AXIS, FSDP_AXIS), None),
+                check_rep=False,
+            )(sx)
+
+        sx_recv_full = _fn_3c(sorted_x_full)
+        sx_recv_full.block_until_ready()
+        _log(f"phase 3c: done -- sx_recv_full.shape={sx_recv_full.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 4: TRITON block forward, eager (no jit). Reuse pure_jax's
+    # variables for shape parity.
+    # -----------------------------------------------------------------
+    _log("phase 4: TRITON block forward (eager, reuses PURE_JAX init)")
+    block_tr = _make_block(PermutationBackend.TRITON)
+    with mesh, global_shard_guard(
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x_sh = jax.lax.with_sharding_constraint(
+            x, NamedSharding(mesh, P((EP_AXIS, FSDP_AXIS), None, None))
+        )
+        out_tr, _ = block_tr.apply(variables, x_sh)
+        out_tr.block_until_ready()
+    _log(f"phase 4: done -- out_tr.shape={out_tr.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 4b: TRITON block forward with TRITON init (this is what
+    # the actual failing test does -- jit(block.init) traces the moe
+    # forward with permutation_backend=TRITON and may compile a
+    # different graph than jit(block.apply) does later).
+    # -----------------------------------------------------------------
+    _log("phase 4b: jit(block_tr.init) -- this is the first thing the failing test does")
+    with mesh, global_shard_guard(
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x_sh = jax.lax.with_sharding_constraint(
+            x, NamedSharding(mesh, P((EP_AXIS, FSDP_AXIS), None, None))
+        )
+        variables_tr = jax.jit(block_tr.init)(jax.random.PRNGKey(1), x_sh)
+        jax.tree.map(
+            lambda v: v.value.block_until_ready() if hasattr(v, "value") else v.block_until_ready(),
+            variables_tr,
+        )
+    _log("phase 4b: done")
+
+    # -----------------------------------------------------------------
+    # Phase 4c: TRITON block apply using TRITON-initialised variables
+    # (i.e. the exact second call the failing test makes).
+    # -----------------------------------------------------------------
+    _log("phase 4c: jit(block_tr.apply)(variables_tr, x) -- the failing test's apply call")
+    with mesh, global_shard_guard(
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x_sh = jax.lax.with_sharding_constraint(
+            x, NamedSharding(mesh, P((EP_AXIS, FSDP_AXIS), None, None))
+        )
+        out_tr_init, _ = jax.jit(block_tr.apply)(variables_tr, x_sh)
+        out_tr_init.block_until_ready()
+    _log(f"phase 4c: done -- out_tr_init.shape={out_tr_init.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 4d: EXACTLY mirror what the failing test does -- init +
+    # apply inside the same `with` block, then do assertion-style
+    # access (jnp.isfinite + .item()) OUTSIDE the mesh/axis_rules
+    # context. The .item() forces compute under no active mesh.
+    # -----------------------------------------------------------------
+    _log("phase 4d: init+apply inside `with`, .item() OUTSIDE the with-block")
+    block_tr2 = _make_block(PermutationBackend.TRITON)
+    with mesh, global_shard_guard(
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x_sh = jax.lax.with_sharding_constraint(
+            x, NamedSharding(mesh, P((EP_AXIS, FSDP_AXIS), None, None))
+        )
+        variables_tr2 = jax.jit(block_tr2.init)(jax.random.PRNGKey(11), x_sh)
+        out_4d, aux_4d = jax.jit(block_tr2.apply)(variables_tr2, x_sh)
+    _log("phase 4d: with-block exited; now doing .item() on isfinite outside ctx")
+    finite = jnp.all(jnp.isfinite(out_4d)).item()
+    _log(f"phase 4d: done -- finite={finite} aux_4d_is_none={aux_4d is None}")
+
+    # -----------------------------------------------------------------
+    # Phase 5: TRITON block forward, jit'd.
+    # -----------------------------------------------------------------
+    _log("phase 5: TRITON block forward (jit'd)")
+    with mesh, global_shard_guard(
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x_sh = jax.lax.with_sharding_constraint(
+            x, NamedSharding(mesh, P((EP_AXIS, FSDP_AXIS), None, None))
+        )
+        out_tr2, _ = jax.jit(block_tr.apply)(variables, x_sh)
+        out_tr2.block_until_ready()
+    _log(f"phase 5: done -- out_tr2.shape={out_tr2.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 6: TRITON block forward, jit'd, rerun (cache hit).
+    # -----------------------------------------------------------------
+    _log("phase 6: TRITON block forward (rerun, expect cache hit)")
+    with mesh, global_shard_guard(
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x_sh = jax.lax.with_sharding_constraint(
+            x, NamedSharding(mesh, P((EP_AXIS, FSDP_AXIS), None, None))
+        )
+        out_tr3, _ = jax.jit(block_tr.apply)(variables, x_sh)
+        out_tr3.block_until_ready()
+    _log(f"phase 6: done -- out_tr3.shape={out_tr3.shape}")
+
+    _log("ALL PHASES DONE")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/jax/repro_triton_hang.py b/tests/jax/repro_triton_hang.py
new file mode 100644
index 0000000000..69c3ddd90a
--- /dev/null
+++ b/tests/jax/repro_triton_hang.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Minimal standalone repro to bisect where the TRITON moe path hangs.
+
+Run with:
+
+    TRITON_PRINT_AUTOTUNING=1 \
+    JAX_LOG_COMPILES=1 \
+    python tests/jax/repro_triton_hang.py 2>&1 | tee triton_repro.log
+
+Each phase prints its name BEFORE running so a hang's culprit is
+obvious from the last printed line.
+"""
+
+import os
+import sys
+import time
+
+import jax
+import jax.numpy as jnp
+
+
+def _log(msg: str) -> None:
+    sys.stdout.write(f"[t={time.monotonic():.2f}s] {msg}\n")
+    sys.stdout.flush()
+
+
+def main():
+    DTYPE = jnp.float32
+    BATCH, SEQ, H, M = 2, 16, 32, 64
+    E, K = 8, 2
+    T = BATCH * SEQ
+
+    _log("imports: starting")
+    from transformer_engine.jax.moe import PermutationBackend, moe
+    from transformer_engine.jax.triton_extensions.permutation import (
+        make_row_id_map,
+        permute_with_mask_map,
+        unpermute_with_mask_map,
+    )
+    from transformer_engine.jax import cpp_extensions as tex
+
+    _log("imports: done")
+
+    key = jax.random.PRNGKey(0)
+    kp, kx = jax.random.split(key)
+    x = jax.random.normal(kx, (BATCH, SEQ, H), dtype=DTYPE)
+    init = jax.nn.initializers.variance_scaling(1.0, "fan_in", "truncated_normal")
+    kg, kw0, kw1, kwo = jax.random.split(kp, 4)
+    gate_kernel = init(kg, (H, E), DTYPE)
+    wi_0 = init(kw0, (E, H, M), DTYPE)
+    wi_1 = init(kw1, (E, H, M), DTYPE)
+    wo = init(kwo, (E, M, H), DTYPE)
+    _log(f"shapes: x={x.shape} gate={gate_kernel.shape} wi0={wi_0.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 1: make_row_id_map only.
+    # -----------------------------------------------------------------
+    _log("phase 1: make_row_id_map (eager, no jit)")
+    routing_map = jnp.zeros((T, E), dtype=jnp.bool_)
+    routing_map = routing_map.at[jnp.arange(T), jnp.arange(T) % E].set(True)
+    routing_map = routing_map.at[jnp.arange(T), (jnp.arange(T) + 1) % E].set(True)
+    row_id_map = make_row_id_map(routing_map, T, E)
+    row_id_map.block_until_ready()
+    _log(f"phase 1: done -- row_id_map.shape={row_id_map.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 2: permute_with_mask_map only.
+    # -----------------------------------------------------------------
+    _log("phase 2: permute_with_mask_map (eager, no jit)")
+    x_2d = x.reshape(T, H)
+    sorted_x, _ = permute_with_mask_map(x_2d, row_id_map, None, T, E, T * K, H)
+    sorted_x.block_until_ready()
+    _log(f"phase 2: done -- sorted_x.shape={sorted_x.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 3: unpermute_with_mask_map only.
+    # -----------------------------------------------------------------
+    _log("phase 3: unpermute_with_mask_map (eager, no jit)")
+    merging = jnp.ones((T, E), dtype=DTYPE) * (1.0 / K)
+    out_2d, _ = unpermute_with_mask_map(sorted_x, row_id_map, merging, None, T, E, H)
+    out_2d.block_until_ready()
+    _log(f"phase 3: done -- out_2d.shape={out_2d.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 4: grouped_quantize + grouped_gemm only (FFN building blocks
+    # -- these are shared with the pure_jax path so they should be fine,
+    # but worth measuring in isolation).
+    # -----------------------------------------------------------------
+    _log("phase 4: grouped_quantize + grouped_gemm (eager)")
+    from transformer_engine.jax.quantize import noop_quantizer_set, TensorUsage
+
+    group_sizes = jnp.full((E,), T * K // E, dtype=jnp.int32)
+    cs = tex.grouped_quantize(sorted_x, noop_quantizer_set.x, group_sizes, flatten_axis=-1)
+    cw = tex.grouped_quantize(wi_0, noop_quantizer_set.kernel, flatten_axis=-1)
+    out = tex.grouped_gemm(
+        cs.get_tensor(usage=TensorUsage.LHS),
+        cw.get_tensor(usage=TensorUsage.RHS),
+        contracting_dims=((1,), (1,)),
+    )
+    out.block_until_ready()
+    _log(f"phase 4: done -- out.shape={out.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 5: full TRITON forward, eager (no jit, no grad).
+    # -----------------------------------------------------------------
+    _log("phase 5: full triton forward (eager, no jit, no grad)")
+    out_te, _ = moe(
+        x,
+        gate_kernel,
+        wi_0,
+        wi_1,
+        wo,
+        num_experts=E,
+        num_experts_per_tok=K,
+        activation_type="silu",
+        score_function="softmax",
+        use_pre_softmax=False,
+        scaling_factor=1.0,
+        aux_loss_coeff=0.0,
+        permutation_backend=PermutationBackend.TRITON,
+        align_size=0,
+        dtype=DTYPE,
+    )
+    out_te.block_until_ready()
+    _log(f"phase 5: done -- out_te.shape={out_te.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 6: jit'd forward.
+    # -----------------------------------------------------------------
+    _log("phase 6: full triton forward (jit'd)")
+
+    @jax.jit
+    def _fwd(x, gate_kernel, wi_0, wi_1, wo):
+        return moe(
+            x,
+            gate_kernel,
+            wi_0,
+            wi_1,
+            wo,
+            num_experts=E,
+            num_experts_per_tok=K,
+            activation_type="silu",
+            score_function="softmax",
+            use_pre_softmax=False,
+            scaling_factor=1.0,
+            aux_loss_coeff=0.0,
+            permutation_backend=PermutationBackend.TRITON,
+            align_size=0,
+            dtype=DTYPE,
+        )
+
+    out_te2, _ = _fwd(x, gate_kernel, wi_0, wi_1, wo)
+    out_te2.block_until_ready()
+    _log(f"phase 6: done -- out_te2.shape={out_te2.shape}")
+
+    # -----------------------------------------------------------------
+    # Phase 7: jit'd grad (this is what test_grads_finite_and_nonzero hits).
+    # -----------------------------------------------------------------
+    _log("phase 7: jit'd grad of mean(out**2)")
+
+    @jax.jit
+    def _grad_loss(x, gate_kernel, wi_0, wi_1, wo):
+        def loss(*args):
+            o, _ = moe(
+                *args,
+                num_experts=E,
+                num_experts_per_tok=K,
+                activation_type="silu",
+                score_function="softmax",
+                use_pre_softmax=False,
+                scaling_factor=1.0,
+                aux_loss_coeff=0.0,
+                permutation_backend=PermutationBackend.TRITON,
+                align_size=0,
+                dtype=DTYPE,
+            )
+            return jnp.mean(o**2)
+
+        return jax.grad(loss, argnums=(1, 2, 3, 4))(x, gate_kernel, wi_0, wi_1, wo)
+
+    g_gate, g_wi0, g_wi1, g_wo = _grad_loss(x, gate_kernel, wi_0, wi_1, wo)
+    g_gate.block_until_ready()
+    _log(f"phase 7: done -- g_gate.shape={g_gate.shape}")
+
+    _log("ALL PHASES DONE")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/jax/run_distributed_moe_vjp.sh b/tests/jax/run_distributed_moe_vjp.sh
new file mode 100755
index 0000000000..c22af874fe
--- /dev/null
+++ b/tests/jax/run_distributed_moe_vjp.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+#
+# Dev-loop convenience wrapper for the distributed MoE VJP tests.
+#
+# This is NOT the CI invocation -- CI uses
+# ``qa/L0_jax_distributed_unittest/test.sh`` which calls pytest with the
+# exact same flags as this script. Keep the two in sync: any flag added
+# here for correctness (``-p no:typeguard``, env vars, etc.) MUST also
+# appear in the QA script and vice versa.
+#
+# Usage from the TransformerEngine repo root (or any cwd; this script
+# resolves its own path):
+#
+#   # All tests (smoke + perf):
+#   bash tests/jax/run_distributed_moe_vjp.sh
+#
+#   # Just smoke (Level 2 correctness):
+#   bash tests/jax/run_distributed_moe_vjp.sh smoke
+#
+#   # Just perf (Level 3 throughput):
+#   bash tests/jax/run_distributed_moe_vjp.sh perf
+#
+#   # A single test by name pattern (passed through to pytest -k):
+#   bash tests/jax/run_distributed_moe_vjp.sh test_fwd_and_bwd_smoke
+#
+# Required environment / flags (mirrored from
+# qa/L0_jax_distributed_unittest/test.sh):
+#
+#   * XLA_PYTHON_CLIENT_PREALLOCATE=false / MEM_FRACTION=0.5 -- prevents
+#     NCCL OOM during EP all-to-all setup. JAX's default 90% HBM
+#     preallocation leaves no room for the communicator.
+#   * ``-p no:typeguard`` -- jaxtyping's pytest plugin auto-loads
+#     typeguard, whose @typechecked import hook materialises JAX tracers
+#     via isinstance() checks and deadlocks the first ``block.apply`` of
+#     the triton backend inside shard_map + ragged_all_to_all. See
+#     CLAUDE.md and the test module docstring for the bisection record.
+#
+# Optional environment knobs (dev-only; CI does not need these):
+#
+#   CUDA_VISIBLE_DEVICES -- defaults to "0,1,2,3".
+#   PYTEST_EXTRA_ARGS    -- appended verbatim to the pytest invocation,
+#                           e.g. PYTEST_EXTRA_ARGS="--maxfail=1 -x" or
+#                           PYTEST_EXTRA_ARGS="-k 'fwd_and_bwd_smoke[triton]'".
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TE_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+TEST_FILE="$TE_ROOT/tests/jax/test_distributed_moe_vjp.py"
+PYTEST_INI="$TE_ROOT/tests/jax/pytest.ini"
+
+mode="${1:-all}"
+
+case "$mode" in
+    smoke|level2)
+        marker_args=("-m" "triton and not slow")
+        kfilter=()
+        ;;
+    perf|level3)
+        marker_args=("-m" "triton and slow")
+        kfilter=()
+        ;;
+    all)
+        marker_args=("-m" "triton")
+        kfilter=()
+        ;;
+    *)
+        marker_args=("-m" "triton")
+        kfilter=("-k" "$mode")
+        ;;
+esac
+
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
+export XLA_PYTHON_CLIENT_PREALLOCATE="${XLA_PYTHON_CLIENT_PREALLOCATE:-false}"
+export XLA_PYTHON_CLIENT_MEM_FRACTION="${XLA_PYTHON_CLIENT_MEM_FRACTION:-0.5}"
+
+# CUDA_LAUNCH_BLOCKING=1 forces every CUDA launch synchronous. Without
+# this the bwd of the triton backend hangs: MainThread parks in
+# _pjit_call_impl_python, one GPU pinned at 100%, no NCCL ops enqueued.
+# The root cause is an XLA dependency edge that is mis-tracked between
+# our Triton kernels (which use input_output_aliases on a pre-zeroed
+# output buffer) and the downstream NCCL ragged_all_to_all -- the
+# all_to_all launches before the triton kernel finishes writing
+# sorted_inputs, and different ranks then read different versions of
+# the per-expert token counts, deadlocking NCCL. Empirically:
+# CUDA_LAUNCH_BLOCKING=1 makes the smoke suite pass in <1 min across 3
+# consecutive runs; without it, all triton experiments hang at 300+ s.
+# Slowdown for these correctness tests is ~negligible (small shapes).
+# TODO(teddy/moe_block): replace this workaround with a fix in
+# triton_call_lowering (or moe.py inserting an explicit data-dep edge)
+# and remove this export.
+export CUDA_LAUNCH_BLOCKING="${CUDA_LAUNCH_BLOCKING:-1}"
+
+echo "============================================================"
+echo "MoE VJP distributed tests (dev wrapper; CI: qa/L0_jax_distributed_unittest/test.sh)"
+echo "  mode                : $mode"
+echo "  marker filter       : ${marker_args[*]}"
+echo "  -k filter           : ${kfilter[*]:-<none>}"
+echo "  CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
+echo "  test file           : $TEST_FILE"
+echo "  XLA_PYTHON_CLIENT_PREALLOCATE: $XLA_PYTHON_CLIENT_PREALLOCATE"
+echo "  XLA_PYTHON_CLIENT_MEM_FRACTION: $XLA_PYTHON_CLIENT_MEM_FRACTION"
+echo "  CUDA_LAUNCH_BLOCKING: $CUDA_LAUNCH_BLOCKING"
+echo "  PYTEST_EXTRA_ARGS   : ${PYTEST_EXTRA_ARGS:-<unset>}"
+echo "============================================================"
+
+# IMPORTANT: keep the pytest invocation in lock-step with
+# qa/L0_jax_distributed_unittest/test.sh. The two scripts must call
+# pytest with identical flags so a dev-loop pass guarantees a CI pass.
+exec python3 -m pytest \
+    -c "$PYTEST_INI" \
+    "$TEST_FILE" \
+    "${marker_args[@]}" \
+    "${kfilter[@]}" \
+    -p no:typeguard \
+    -v -s \
+    ${PYTEST_EXTRA_ARGS:-}
diff --git a/tests/jax/run_multiprocess_moe_vjp.sh b/tests/jax/run_multiprocess_moe_vjp.sh
new file mode 100755
index 0000000000..252edbfdd4
--- /dev/null
+++ b/tests/jax/run_multiprocess_moe_vjp.sh
@@ -0,0 +1,153 @@
+#!/usr/bin/env bash
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+#
+# Multiprocess (one-GPU-per-process) launcher for the unified MoE VJP
+# smoke suite. See tests/jax/test_multiprocess_moe_vjp.py for *why* we
+# need this instead of -- or in addition to -- the single-process file.
+#
+# Pattern mirrors examples/jax/encoder/run_test_multiprocessing_encoder.sh:
+# fork one pytest invocation per visible GPU, pass each its own
+# --num-process=N --process-id=i, and wait for all of them. Each child
+# calls jax.distributed.initialize(..., local_device_ids=process_id) so
+# each Python process only sees its one GPU as a local device, the four
+# processes form a global 4-device mesh, and the JAX/XLA lazy-Triton-
+# load + active-NCCL deadlock (past_JAX_XLA_deadlock.txt, nvbug/5564750)
+# CANNOT occur: every process has its own CUDA driver context, so the
+# global module-load lock is not shared across the threads driving
+# different GPUs.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TE_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+TEST_FILE="$TE_ROOT/tests/jax/test_multiprocess_moe_vjp.py"
+PYTEST_INI="$TE_ROOT/tests/jax/pytest.ini"
+
+NUM_GPUS="${NUM_GPUS:-$(nvidia-smi -L | wc -l)}"
+if [ "$NUM_GPUS" -lt 4 ]; then
+    echo "[run_multiprocess_moe_vjp.sh] need >=4 GPUs (got $NUM_GPUS); aborting" >&2
+    exit 1
+fi
+
+mode="${1:-smoke}"
+case "$mode" in
+    smoke|level2) marker_args=("-m" "triton and not slow") ;;
+    perf|level3)  marker_args=("-m" "triton and slow") ;;
+    all)          marker_args=("-m" "triton") ;;
+    *)            marker_args=("-m" "triton") ;;
+esac
+
+export XLA_PYTHON_CLIENT_PREALLOCATE="${XLA_PYTHON_CLIENT_PREALLOCATE:-false}"
+export XLA_PYTHON_CLIENT_MEM_FRACTION="${XLA_PYTHON_CLIENT_MEM_FRACTION:-0.5}"
+export MOE_VJP_COORDINATOR_ADDRESS="${MOE_VJP_COORDINATOR_ADDRESS:-127.0.0.1:13456}"
+
+# We do NOT set CUDA_LAUNCH_BLOCKING=1 here. The whole point of this
+# launcher is that one-GPU-per-process makes the deadlock window
+# impossible without needing that workaround.
+
+echo "============================================================"
+echo "MoE VJP MULTIPROCESS smoke (one process per GPU, ${NUM_GPUS} GPUs)"
+echo "  mode               : $mode"
+echo "  marker filter      : ${marker_args[*]}"
+echo "  test file          : $TEST_FILE"
+echo "  coordinator        : $MOE_VJP_COORDINATOR_ADDRESS"
+echo "  XLA_PYTHON_CLIENT_PREALLOCATE: $XLA_PYTHON_CLIENT_PREALLOCATE"
+echo "  XLA_PYTHON_CLIENT_MEM_FRACTION: $XLA_PYTHON_CLIENT_MEM_FRACTION"
+echo "============================================================"
+
+# Per-process logs. MOE_VJP_MP_LOG_DIR can be set to a host-mounted dir
+# (e.g. when running inside a container that throws away /tmp on exit)
+# so logs survive for postmortem inspection. Defaults to a fresh /tmp.
+if [ -n "${MOE_VJP_MP_LOG_DIR:-}" ]; then
+    LOG_DIR="$MOE_VJP_MP_LOG_DIR"
+    mkdir -p "$LOG_DIR"
+else
+    LOG_DIR=$(mktemp -d -t moe_vjp_mp_XXXXXX)
+fi
+echo "Per-process logs: $LOG_DIR"
+
+PIDS=()
+
+cleanup() {
+    for pid in "${PIDS[@]:-}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            kill -TERM "$pid" 2>/dev/null || true
+        fi
+    done
+    sleep 1
+    for pid in "${PIDS[@]:-}"; do
+        if kill -0 "$pid" 2>/dev/null; then
+            kill -KILL "$pid" 2>/dev/null || true
+        fi
+    done
+}
+trap cleanup EXIT INT TERM
+
+# Launch one pytest per GPU. Process 0 streams to stdout; others log
+# only to file so the live output isn't a mosaic.
+for i in $(seq 0 $((NUM_GPUS - 1))); do
+    LOG_FILE="$LOG_DIR/proc_${i}.log"
+    PYTEST_CMD=(
+        python3 -m pytest -c "$PYTEST_INI"
+        "$TEST_FILE"
+        "${marker_args[@]}"
+        -p no:typeguard
+        -v -s
+        --num-process="$NUM_GPUS"
+        --process-id="$i"
+    )
+    if [ "$i" -eq 0 ]; then
+        echo "=== Live output from process 0 ==="
+        "${PYTEST_CMD[@]}" 2>&1 | tee "$LOG_FILE" &
+    else
+        "${PYTEST_CMD[@]}" > "$LOG_FILE" 2>&1 &
+    fi
+    PIDS+=("$!")
+done
+
+# Wait for all and collect exit codes.
+EXITS=()
+for pid in "${PIDS[@]}"; do
+    if wait "$pid"; then
+        EXITS+=("0")
+    else
+        EXITS+=("$?")
+    fi
+done
+
+# Summary.
+echo
+echo "============================================================"
+echo "Per-process exit codes:"
+for i in "${!EXITS[@]}"; do
+    echo "  proc $i -> ${EXITS[$i]}"
+done
+
+# Final pass/fail. Any non-zero in any process fails the suite, but
+# we tolerate non-zero on the non-zero processes only if proc 0
+# reports PASS (this matches the encoder launcher's logic). Simplest
+# strict rule: any non-zero is a failure.
+FAILED=0
+for e in "${EXITS[@]}"; do
+    if [ "$e" != "0" ]; then
+        FAILED=1
+        break
+    fi
+done
+
+echo
+if [ "$FAILED" -eq 0 ]; then
+    echo "[run_multiprocess_moe_vjp.sh] all processes PASSED"
+    if [ -z "${MOE_VJP_MP_LOG_DIR:-}" ]; then
+        rm -rf "$LOG_DIR"
+    fi
+    exit 0
+fi
+
+echo "[run_multiprocess_moe_vjp.sh] at least one process FAILED"
+echo "  retaining logs at $LOG_DIR for diagnosis"
+echo "  process 0 tail:"
+tail -20 "$LOG_DIR/proc_0.log" 2>/dev/null || true
+exit 1
diff --git a/tests/jax/standalone_smoke_triton.py b/tests/jax/standalone_smoke_triton.py
new file mode 100644
index 0000000000..5d74eda634
--- /dev/null
+++ b/tests/jax/standalone_smoke_triton.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Standalone equivalent of ``test_fwd_and_bwd_smoke[triton]``.
+
+This script runs *literally* the same code body as the pytest test but with
+no pytest, no conftest.py, no autouse fixtures, no plugins (jaxtyping,
+typeguard, forked, anyio). Run with:
+
+    XLA_PYTHON_CLIENT_PREALLOCATE=false \
+    XLA_PYTHON_CLIENT_MEM_FRACTION=0.5 \
+    NVTE_TRITON_PERMUTATION_BLOCK_SIZES=128 \
+    python3 tests/jax/standalone_smoke_triton.py 2>&1 | tee /tmp/standalone.log
+
+If this **passes** while
+``pytest -k 'test_fwd_and_bwd_smoke[triton]' tests/jax/test_distributed_moe_vjp.py``
+hangs, the bug is in pytest's plugin / conftest layer (likely the
+``import transformer_engine.jax`` at conftest module-level race, the
+autouse ``clear_live_arrays`` fixture, the ``NVTE_FUSED_ATTN=1`` flip in
+``enable_fused_attn_after_hopper``, or a typeguard/jaxtyping wrapper).
+
+If this **also hangs**, the bug is in our application code; the
+distributed_triton_hang.py repro is missing whatever the actual triggering
+sequence is and we should add it here as a starting point.
+"""
+
+import os
+import sys
+import time
+import faulthandler
+import signal
+
+
+_WATCHDOG_SECS = int(os.environ.get("MOE_VJP_WATCHDOG_SECS", "60") or "0")
+faulthandler.enable()
+try:
+    faulthandler.register(signal.SIGUSR1, all_threads=True, chain=False)
+except (AttributeError, ValueError):
+    pass
+if _WATCHDOG_SECS > 0:
+    faulthandler.dump_traceback_later(_WATCHDOG_SECS, repeat=True)
+
+
+import jax
+import jax.numpy as jnp
+from jax.experimental import mesh_utils
+from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
+from flax.linen import partitioning as nn_partitioning
+
+
+# Identical config to TestMoeVjpDistributedSmoke.test_fwd_and_bwd_smoke[triton]
+EP_AXIS = "ep"
+FSDP_AXIS = "fsdp"
+EP_SIZE = 2
+FSDP_SIZE = 2
+LOGICAL_AXIS_RULES = (
+    ("exp", EP_AXIS),
+    ("embed", FSDP_AXIS),
+    ("mlp", None),
+    ("batch", (EP_AXIS, FSDP_AXIS)),
+)
+SMOKE_BATCH = 4
+SMOKE_SEQ = 16
+SMOKE_HIDDEN = 32
+SMOKE_INTER = 64
+SMOKE_NUM_EXPERTS = 4
+SMOKE_TOPK = 2
+
+
+def _hb(msg: str) -> None:
+    sys.stdout.write(f"  [{time.strftime('%H:%M:%S')}] {msg}\n")
+    sys.stdout.flush()
+
+
+def main() -> int:
+    if jax.device_count() < EP_SIZE * FSDP_SIZE:
+        _hb(f"FATAL: need {EP_SIZE*FSDP_SIZE} devices, have {jax.device_count()}")
+        return 1
+
+    _hb("import transformer_engine.jax")
+    from transformer_engine.jax.flax import _MoEBlock as MoEBlock
+    from transformer_engine.jax.moe import PermutationBackend
+    from transformer_engine.jax.sharding import MeshResource, global_shard_guard
+
+    _hb("building mesh")
+    devices = mesh_utils.create_device_mesh((EP_SIZE, FSDP_SIZE))
+    mesh = Mesh(devices, axis_names=(EP_AXIS, FSDP_AXIS))
+
+    _hb("building block")
+    block = MoEBlock(
+        num_experts=SMOKE_NUM_EXPERTS,
+        num_experts_per_tok=SMOKE_TOPK,
+        intermediate_size=SMOKE_INTER,
+        permutation_backend=PermutationBackend.TRITON,
+        data_parallelism_axes=(FSDP_AXIS,),
+        aux_loss_coeff=0.0,
+        dtype=jnp.bfloat16,
+        _align_size=0,
+    )
+
+    x = jax.random.normal(
+        jax.random.PRNGKey(0),
+        (SMOKE_BATCH, SMOKE_SEQ, SMOKE_HIDDEN),
+        dtype=jnp.bfloat16,
+    )
+
+    _hb("entering mesh + global_shard_guard + axis_rules")
+    with mesh, global_shard_guard(
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x_sh = jax.lax.with_sharding_constraint(
+            x, NamedSharding(mesh, P((EP_AXIS, FSDP_AXIS), None, None))
+        )
+        _hb("  -> jit(block.init)")
+        variables = jax.jit(block.init)(jax.random.PRNGKey(1), x_sh)
+        jax.block_until_ready(jax.tree_util.tree_leaves(variables)[0])
+        _hb("  -> jit(block.apply)")
+        output, aux = jax.jit(block.apply)(variables, x_sh)
+        jax.block_until_ready(output)
+    _hb(f"apply done -- output.shape={output.shape}, aux={aux}")
+
+    # Grad step (matches _grad_step in the test)
+    _hb("entering mesh ctx for grad")
+    with mesh, global_shard_guard(
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x_sh = jax.lax.with_sharding_constraint(
+            x, NamedSharding(mesh, P((EP_AXIS, FSDP_AXIS), None, None))
+        )
+
+        def loss_fn(variables, x):
+            out, a = block.apply(variables, x)
+            main = jnp.mean(out.astype(jnp.float32) ** 2)
+            return main + (a.astype(jnp.float32) if a is not None else 0.0)
+
+        _hb("  -> jit(grad(loss_fn))")
+        grads = jax.jit(jax.grad(loss_fn))(variables, x_sh)
+        jax.block_until_ready(jax.tree_util.tree_leaves(grads)[0])
+    _hb("grad done")
+
+    _hb("SUCCESS: standalone triton smoke completed")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/jax/test_distributed_moe_vjp.py b/tests/jax/test_distributed_moe_vjp.py
new file mode 100644
index 0000000000..34a463b085
--- /dev/null
+++ b/tests/jax/test_distributed_moe_vjp.py
@@ -0,0 +1,608 @@
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Multi-GPU (single-host) tests for the unified MoE custom_vjp.
+
+Targets a 4-GPU box (e.g. a single GB200/B200 node) running a 2x2 mesh
+``("ep", "fsdp")``. Two test classes:
+
+* :class:`TestMoeVjpDistributedSmoke` -- "Level 2" structural / numerical
+  correctness checks. Small shapes, fast. Verifies that
+  :func:`transformer_engine.jax.moe.moe`'s internal ``shard_map`` opens
+  cleanly under EP + FSDP-of-batch, that ``out_specs`` matches the
+  returned ``ctx`` pytree, that gradients are finite, and that
+  ``PURE_JAX`` and ``TRITON`` backends agree.
+
+* :class:`TestMoeVjpDistributedPerf` -- "Level 3" Mixtral-ish-shape
+  throughput + multi-step training-loop simulation. Reports
+  ``tokens/sec``. Marked ``slow``; opt in with ``-m slow``.
+
+Both share the same fixture-built mesh and ``MeshResource`` context. The
+file is intentionally self-contained: no MaxText dependency, just JAX +
+TE. To compare end-to-end against a MaxText / MoEBlock baseline, run the
+existing ``test_perm.sh`` / ``test_router.sh`` scripts in the maxtext
+repo (those drive a real Mixtral training step and report tokens/sec).
+
+How to run
+----------
+
+CI invocation (canonical):
+
+    bash qa/L0_jax_distributed_unittest/test.sh
+
+Dev-loop invocation (thin shim around the same pytest command):
+
+    bash tests/jax/run_distributed_moe_vjp.sh smoke
+
+Both scripts apply ``-p no:typeguard`` AND ``CUDA_LAUNCH_BLOCKING=1`` --
+see "CRITICAL" below.
+
+Raw pytest invocation (do NOT use this in CI; only for one-off dev
+work where you understand the gotchas):
+
+    CUDA_LAUNCH_BLOCKING=1 \
+        XLA_PYTHON_CLIENT_PREALLOCATE=false XLA_PYTHON_CLIENT_MEM_FRACTION=0.5 \
+        python -m pytest -c tests/jax/pytest.ini -v -s \
+        -p no:typeguard \
+        tests/jax/test_distributed_moe_vjp.py
+
+The combination of ``XLA_PYTHON_CLIENT_PREALLOCATE=false`` (set at the
+top of this file) and tests deliberately structured so each
+parametrize variant only compiles the MoE custom_vjp once means a
+single process runs the entire smoke suite in well under a minute,
+even with CUDA_LAUNCH_BLOCKING=1.
+
+CRITICAL: ``CUDA_LAUNCH_BLOCKING=1`` is REQUIRED
+------------------------------------------------
+
+Without ``CUDA_LAUNCH_BLOCKING=1`` the bwd of the triton backend hangs
+forever: MainThread parks in ``_pjit_call_impl_python``, one GPU pins
+at 100%, three GPUs sit idle, no NCCL ops are ever enqueued.
+
+Root cause: an async-dispatch race between our Triton custom_calls and
+the downstream NCCL collectives in the same shard_map body. Our
+permute/unpermute Triton kernels use ``input_output_aliases`` on a
+pre-zeroed output buffer (see
+``transformer_engine/jax/triton_extensions/permutation.py`` ::
+``PermuteWithMaskMapPrimitive`` / ``SortChunksByMapPrimitive``).
+XLA's dependency tracker mis-handles the cross-stream sync edge
+between such an aliased custom_call and the immediately-following
+``jax.lax.ragged_all_to_all`` -- the all_to_all is launched on the
+NCCL communicator stream before the Triton kernel finishes writing
+sorted_inputs on the compute stream, so different ranks read different
+versions of the per-expert token counts and NCCL deadlocks.
+
+Empirically confirmed on dlcluster (GB200, jaxlib 0.10.1.dev20260519):
+* CUDA_LAUNCH_BLOCKING unset -> triton smoke + bwd hang at 300 s
+  (watchdog stacks identical every interval, only MainThread visible
+  in Python -- the XLA worker threads are in C++)
+* CUDA_LAUNCH_BLOCKING=1 -> smoke suite passes in <1 min across 3
+  consecutive runs.
+
+Workaround: ``CUDA_LAUNCH_BLOCKING=1`` is exported by both
+``qa/L0_jax_distributed_unittest/test.sh`` and
+``tests/jax/run_distributed_moe_vjp.sh``. The runtime cost for these
+correctness shapes is negligible. The proper fix is to either (a)
+teach ``triton_call_lowering`` to emit the right stream-sync edges,
+or (b) file an upstream JAX FFI bug for ``operand_output_aliases``
+with ``api_version=2`` interacting with NCCL collectives. TODO when
+the team prioritises a clean fix.
+
+``-p no:typeguard`` is also passed defensively
+---------------------------------------------
+
+A separate historical issue: jaxtyping's pytest plugin auto-loads
+typeguard, whose @typechecked import hook wraps every annotated TE /
+JAX / Flax callable and can perform isinstance() checks on JAX
+tracers during shard_map tracing. We've never been able to fully
+rule out an interaction with the async-dispatch path, so the wrappers
+disable typeguard for this file only (other jax tests still rely on
+it for type-hint validation). After the proper fix for the
+async-dispatch race lands, ``-p no:typeguard`` may become redundant,
+but for now we keep it.
+
+This is the first TE test that combines (a) Triton autotuned kernels
+with ``input_output_aliases``, (b) ``shard_map`` body, (c) NCCL
+collectives (``ragged_all_to_all``, ``all_gather``), (d) ``custom_vjp``,
+and (e) JAX async dispatch. None of the previous JAX tests exercised
+this combination, which is why the interaction was not observed
+before.
+
+Heavier opt-in: pass ``--forked`` (requires ``pip install --user
+pytest-forked``) to fork a fresh Python/JAX/XLA process per test
+variant. Rarely needed now, useful for bisecting suspected
+cross-test state leakage.
+"""
+
+import os
+
+# IMPORTANT: configure JAX's HBM allocator BEFORE jax is imported.
+#
+# By default JAX preallocates ~90% of every visible GPU's HBM. That
+# pool is fixed for the life of the process, so NCCL is starved for
+# even the few KiB it needs to set up the EP communicator. On B200 /
+# GB200 nodes (192 GiB HBM, possibly shared with another tenant) we
+# saw NCCL `ncclCommInitRankConfig` fail with "Failed to CUDA calloc
+# async 1216 bytes" -- 1 KiB! -- which then either crashes the test
+# or deadlocks the all-to-all rendezvous because every rank waits
+# forever for the leader stuck inside a failed NCCL init.
+#
+# Fix: disable preallocation entirely so JAX grows its pool on demand
+# and always leaves room for NCCL allocations. The mem-fraction is
+# still respected as a *cap*, so we also set it conservatively to
+# leave headroom both for NCCL and for a co-resident tenant on
+# shared clusters (e.g. prenyx batch partition).
+#
+# Users can override either knob in their env before invoking pytest
+# if they know their node is dedicated and want max throughput.
+os.environ.setdefault("XLA_PYTHON_CLIENT_PREALLOCATE", "false")
+os.environ.setdefault("XLA_PYTHON_CLIENT_MEM_FRACTION", "0.5")
+
+# NOTE: ``NVTE_TRITON_PERMUTATION_BLOCK_SIZES`` (which shrinks the
+# autotune sweep from 7 configs to 1 to skip 2-5 min of cold MLIR
+# compiles) MUST be set BEFORE Python starts because
+# ``tests/jax/conftest.py`` does ``import transformer_engine.jax`` at
+# collection time -- by the time this test file's top level runs, the
+# ``triton.autotune(configs=[...])`` wrappers have already frozen.
+# The ``run_distributed_moe_vjp.sh`` wrapper exports it for us; if
+# running pytest directly, export it manually:
+#     export NVTE_TRITON_PERMUTATION_BLOCK_SIZES=128
+
+import sys
+import time
+from typing import Optional
+
+# ---------------------------------------------------------------------------
+# Deadlock diagnostics. Install Python's built-in ``faulthandler`` BEFORE
+# any heavy imports so a hang in jax/xla/triton/nccl can always be turned
+# into a Python+C stack dump on demand. Two channels:
+#
+#   1. ``faulthandler.dump_traceback_later(N, repeat=True)`` -- prints all
+#      thread stacks every ``MOE_VJP_WATCHDOG_SECS`` seconds. Off by
+#      default; opt in with e.g. ``MOE_VJP_WATCHDOG_SECS=120``.
+#   2. SIGUSR1 handler -- ``kill -USR1 <pid>`` dumps stacks once. Always
+#      installed; zero overhead until the signal fires. Combined with the
+#      ``_hb`` heartbeat (prints elapsed wallclock every few lines) this
+#      lets you distinguish "slow compile" from "deadlock" in under a
+#      minute without needing gdb / py-spy / ptrace permissions.
+# ---------------------------------------------------------------------------
+import faulthandler
+import signal
+
+faulthandler.enable()  # crash -> stack to stderr (no-op if already enabled)
+try:
+    faulthandler.register(signal.SIGUSR1, all_threads=True, chain=False)
+except (AttributeError, ValueError):
+    # Windows or signal already taken; not fatal.
+    pass
+
+_WATCHDOG_SECS = int(os.environ.get("MOE_VJP_WATCHDOG_SECS", "0") or "0")
+if _WATCHDOG_SECS > 0:
+    # dump_traceback_later already dumps every Python thread by default
+    # (the keyword arg `all_threads` does NOT exist on this function --
+    # don't be fooled by faulthandler.register which has one). XLA's
+    # worker threads are C++ only and won't appear here; for those use
+    # py-spy or gdb if a future hang needs deeper investigation.
+    faulthandler.dump_traceback_later(_WATCHDOG_SECS, repeat=True)
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+import pytest
+
+from jax.experimental import mesh_utils
+from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
+from flax.linen import partitioning as nn_partitioning
+
+
+# Optional XLA persistent compilation cache. ~3-5x speed-up on the
+# second and subsequent runs of this file in the same environment
+# (e.g. across CI invocations sharing a /lustre mount). Off by default
+# so a fresh checkout doesn't silently pick up artifacts from a
+# stale TE build. Set ``MOE_VJP_COMPILE_CACHE_DIR=/some/path`` to opt
+# in; we recommend a path on shared persistent storage in CI (e.g.
+# ``/lustre/.../jax_compile_cache``) and a per-user path on workstations.
+_compile_cache_dir = os.environ.get("MOE_VJP_COMPILE_CACHE_DIR")
+if _compile_cache_dir:
+    # Bump min size to 0 so even small jit'd helpers are cached, and
+    # min entry size so single-device jit's qualify too. Default in
+    # newer JAX is a multi-megabyte threshold that excludes most of
+    # the test scaffolding (init, small reductions, etc.).
+    jax.config.update("jax_compilation_cache_dir", _compile_cache_dir)
+    jax.config.update("jax_persistent_cache_min_compile_time_secs", 0)
+    jax.config.update("jax_persistent_cache_min_entry_size_bytes", 0)
+    # Triton has its own per-kernel ptx/cubin cache (separate from
+    # XLA's). Steer it to a sibling subdir so first-run Triton
+    # autotune + compile artifacts also survive across runs. Massive
+    # win on cold-start because every triton.autotune kernel evaluates
+    # ~7 BLOCK_SIZE configs by compiling and timing each on the GPU.
+    os.environ.setdefault("TRITON_CACHE_DIR", os.path.join(_compile_cache_dir, "triton"))
+
+
+NUM_DEVICES_REQUIRED = 4
+EP_AXIS = "ep"
+FSDP_AXIS = "fsdp"
+EP_SIZE = 2
+FSDP_SIZE = 2
+
+# Logical -> mesh axis rules. Match the TE / MaxText convention used by
+# ``_MoEBlock``'s default ``wi_kernel_axes=("exp", "embed", "mlp")`` /
+# ``wo_kernel_axes=("exp", "mlp", "embed")``.
+LOGICAL_AXIS_RULES = (
+    ("exp", EP_AXIS),
+    ("embed", FSDP_AXIS),
+    ("mlp", None),
+    ("batch", (EP_AXIS, FSDP_AXIS)),
+)
+
+
+@pytest.fixture(scope="module")
+def mesh():
+    if jax.device_count() < NUM_DEVICES_REQUIRED:
+        pytest.skip(
+            f"Need >={NUM_DEVICES_REQUIRED} devices for ep={EP_SIZE} x fsdp={FSDP_SIZE};"
+            f" have {jax.device_count()}"
+        )
+    devices = mesh_utils.create_device_mesh((EP_SIZE, FSDP_SIZE))
+    return Mesh(devices, axis_names=(EP_AXIS, FSDP_AXIS))
+
+
+@pytest.fixture(autouse=True, scope="function")
+def _inject_moe(request):
+    """Lazy-load TE MoE symbols only for ``triton``-marked tests so this
+    file imports cleanly in environments without the fused-router CUDA
+    kernel built.
+
+    Aggressive cache cleanup between tests is OPT-IN via the
+    ``MOE_VJP_AGGRESSIVE_CLEANUP=1`` env var. Off by default because:
+
+    * with ``XLA_PYTHON_CLIENT_PREALLOCATE=false`` (set at the top of
+      this file), JAX no longer hoards HBM so back-to-back compiled
+      executables in the same process do not OOM NCCL;
+    * keeping JAX's compilation cache alive across tests lets ``jit``
+      re-hit on identical closures (e.g. the same ``block.apply``
+      bound method), saving the full custom_vjp recompile.
+
+    Set ``MOE_VJP_AGGRESSIVE_CLEANUP=1`` if you suspect a test is
+    leaking state into the next one and want to bisect.
+    """
+    if os.environ.get("MOE_VJP_AGGRESSIVE_CLEANUP") == "1":
+        import gc
+
+        jax.clear_caches()
+        gc.collect()
+
+    if not request.node.get_closest_marker("triton"):
+        yield
+        return
+    from transformer_engine.jax.flax import _MoEBlock as MoEBlock
+    from transformer_engine.jax.moe import PermutationBackend
+    from transformer_engine.jax.sharding import MeshResource, global_shard_guard
+
+    mod = sys.modules[__name__]
+    mod.MoEBlock = MoEBlock
+    mod.PermutationBackend = PermutationBackend
+    mod.MeshResource = MeshResource
+    mod.global_shard_guard = global_shard_guard
+    yield
+    if os.environ.get("MOE_VJP_AGGRESSIVE_CLEANUP") == "1":
+        import gc
+
+        gc.collect()
+
+
+# -----------------------------------------------------------------------------
+# Helpers
+# -----------------------------------------------------------------------------
+
+
+def _make_block(
+    *,
+    num_experts: int,
+    num_experts_per_tok: int,
+    intermediate_size: int,
+    permutation_backend,
+    aux_loss_coeff: float = 0.0,
+    dtype=jnp.bfloat16,
+    align_size: int = 0,
+):
+    return MoEBlock(  # noqa: F821 -- injected by fixture
+        num_experts=num_experts,
+        num_experts_per_tok=num_experts_per_tok,
+        intermediate_size=intermediate_size,
+        permutation_backend=permutation_backend,
+        data_parallelism_axes=(FSDP_AXIS,),
+        aux_loss_coeff=aux_loss_coeff,
+        dtype=dtype,
+        _align_size=align_size,
+    )
+
+
+def _shard_inputs(x, mesh):
+    """Constrain ``x`` to ``P(("ep", "fsdp"), None, None)``."""
+    return jax.lax.with_sharding_constraint(
+        x, NamedSharding(mesh, P((EP_AXIS, FSDP_AXIS), None, None))
+    )
+
+
+def _hb(msg: str):
+    """Heartbeat print so a slow JAX/Triton compile shows progress in
+    the pytest log instead of looking like a hang. Each line gets
+    a wall-clock timestamp so the user can tell which step is the
+    expensive one."""
+    print(f"  [{time.strftime('%H:%M:%S')}] {msg}", flush=True)
+
+
+def _init_apply(block, mesh, x, key):
+    """Init + apply with logical_axis_rules + MeshResource context."""
+    with mesh, global_shard_guard(  # noqa: F821
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)  # noqa: F821
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x = _shard_inputs(x, mesh)
+        _hb("  -> jit(block.init)")
+        variables = jax.jit(block.init)(key, x)
+        jax.block_until_ready(jax.tree_util.tree_leaves(variables)[0])
+        _hb("  -> jit(block.apply) (fwd compile + Triton autotune if first call)")
+        output, aux = jax.jit(block.apply)(variables, x)
+        jax.block_until_ready(output)
+    return variables, output, aux
+
+
+def _grad_step(block, variables, mesh, x):
+    with mesh, global_shard_guard(  # noqa: F821
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)  # noqa: F821
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x = _shard_inputs(x, mesh)
+
+        def loss_fn(variables, x):
+            output, aux = block.apply(variables, x)
+            main = jnp.mean(output.astype(jnp.float32) ** 2)
+            return main + (aux.astype(jnp.float32) if aux is not None else 0.0)
+
+        _hb("  -> jit(grad(loss_fn)) (fwd+bwd compile + Triton autotune if first call)")
+        grads = jax.jit(jax.grad(loss_fn))(variables, x)
+        jax.block_until_ready(jax.tree_util.tree_leaves(grads)[0])
+        return grads
+
+
+def _unwrap(x):
+    return x.value if hasattr(x, "value") else x
+
+
+# -----------------------------------------------------------------------------
+# Level 2: smoke / correctness
+# -----------------------------------------------------------------------------
+
+
+# Smoke shapes are deliberately the smallest that still exercises every
+# code path (FSDP-of-batch, EP-of-experts, top-k>1 routing, alignment
+# padding -- though that one is off by default since align_size=0).
+# Keeping these small directly cuts cold-compile time:
+#   * Triton autotune times each BLOCK_SIZE config on the actual data;
+#     16-token-per-shard inputs finish autotune in seconds instead of
+#     minutes.
+#   * XLA's HLO optimization passes process less data per pass.
+# Constraints:
+#   * SMOKE_BATCH must be a multiple of EP_SIZE * FSDP_SIZE = 4
+#     (one micro-batch per device is the floor).
+#   * SMOKE_NUM_EXPERTS must be a multiple of EP_SIZE = 2.
+#   * SMOKE_HIDDEN and SMOKE_INTER should be multiples of 16 for bf16
+#     GEMM alignment (32 is the practical floor).
+SMOKE_BATCH = EP_SIZE * FSDP_SIZE  # 4 -- one micro-batch per device
+SMOKE_SEQ = 16
+SMOKE_HIDDEN = 32
+SMOKE_INTER = 64
+SMOKE_NUM_EXPERTS = 4  # divisible by EP_SIZE=2; 2 experts per shard locally
+SMOKE_TOPK = 2
+
+
+@pytest.mark.triton
+class TestMoeVjpDistributedSmoke:
+    """Level 2: structural + numerical correctness on 2x2 (ep, fsdp) mesh.
+
+    Test design notes (CI-friendly):
+
+    * The MoE custom_vjp compiles into a single large XLA executable.
+      Each ``jax.jit``-wrapped invocation in a test triggers a fresh
+      compile, so the number of compiles -- not the kernel shapes --
+      dominates wall-clock time. We therefore compile **once per
+      backend** and check fwd, bwd, and aux_loss within the same test.
+    * The aux_loss path adds a second collective and a second
+      ``fused_topk`` compile, so it would also be one extra compile per
+      backend. We accept that cost as one extra parametrized test
+      rather than rolling it into the main smoke (so a future
+      aux-specific regression is reported cleanly).
+    * The parity test deliberately compiles both backends a third
+      time; that is unavoidable because by definition it needs both
+      implementations side-by-side. Marked with a separate name so it
+      can be skipped (``-k 'not parity'``) when bandwidth is tight.
+    """
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_fwd_and_bwd_smoke(self, mesh, backend_name):
+        """One combined smoke check per backend: fwd shape / dtype /
+        finiteness AND bwd finiteness + non-zero on every learnable
+        parameter. Two compiles per backend (init+apply and grad)."""
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        block = _make_block(
+            num_experts=SMOKE_NUM_EXPERTS,
+            num_experts_per_tok=SMOKE_TOPK,
+            intermediate_size=SMOKE_INTER,
+            permutation_backend=backend,
+        )
+        x = jax.random.normal(
+            jax.random.PRNGKey(0),
+            (SMOKE_BATCH, SMOKE_SEQ, SMOKE_HIDDEN),
+            dtype=jnp.bfloat16,
+        )
+        variables, output, aux = _init_apply(block, mesh, x, jax.random.PRNGKey(1))
+        # ---- Fwd checks ----
+        assert output.shape == x.shape, f"got {output.shape}, want {x.shape}"
+        assert output.dtype == x.dtype
+        assert jnp.all(jnp.isfinite(output)).item(), "output has NaN/Inf"
+        assert aux is None, "no aux_loss expected when aux_loss_coeff=0"
+        # ---- Bwd checks ----
+        grads = _grad_step(block, variables, mesh, x)
+        for name in ("gate_kernel", "wi_0", "wi_1", "wo"):
+            g = _unwrap(grads["params"][name])
+            assert jnp.all(jnp.isfinite(g)).item(), f"{name} grad has NaN/Inf"
+            assert jnp.any(g != 0.0).item(), f"{name} grad is identically zero"
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_aux_loss_smoke(self, mesh, backend_name):
+        """Aux-loss path: scalar returned + finite + gate receives a
+        gradient through the aux branch."""
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        block = _make_block(
+            num_experts=SMOKE_NUM_EXPERTS,
+            num_experts_per_tok=SMOKE_TOPK,
+            intermediate_size=SMOKE_INTER,
+            permutation_backend=backend,
+            aux_loss_coeff=1e-2,
+        )
+        x = jax.random.normal(
+            jax.random.PRNGKey(4),
+            (SMOKE_BATCH, SMOKE_SEQ, SMOKE_HIDDEN),
+            dtype=jnp.bfloat16,
+        )
+        variables, output, aux = _init_apply(block, mesh, x, jax.random.PRNGKey(5))
+        assert output.shape == x.shape
+        assert aux is not None
+        assert aux.shape == ()
+        assert jnp.isfinite(aux).item()
+        grads = _grad_step(block, variables, mesh, x)
+        g_gate = _unwrap(grads["params"]["gate_kernel"])
+        assert jnp.all(jnp.isfinite(g_gate)).item(), "gate grad NaN/Inf under aux"
+
+    def test_pure_jax_triton_parity(self, mesh):
+        """Same params, swap backend, output + grads must match within
+        bf16 tolerance."""
+        block_pj = _make_block(
+            num_experts=SMOKE_NUM_EXPERTS,
+            num_experts_per_tok=SMOKE_TOPK,
+            intermediate_size=SMOKE_INTER,
+            permutation_backend=PermutationBackend.PURE_JAX,  # noqa: F821
+        )
+        block_tr = _make_block(
+            num_experts=SMOKE_NUM_EXPERTS,
+            num_experts_per_tok=SMOKE_TOPK,
+            intermediate_size=SMOKE_INTER,
+            permutation_backend=PermutationBackend.TRITON,  # noqa: F821
+        )
+        x = jax.random.normal(
+            jax.random.PRNGKey(6),
+            (SMOKE_BATCH, SMOKE_SEQ, SMOKE_HIDDEN),
+            dtype=jnp.bfloat16,
+        )
+        # Share parameter init across the two blocks so routing + FFN
+        # are identical and only the dispatch/combine implementation
+        # differs.
+        variables, out_pj, _ = _init_apply(block_pj, mesh, x, jax.random.PRNGKey(7))
+        with mesh, global_shard_guard(  # noqa: F821
+            MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)  # noqa: F821
+        ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+            x_sh = _shard_inputs(x, mesh)
+            out_tr, _ = jax.jit(block_tr.apply)(variables, x_sh)
+
+        diff = float(jnp.max(jnp.abs(out_pj - out_tr)))
+        # bf16 compounded error budget on these shapes -- matches the
+        # threshold the old single-device test used.
+        assert diff < 5e-2, f"forward parity breach: max_abs_diff={diff}"
+
+        grads_pj = _grad_step(block_pj, variables, mesh, x)
+        grads_tr = _grad_step(block_tr, variables, mesh, x)
+        for name in ("gate_kernel", "wi_0", "wi_1", "wo"):
+            g_pj = _unwrap(grads_pj["params"][name])
+            g_tr = _unwrap(grads_tr["params"][name])
+            d = float(jnp.max(jnp.abs(g_pj - g_tr)))
+            assert d < 5e-2, f"grad parity breach on {name}: max_abs_diff={d}"
+
+
+# -----------------------------------------------------------------------------
+# Level 3: Mixtral-ish-shape throughput
+# -----------------------------------------------------------------------------
+#
+# Shapes intentionally smaller than full Mixtral-8x7B (which would need
+# ~16 GPUs for a single replica) so the test fits on a single 4-GPU box
+# at bf16. Keeps E=8, top_k=2, intermediate proportional to hidden,
+# batch large enough to saturate. Adjust ``PERF_*`` constants below if
+# you have more memory headroom.
+
+PERF_BATCH = EP_SIZE * FSDP_SIZE * 4  # 16
+PERF_SEQ = 2048
+PERF_HIDDEN = 1024
+PERF_INTER = 4096
+PERF_NUM_EXPERTS = 8
+PERF_TOPK = 2
+PERF_WARMUP_STEPS = 5
+PERF_TIMED_STEPS = 30
+
+
+@pytest.mark.triton
+@pytest.mark.slow
+class TestMoeVjpDistributedPerf:
+    """Level 3: tokens/sec on Mixtral-ish shapes."""
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_throughput(self, mesh, backend_name, capsys):
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        block = _make_block(
+            num_experts=PERF_NUM_EXPERTS,
+            num_experts_per_tok=PERF_TOPK,
+            intermediate_size=PERF_INTER,
+            permutation_backend=backend,
+        )
+        x = jax.random.normal(
+            jax.random.PRNGKey(0),
+            (PERF_BATCH, PERF_SEQ, PERF_HIDDEN),
+            dtype=jnp.bfloat16,
+        )
+
+        with mesh, global_shard_guard(  # noqa: F821
+            MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)  # noqa: F821
+        ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+            x = _shard_inputs(x, mesh)
+            variables = jax.jit(block.init)(jax.random.PRNGKey(1), x)
+
+            def loss_fn(variables, x):
+                output, _ = block.apply(variables, x)
+                return jnp.mean(output.astype(jnp.float32) ** 2)
+
+            # value_and_grad mirrors a real training step.
+            step = jax.jit(jax.value_and_grad(loss_fn))
+
+            # Warmup (compile + a few iters).
+            loss = None
+            for _ in range(PERF_WARMUP_STEPS):
+                loss, _ = step(variables, x)
+            loss.block_until_ready()
+
+            # Timed.
+            t0 = time.perf_counter()
+            for _ in range(PERF_TIMED_STEPS):
+                loss, _ = step(variables, x)
+            loss.block_until_ready()
+            elapsed = time.perf_counter() - t0
+
+        tokens_per_step = PERF_BATCH * PERF_SEQ
+        tokens_per_sec = (tokens_per_step * PERF_TIMED_STEPS) / elapsed
+        steps_per_sec = PERF_TIMED_STEPS / elapsed
+
+        # Use capsys.disabled() so the perf line shows up even when
+        # pytest captures stdout.
+        with capsys.disabled():
+            print(
+                f"\n[perf] backend={backend_name}"
+                f" batch={PERF_BATCH} seq={PERF_SEQ} hidden={PERF_HIDDEN}"
+                f" inter={PERF_INTER} E={PERF_NUM_EXPERTS} k={PERF_TOPK}"
+                f" mesh=ep{EP_SIZE}xfsdp{FSDP_SIZE}"
+                f"\n[perf]   loss(final)     = {float(loss):.6f}"
+                f"\n[perf]   elapsed         = {elapsed:.3f} s over"
+                f" {PERF_TIMED_STEPS} steps"
+                f"\n[perf]   steps/sec       = {steps_per_sec:.2f}"
+                f"\n[perf]   tokens/sec      = {tokens_per_sec:.0f}",
+                flush=True,
+            )
+
+        assert jnp.isfinite(loss).item(), "loss diverged during perf run"
diff --git a/tests/jax/test_moe_vjp.py b/tests/jax/test_moe_vjp.py
new file mode 100644
index 0000000000..92d95bc896
--- /dev/null
+++ b/tests/jax/test_moe_vjp.py
@@ -0,0 +1,449 @@
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Single-device tests for the unified MoE custom_vjp at
+``transformer_engine.jax.moe.moe`` (and its Flax wrapper
+``transformer_engine.jax.flax._MoEBlock``).
+
+Strategy
+--------
+
+Rather than reproducing every internal kernel residual, we rely on a
+single end-to-end pure-JAX *reference* implementation of the whole
+MoE block (``_pure_jax_moe_reference`` below) and compare the TE
+``moe(...)`` forward output AND parameter gradients against it. This
+gives us coverage of:
+
+* the gate GEMM,
+* the fused top-k routing primitive (and its bwd),
+* the dispatch / per-expert FFN / combine pipeline (and their bwds
+  threaded through the absorbed primitives),
+* the optional aux-loss path (and its bwd).
+
+The reference uses only ``jnp`` ops + ``jax.vjp``, so we get a
+"definitive" pullback to compare against without needing the TE
+primitive bwd kernels.
+
+Distributed (EP + FSDP) testing is intentionally NOT in this file --
+that needs a multi-device setup and lives in
+``tests/jax/test_distributed_moe_vjp.py`` (follow-up).
+"""
+
+from functools import partial
+from typing import Optional, Tuple
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+import pytest
+
+
+# Lazy import (mirrors the gating in the old test file): the underlying
+# kernels require triton + the fused-router CUDA kernel.
+@pytest.fixture(autouse=True, scope="function")
+def _inject_moe(request):
+    if not request.node.get_closest_marker("triton"):
+        yield
+        return
+    import sys
+    from transformer_engine.jax.flax import _MoEBlock as MoEBlock
+    from transformer_engine.jax.moe import PermutationBackend, moe
+
+    mod = sys.modules[__name__]
+    mod.MoEBlock = MoEBlock
+    mod.PermutationBackend = PermutationBackend
+    mod.moe = moe
+    yield
+
+
+# -----------------------------------------------------------------------------
+# Test config
+# -----------------------------------------------------------------------------
+
+DTYPE = jnp.float32  # use fp32 for tighter parity assertions
+BATCH_SIZE = 2
+SEQUENCE_LENGTH = 16
+HIDDEN_SIZE = 32
+INTERMEDIATE_SIZE = 64
+NUM_EXPERTS = 8
+NUM_EXPERTS_PER_TOK = 2
+
+
+def _make_inputs(key: jax.Array, *, batch=BATCH_SIZE, seq=SEQUENCE_LENGTH) -> jax.Array:
+    return jax.random.normal(key, (batch, seq, HIDDEN_SIZE), dtype=DTYPE)
+
+
+# -----------------------------------------------------------------------------
+# Pure-JAX reference MoE
+# -----------------------------------------------------------------------------
+#
+# Implements EXACTLY the same math as ``moe(...)`` for the no-EP,
+# softmax-routing, no-bias, silu activation, no-quantization path.
+# Returns ``(output, aux_loss_or_zero)``. Used as ground truth for both
+# fwd and bwd parity.
+
+
+@partial(
+    jax.jit,
+    static_argnames=("num_experts", "num_experts_per_tok", "aux_loss_coeff"),
+)
+def _pure_jax_moe_reference(
+    x: jnp.ndarray,
+    gate_kernel: jnp.ndarray,
+    wi_0: jnp.ndarray,
+    wi_1: jnp.ndarray,
+    wo: jnp.ndarray,
+    *,
+    num_experts: int,
+    num_experts_per_tok: int,
+    aux_loss_coeff: float = 0.0,
+) -> Tuple[jnp.ndarray, jnp.ndarray]:
+    """Reference no-EP MoE forward (pure JAX, no TE primitives).
+
+    Mirrors :func:`transformer_engine.jax.moe._body_fwd` for the
+    PURE_JAX backend, no biases, softmax routing, silu activation,
+    no quantization. Linear ops only -- ``jax.vjp`` over this gives
+    the canonical bwd to compare against.
+    """
+    B, S, H = x.shape
+    T = B * S
+    x_2d = x.reshape(T, H)
+
+    # Gate
+    logits = x_2d @ gate_kernel  # [T, E]
+
+    # Softmax + topk (no expert_bias, no grouping, scale=1.0)
+    probs_full = jax.nn.softmax(logits, axis=-1)  # [T, E]
+    # top-k by probability:
+    sorted_idx = jnp.argsort(probs_full, axis=-1)  # ascending
+    selected = sorted_idx[:, -num_experts_per_tok:]  # [T, K]
+    weights = jnp.take_along_axis(probs_full, selected, axis=-1)  # [T, K]
+    # Normalize topk weights to sum to 1 (matches softmax->topk semantics
+    # of fused_topk_with_score_function with use_pre_softmax=False):
+    weights = weights / jnp.sum(weights, axis=-1, keepdims=True)
+
+    # Build a sparse routing_map [T, E] with weights at selected positions
+    routing_weights_full = jnp.zeros_like(probs_full)
+    routing_weights_full = routing_weights_full.at[jnp.arange(T)[:, None], selected].set(weights)
+
+    # Per-expert FFN: replicate each token K times, gather by expert,
+    # run through wi_0 / wi_1 / wo, gather back, weighted-sum.
+    #
+    # Vectorize the gather without sorting: for each (token, slot k),
+    # multiply the corresponding expert's FFN by routing_weights[t, k]
+    # and sum over experts.
+    # x_2d: [T, H], wi_0: [E, H, M], wi_1: [E, H, M], wo: [E, M, H]
+    # For each expert e: layer_w0_e = x_2d @ wi_0[e]; layer_w1_e = x_2d @ wi_1[e]
+    #                    intermediate_e = silu(layer_w0_e) * layer_w1_e
+    #                    expert_out_e = intermediate_e @ wo[e]
+    # output[t, h] = sum_e routing_weights_full[t, e] * expert_out_e[t, h]
+    layer_w0 = jnp.einsum("th,ehm->tem", x_2d, wi_0)  # [T, E, M]
+    layer_w1 = jnp.einsum("th,ehm->tem", x_2d, wi_1)  # [T, E, M]
+    intermediate = jax.nn.silu(layer_w0) * layer_w1  # [T, E, M]
+    expert_out = jnp.einsum("tem,emh->teh", intermediate, wo)  # [T, E, H]
+    output_2d = jnp.einsum("te,teh->th", routing_weights_full, expert_out)  # [T, H]
+    output = output_2d.reshape(B, S, H)
+
+    if aux_loss_coeff > 0.0:
+        # aux scores: clean per-expert softmax (compute_aux_scores=True
+        # kernel uses a clean softmax, no bias, scale=1, no grouping).
+        aux_probs = jax.nn.softmax(logits.astype(jnp.float32), axis=-1)
+        # tokens_per_expert from REAL routing_map (post-grouping); here
+        # there's no grouping so == count of non-zero positions per expert.
+        routing_map = (routing_weights_full > 0).astype(jnp.int32)
+        tokens_per_expert = jnp.sum(routing_map, axis=0)  # [E]
+        # aux_loss formula: (E * coeff / (k * T^2)) * sum_e
+        # (sum_t aux_probs[t, e]) * tokens_per_expert[e]
+        sum_probs_per_expert = jnp.sum(aux_probs, axis=0)  # [E]
+        aux_loss = (num_experts * aux_loss_coeff / (num_experts_per_tok * (T**2))) * jnp.sum(
+            sum_probs_per_expert * tokens_per_expert.astype(jnp.float32)
+        )
+    else:
+        aux_loss = jnp.zeros((), dtype=DTYPE)
+
+    return output, aux_loss
+
+
+# -----------------------------------------------------------------------------
+# Helpers
+# -----------------------------------------------------------------------------
+
+
+def _init_params(key: jax.Array) -> dict:
+    k_g, k_w0, k_w1, k_wo = jax.random.split(key, 4)
+    init = jax.nn.initializers.variance_scaling(1.0, "fan_in", "truncated_normal")
+    return dict(
+        gate_kernel=init(k_g, (HIDDEN_SIZE, NUM_EXPERTS), DTYPE),
+        wi_0=init(k_w0, (NUM_EXPERTS, HIDDEN_SIZE, INTERMEDIATE_SIZE), DTYPE),
+        wi_1=init(k_w1, (NUM_EXPERTS, HIDDEN_SIZE, INTERMEDIATE_SIZE), DTYPE),
+        wo=init(k_wo, (NUM_EXPERTS, INTERMEDIATE_SIZE, HIDDEN_SIZE), DTYPE),
+    )
+
+
+@partial(jax.jit, static_argnames=("permutation_backend", "aux_loss_coeff"))
+def _run_te_moe(
+    x: jnp.ndarray,
+    params: dict,
+    *,
+    permutation_backend,
+    aux_loss_coeff: float = 0.0,
+) -> Tuple[jnp.ndarray, Optional[jnp.ndarray]]:
+    return moe(  # noqa: F821 -- injected by fixture
+        x,
+        params["gate_kernel"],
+        params["wi_0"],
+        params["wi_1"],
+        params["wo"],
+        num_experts=NUM_EXPERTS,
+        num_experts_per_tok=NUM_EXPERTS_PER_TOK,
+        activation_type="silu",
+        score_function="softmax",
+        use_pre_softmax=False,
+        scaling_factor=1.0,
+        aux_loss_coeff=aux_loss_coeff,
+        permutation_backend=permutation_backend,
+        align_size=0,
+        dtype=DTYPE,
+    )
+
+
+@partial(jax.jit, static_argnames=("permutation_backend", "aux_loss_coeff"))
+def _grads_te_main_loss(params, x, *, permutation_backend, aux_loss_coeff: float = 0.0):
+    """jit'd grad of ``mean(out**2)`` w.r.t. params (no aux contribution)."""
+
+    def loss(params, x):
+        out, _ = _run_te_moe(
+            x, params, permutation_backend=permutation_backend, aux_loss_coeff=aux_loss_coeff
+        )
+        return jnp.mean(out**2)
+
+    return jax.grad(loss)(params, x)
+
+
+@partial(jax.jit, static_argnames=("num_experts", "num_experts_per_tok", "aux_loss_coeff"))
+def _grads_ref_main_loss(params, x, *, num_experts, num_experts_per_tok, aux_loss_coeff=0.0):
+    """jit'd grad of ``mean(out**2)`` w.r.t. params on the pure-JAX ref."""
+
+    def loss(params, x):
+        out, _ = _pure_jax_moe_reference(
+            x,
+            **params,
+            num_experts=num_experts,
+            num_experts_per_tok=num_experts_per_tok,
+            aux_loss_coeff=aux_loss_coeff,
+        )
+        return jnp.mean(out**2)
+
+    return jax.grad(loss)(params, x)
+
+
+@partial(jax.jit, static_argnames=("permutation_backend",))
+def _grad_te_aux_only(params, x, *, permutation_backend):
+    """jit'd grad of just the aux loss scalar (no main contribution)."""
+
+    def aux_only(params, x):
+        _, aux = _run_te_moe(
+            x, params, permutation_backend=permutation_backend, aux_loss_coeff=1e-2
+        )
+        return aux.astype(jnp.float32)
+
+    return jax.grad(aux_only)(params, x)
+
+
+# -----------------------------------------------------------------------------
+# Tests
+# -----------------------------------------------------------------------------
+
+
+@pytest.mark.triton
+class TestMoeVjpForward:
+    """Forward shape / finiteness / parity vs pure-JAX reference."""
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_forward_shape_and_finite(self, backend_name):
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        key = jax.random.PRNGKey(0)
+        kp, kx = jax.random.split(key)
+        params = _init_params(kp)
+        x = _make_inputs(kx)
+        out, aux = _run_te_moe(x, params, permutation_backend=backend)
+        assert out.shape == x.shape
+        assert out.dtype == x.dtype
+        assert jnp.all(jnp.isfinite(out))
+        assert aux is None
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_forward_parity_vs_pure_jax_reference(self, backend_name):
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        key = jax.random.PRNGKey(1)
+        kp, kx = jax.random.split(key)
+        params = _init_params(kp)
+        x = _make_inputs(kx)
+        out_te, _ = _run_te_moe(x, params, permutation_backend=backend)
+        out_ref, _ = _pure_jax_moe_reference(
+            x,
+            **params,
+            num_experts=NUM_EXPERTS,
+            num_experts_per_tok=NUM_EXPERTS_PER_TOK,
+        )
+        # FP32, small shapes -> tight tolerance
+        np.testing.assert_allclose(np.array(out_te), np.array(out_ref), atol=2e-5, rtol=2e-5)
+
+    def test_pure_jax_triton_equivalence(self):
+        key = jax.random.PRNGKey(2)
+        kp, kx = jax.random.split(key)
+        params = _init_params(kp)
+        x = _make_inputs(kx)
+        out_pj, _ = _run_te_moe(
+            x, params, permutation_backend=PermutationBackend.PURE_JAX  # noqa: F821
+        )
+        out_tr, _ = _run_te_moe(
+            x, params, permutation_backend=PermutationBackend.TRITON  # noqa: F821
+        )
+        np.testing.assert_allclose(np.array(out_pj), np.array(out_tr), atol=2e-5, rtol=2e-5)
+
+
+@pytest.mark.triton
+class TestMoeVjpBackward:
+    """Backward parity vs pure-JAX reference (which uses ``jax.vjp`` over
+    plain JAX ops, giving us the canonical pullback)."""
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_grads_finite_and_nonzero(self, backend_name):
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        key = jax.random.PRNGKey(3)
+        kp, kx = jax.random.split(key)
+        params = _init_params(kp)
+        x = _make_inputs(kx)
+        grads = _grads_te_main_loss(params, x, permutation_backend=backend)
+        for name in ("gate_kernel", "wi_0", "wi_1", "wo"):
+            g = grads[name]
+            assert jnp.all(jnp.isfinite(g)), f"{name} grad has NaN/Inf"
+            assert jnp.any(g != 0.0), f"{name} grad is identically zero"
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_grads_match_pure_jax_reference(self, backend_name):
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        key = jax.random.PRNGKey(4)
+        kp, kx = jax.random.split(key)
+        params = _init_params(kp)
+        x = _make_inputs(kx)
+        grads_te = _grads_te_main_loss(params, x, permutation_backend=backend)
+        grads_ref = _grads_ref_main_loss(
+            params,
+            x,
+            num_experts=NUM_EXPERTS,
+            num_experts_per_tok=NUM_EXPERTS_PER_TOK,
+        )
+        # Loose-ish tol on grads: routing path has discrete topk so the
+        # softmax cotangent paths through the non-topk experts diverge
+        # slightly between TE (which uses the fused topk bwd) and the
+        # reference (which uses argsort-based take_along_axis).
+        # Tighter than the bf16 tests.
+        for name in ("wi_0", "wi_1", "wo"):
+            np.testing.assert_allclose(
+                np.array(grads_te[name]),
+                np.array(grads_ref[name]),
+                atol=5e-5,
+                rtol=5e-5,
+                err_msg=f"grad mismatch on {name}",
+            )
+        # Gate grad has more error budget because it propagates through
+        # the topk derivative kernel (which differs in zero-pattern
+        # treatment from a plain take_along_axis).
+        np.testing.assert_allclose(
+            np.array(grads_te["gate_kernel"]),
+            np.array(grads_ref["gate_kernel"]),
+            atol=5e-4,
+            rtol=5e-4,
+            err_msg="grad mismatch on gate_kernel",
+        )
+
+
+@pytest.mark.triton
+class TestMoeVjpAuxLoss:
+    """Aux-loss path: forward + grad parity."""
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_aux_loss_returned_and_finite(self, backend_name):
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        key = jax.random.PRNGKey(5)
+        kp, kx = jax.random.split(key)
+        params = _init_params(kp)
+        x = _make_inputs(kx)
+        _, aux = _run_te_moe(x, params, permutation_backend=backend, aux_loss_coeff=1e-2)
+        assert aux is not None
+        assert aux.shape == ()
+        assert jnp.isfinite(aux)
+        assert jnp.abs(aux) < 1e2
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_aux_loss_parity_vs_reference(self, backend_name):
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        key = jax.random.PRNGKey(6)
+        kp, kx = jax.random.split(key)
+        params = _init_params(kp)
+        x = _make_inputs(kx)
+        _, aux_te = _run_te_moe(x, params, permutation_backend=backend, aux_loss_coeff=1e-2)
+        _, aux_ref = _pure_jax_moe_reference(
+            x,
+            **params,
+            num_experts=NUM_EXPERTS,
+            num_experts_per_tok=NUM_EXPERTS_PER_TOK,
+            aux_loss_coeff=1e-2,
+        )
+        np.testing.assert_allclose(float(aux_te), float(aux_ref), atol=1e-5, rtol=1e-5)
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_aux_loss_grads_propagate_to_logits(self, backend_name):
+        """The aux-loss bwd path must produce non-zero gate-kernel grads
+        when only the aux-loss scalar is differentiated (no main-output
+        contribution)."""
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        key = jax.random.PRNGKey(7)
+        kp, kx = jax.random.split(key)
+        params = _init_params(kp)
+        x = _make_inputs(kx)
+        g_gate = _grad_te_aux_only(params, x, permutation_backend=backend)["gate_kernel"]
+        assert jnp.all(jnp.isfinite(g_gate))
+        assert jnp.any(
+            g_gate != 0.0
+        ), "aux_loss bwd should propagate to gate_kernel via fused_topk bwd"
+
+
+# -----------------------------------------------------------------------------
+# Flax wrapper smoke test
+# -----------------------------------------------------------------------------
+
+
+@pytest.mark.triton
+class TestMoEBlockFlaxWrapper:
+    """Sanity-check the thin Flax wrapper: forward + grad on init."""
+
+    def test_init_and_apply(self):
+        block = MoEBlock(  # noqa: F821
+            num_experts=NUM_EXPERTS,
+            num_experts_per_tok=NUM_EXPERTS_PER_TOK,
+            intermediate_size=INTERMEDIATE_SIZE,
+            permutation_backend=PermutationBackend.PURE_JAX,  # noqa: F821
+            dtype=DTYPE,
+        )
+        key = jax.random.PRNGKey(8)
+        ki, kx = jax.random.split(key)
+        x = _make_inputs(kx)
+        variables = jax.jit(block.init)(ki, x)
+        out, aux = jax.jit(block.apply)(variables, x)
+        assert out.shape == x.shape
+        assert aux is None
+
+        @jax.jit
+        def grad_fn(variables, x):
+            return jax.grad(lambda v, x: jnp.mean(block.apply(v, x)[0] ** 2))(variables, x)
+
+        grads = grad_fn(variables, x)
+        for name in ("gate_kernel", "wi_0", "wi_1", "wo"):
+            g = grads["params"][name]
+            g = g.value if hasattr(g, "value") else g
+            assert jnp.all(jnp.isfinite(g)), f"{name} grad NaN/Inf"
+            assert jnp.any(g != 0.0), f"{name} grad zero"
diff --git a/tests/jax/test_multiprocess_moe_vjp.py b/tests/jax/test_multiprocess_moe_vjp.py
new file mode 100644
index 0000000000..26c35d72b5
--- /dev/null
+++ b/tests/jax/test_multiprocess_moe_vjp.py
@@ -0,0 +1,401 @@
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Multi-process (one-GPU-per-process) tests for the unified MoE custom_vjp.
+
+This is the **multiprocess companion** to
+``test_distributed_moe_vjp.py``. The two files exercise the same code
+paths in :func:`transformer_engine.jax.moe.moe`, but they bootstrap JAX
+very differently:
+
+* ``test_distributed_moe_vjp.py`` is the **single-process, multi-GPU**
+  variant: one Python process sees all 4 GPUs as local devices, and
+  ``shard_map`` dispatches work to each. This is the simplest setup
+  but suffers from a CUDA-driver-level deadlock when **lazy Triton
+  module loading** interleaves with **active NCCL collectives** -- see
+  ``past_JAX_XLA_deadlock.txt`` for Olli Lupton's writeup
+  (nvbug/5564750). Workaround used there: ``CUDA_LAUNCH_BLOCKING=1``
+  to serialize every CUDA launch across all device threads.
+
+* THIS file is the **multi-process, one-GPU-per-process** variant.
+  Each pytest process binds to exactly one device via
+  ``jax.distributed.initialize(..., local_device_ids=process_id)`` and
+  the four processes form a global mesh through JAX's distributed
+  runtime. Because every process has its **own** CUDA driver context,
+  there is no global module-load lock shared between the threads
+  driving different GPUs, and the lazy-load deadlock window does not
+  exist. No ``CUDA_LAUNCH_BLOCKING=1`` is needed.
+
+Why we keep BOTH files in tree
+------------------------------
+
+* The single-process file remains the simpler thing to read and is
+  what most developers will reach for during dev-loop iteration; the
+  ``CUDA_LAUNCH_BLOCKING=1`` workaround keeps it green.
+* The multi-process file is what we want CI to run for guaranteed
+  green-without-workarounds correctness, and it is what
+  ``run_multiprocess_moe_vjp.sh`` (sister script to
+  ``examples/jax/encoder/run_test_multiprocessing_encoder.sh``)
+  invokes.
+* If/when the upstream JAX FFI bug is fixed (or XLA gains an
+  ``initialize`` stage for Triton custom_calls), we can collapse the
+  two by dropping ``CUDA_LAUNCH_BLOCKING=1`` from the single-process
+  file and deleting this one.
+
+How to run
+----------
+
+You typically do NOT invoke pytest on this file directly -- the
+``run_multiprocess_moe_vjp.sh`` launcher forks N pytest processes (one
+per visible GPU), passing ``--num-process=N --process-id=i`` to each.
+Driving it directly with only one process will skip every test
+because :func:`jax.distributed.initialize` will report
+``jax.device_count() == 1``.
+
+    bash tests/jax/run_multiprocess_moe_vjp.sh
+
+CI invocation lives in ``qa/L0_jax_distributed_unittest/test.sh``
+alongside the single-process file's invocation.
+"""
+
+import os
+
+# Same allocator config as the single-process variant: NCCL needs HBM
+# headroom that JAX's default 90% preallocation does not leave. Set
+# before any jax import below.
+os.environ.setdefault("XLA_PYTHON_CLIENT_PREALLOCATE", "false")
+os.environ.setdefault("XLA_PYTHON_CLIENT_MEM_FRACTION", "0.5")
+
+import sys
+import time
+
+import faulthandler
+import signal
+
+faulthandler.enable()
+try:
+    faulthandler.register(signal.SIGUSR1, all_threads=True, chain=False)
+except (AttributeError, ValueError):
+    pass
+
+_WATCHDOG_SECS = int(os.environ.get("MOE_VJP_WATCHDOG_SECS", "0") or "0")
+if _WATCHDOG_SECS > 0:
+    faulthandler.dump_traceback_later(_WATCHDOG_SECS, repeat=True)
+
+import jax
+import jax.numpy as jnp
+import numpy as np
+import pytest
+
+from jax.experimental import mesh_utils
+from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
+from flax.linen import partitioning as nn_partitioning
+
+
+# Per-process distributed bootstrap. Each pytest invocation initializes
+# JAX with exactly one local device (its assigned GPU). Once
+# initialized, the four processes form one global mesh of 4 devices.
+def _init_distributed(num_process: int, process_id: int) -> bool:
+    """Initialize jax.distributed for this pytest process.
+
+    Returns True if initialization succeeded (i.e. this is a real
+    multi-process launch), False if num_process == 0 / 1 meaning the
+    file is being collected without a launcher and tests should be
+    skipped at module level.
+    """
+    if num_process <= 1:
+        return False
+    coord = os.environ.get("MOE_VJP_COORDINATOR_ADDRESS", "127.0.0.1:1234")
+    jax.distributed.initialize(
+        coordinator_address=coord,
+        num_processes=num_process,
+        process_id=process_id,
+        local_device_ids=process_id,
+    )
+    assert jax.local_device_count() == 1, "one GPU per process is the whole point"
+    assert (
+        jax.device_count() == num_process
+    ), f"global device_count {jax.device_count()} != num_process {num_process}"
+    return True
+
+
+# Read --num-process / --process-id BEFORE pytest collects any tests so
+# we can fast-skip the whole module when not in a multiprocess launch.
+def _read_mp_options():
+    # Use pytest's option lookup via the request fixture isn't available
+    # at module top-level; parse argv ourselves the same way encoder
+    # test does. CLI form is e.g. "pytest ... --num-process=4 --process-id=0".
+    num = int(os.environ.get("MP_NUM_PROCESS", "0") or "0")
+    pid = int(os.environ.get("MP_PROCESS_ID", "0") or "0")
+    for i, a in enumerate(sys.argv):
+        if a.startswith("--num-process="):
+            num = int(a.split("=", 1)[1])
+        elif a == "--num-process" and i + 1 < len(sys.argv):
+            num = int(sys.argv[i + 1])
+        elif a.startswith("--process-id="):
+            pid = int(a.split("=", 1)[1])
+        elif a == "--process-id" and i + 1 < len(sys.argv):
+            pid = int(sys.argv[i + 1])
+    return num, pid
+
+
+_MP_NUM_PROCESS, _MP_PROCESS_ID = _read_mp_options()
+_MP_ACTIVE = _init_distributed(_MP_NUM_PROCESS, _MP_PROCESS_ID)
+
+if not _MP_ACTIVE:
+    # Skip the entire module if not launched via the multiprocess
+    # runner. Lets `pytest tests/jax/` collect this file harmlessly.
+    pytest.skip(
+        "test_multiprocess_moe_vjp.py requires the multiprocess launcher "
+        "(run_multiprocess_moe_vjp.sh). Skipping.",
+        allow_module_level=True,
+    )
+
+
+NUM_DEVICES_REQUIRED = 4
+EP_AXIS = "ep"
+FSDP_AXIS = "fsdp"
+EP_SIZE = 2
+FSDP_SIZE = 2
+
+LOGICAL_AXIS_RULES = (
+    ("exp", EP_AXIS),
+    ("embed", FSDP_AXIS),
+    ("mlp", None),
+    ("batch", (EP_AXIS, FSDP_AXIS)),
+)
+
+
+@pytest.fixture(scope="module")
+def mesh():
+    if jax.device_count() < NUM_DEVICES_REQUIRED:
+        pytest.skip(
+            f"Need >={NUM_DEVICES_REQUIRED} devices for ep={EP_SIZE} x fsdp={FSDP_SIZE};"
+            f" have {jax.device_count()}"
+        )
+    devices = mesh_utils.create_device_mesh((EP_SIZE, FSDP_SIZE))
+    return Mesh(devices, axis_names=(EP_AXIS, FSDP_AXIS))
+
+
+@pytest.fixture(autouse=True, scope="function")
+def _inject_moe(request):
+    if not request.node.get_closest_marker("triton"):
+        yield
+        return
+    from transformer_engine.jax.flax import _MoEBlock as MoEBlock
+    from transformer_engine.jax.moe import PermutationBackend
+    from transformer_engine.jax.sharding import MeshResource, global_shard_guard
+
+    mod = sys.modules[__name__]
+    mod.MoEBlock = MoEBlock
+    mod.PermutationBackend = PermutationBackend
+    mod.MeshResource = MeshResource
+    mod.global_shard_guard = global_shard_guard
+    yield
+
+
+# -----------------------------------------------------------------------------
+# Helpers (identical to the single-process file's helpers; copied rather
+# than imported because importing the single-process file would trigger
+# its own faulthandler/setup and confuse pytest collection).
+# -----------------------------------------------------------------------------
+
+
+def _make_block(
+    *,
+    num_experts,
+    num_experts_per_tok,
+    intermediate_size,
+    permutation_backend,
+    aux_loss_coeff=0.0,
+    dtype=jnp.bfloat16,
+    align_size=0,
+):
+    return MoEBlock(  # noqa: F821
+        num_experts=num_experts,
+        num_experts_per_tok=num_experts_per_tok,
+        intermediate_size=intermediate_size,
+        permutation_backend=permutation_backend,
+        data_parallelism_axes=(FSDP_AXIS,),
+        aux_loss_coeff=aux_loss_coeff,
+        dtype=dtype,
+        _align_size=align_size,
+    )
+
+
+def _shard_inputs(x, mesh):
+    return jax.lax.with_sharding_constraint(
+        x, NamedSharding(mesh, P((EP_AXIS, FSDP_AXIS), None, None))
+    )
+
+
+def _hb(msg):
+    if _MP_PROCESS_ID == 0:
+        print(f"  [{time.strftime('%H:%M:%S')}] {msg}", flush=True)
+
+
+def _init_apply(block, mesh, x, key):
+    with mesh, global_shard_guard(  # noqa: F821
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)  # noqa: F821
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x = _shard_inputs(x, mesh)
+        _hb("  -> jit(block.init)")
+        variables = jax.jit(block.init)(key, x)
+        jax.block_until_ready(jax.tree_util.tree_leaves(variables)[0])
+        _hb("  -> jit(block.apply)")
+        output, aux = jax.jit(block.apply)(variables, x)
+        jax.block_until_ready(output)
+    return variables, output, aux
+
+
+def _grad_step(block, variables, mesh, x):
+    with mesh, global_shard_guard(  # noqa: F821
+        MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)  # noqa: F821
+    ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+        x = _shard_inputs(x, mesh)
+
+        def loss_fn(variables, x):
+            output, aux = block.apply(variables, x)
+            main = jnp.mean(output.astype(jnp.float32) ** 2)
+            return main + (aux.astype(jnp.float32) if aux is not None else 0.0)
+
+        _hb("  -> jit(grad(loss_fn))")
+        grads = jax.jit(jax.grad(loss_fn))(variables, x)
+        jax.block_until_ready(jax.tree_util.tree_leaves(grads)[0])
+        return grads
+
+
+def _unwrap(x):
+    return x.value if hasattr(x, "value") else x
+
+
+def _local_shard(x):
+    """Return the local (this-process) shard of a global JAX Array as numpy.
+
+    Every assertion in this file is structural ("is this finite", "is this
+    non-zero", "is parity within 5e-2"). For all of these, checking the
+    *local* shard on each process is just as valid as gathering everything
+    to the host -- if any rank has NaN, that rank's assertion fires; if
+    any rank's parity diverges, that rank's assertion fires. We avoid
+    triggering a cross-process collective, which under JAX multi-host can
+    deadlock if procs disagree on the order in which they emit it (we hit
+    this on a first attempt with ``multihost_utils.process_allgather``).
+
+    ``arr.addressable_data(0)`` returns the local-device view of the
+    sharded array. With one GPU per process (which is the whole point of
+    this multiprocess launcher), there is exactly one addressable shard.
+    """
+    return np.asarray(jax.device_get(x.addressable_data(0)))
+
+
+# -----------------------------------------------------------------------------
+# Smoke shapes (identical to the single-process file's SMOKE_* constants).
+# -----------------------------------------------------------------------------
+
+SMOKE_BATCH = EP_SIZE * FSDP_SIZE  # 4 -- one micro-batch per device
+SMOKE_SEQ = 16
+SMOKE_HIDDEN = 32
+SMOKE_INTER = 64
+SMOKE_NUM_EXPERTS = 4
+SMOKE_TOPK = 2
+
+
+@pytest.mark.triton
+class TestMoeVjpMultiprocessSmoke:
+    """Level 2 smoke under the multiprocess launcher (one GPU/process).
+
+    Mirrors :class:`TestMoeVjpDistributedSmoke` from the
+    single-process file. Same assertions, same shapes -- only the
+    JAX bootstrap differs.
+    """
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_fwd_and_bwd_smoke(self, mesh, backend_name):
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        block = _make_block(
+            num_experts=SMOKE_NUM_EXPERTS,
+            num_experts_per_tok=SMOKE_TOPK,
+            intermediate_size=SMOKE_INTER,
+            permutation_backend=backend,
+        )
+        x = jax.random.normal(
+            jax.random.PRNGKey(0),
+            (SMOKE_BATCH, SMOKE_SEQ, SMOKE_HIDDEN),
+            dtype=jnp.bfloat16,
+        )
+        variables, output, aux = _init_apply(block, mesh, x, jax.random.PRNGKey(1))
+        # Local-shard checks (see _local_shard docstring for why).
+        out_local = _local_shard(output)
+        assert output.dtype == x.dtype
+        assert np.all(np.isfinite(out_local)), "output has NaN/Inf"
+        assert aux is None
+        grads = _grad_step(block, variables, mesh, x)
+        for name in ("gate_kernel", "wi_0", "wi_1", "wo"):
+            g_local = _local_shard(_unwrap(grads["params"][name]))
+            assert np.all(np.isfinite(g_local)), f"{name} grad has NaN/Inf"
+            assert np.any(g_local != 0.0), f"{name} grad is identically zero"
+
+    @pytest.mark.parametrize("backend_name", ["pure_jax", "triton"])
+    def test_aux_loss_smoke(self, mesh, backend_name):
+        backend = PermutationBackend(backend_name)  # noqa: F821
+        block = _make_block(
+            num_experts=SMOKE_NUM_EXPERTS,
+            num_experts_per_tok=SMOKE_TOPK,
+            intermediate_size=SMOKE_INTER,
+            permutation_backend=backend,
+            aux_loss_coeff=1e-2,
+        )
+        x = jax.random.normal(
+            jax.random.PRNGKey(4),
+            (SMOKE_BATCH, SMOKE_SEQ, SMOKE_HIDDEN),
+            dtype=jnp.bfloat16,
+        )
+        variables, output, aux = _init_apply(block, mesh, x, jax.random.PRNGKey(5))
+        out_local = _local_shard(output)
+        assert np.all(np.isfinite(out_local)), "output has NaN/Inf under aux"
+        assert aux is not None
+        assert aux.shape == ()
+        aux_local = _local_shard(aux)
+        assert np.isfinite(aux_local), "aux is NaN/Inf"
+        grads = _grad_step(block, variables, mesh, x)
+        g_gate_local = _local_shard(_unwrap(grads["params"]["gate_kernel"]))
+        assert np.all(np.isfinite(g_gate_local)), "gate grad NaN/Inf under aux"
+
+    def test_pure_jax_triton_parity(self, mesh):
+        block_pj = _make_block(
+            num_experts=SMOKE_NUM_EXPERTS,
+            num_experts_per_tok=SMOKE_TOPK,
+            intermediate_size=SMOKE_INTER,
+            permutation_backend=PermutationBackend.PURE_JAX,  # noqa: F821
+        )
+        block_tr = _make_block(
+            num_experts=SMOKE_NUM_EXPERTS,
+            num_experts_per_tok=SMOKE_TOPK,
+            intermediate_size=SMOKE_INTER,
+            permutation_backend=PermutationBackend.TRITON,  # noqa: F821
+        )
+        x = jax.random.normal(
+            jax.random.PRNGKey(6),
+            (SMOKE_BATCH, SMOKE_SEQ, SMOKE_HIDDEN),
+            dtype=jnp.bfloat16,
+        )
+        variables, out_pj, _ = _init_apply(block_pj, mesh, x, jax.random.PRNGKey(7))
+        with mesh, global_shard_guard(  # noqa: F821
+            MeshResource(ep_resource=EP_AXIS, fsdp_resource=FSDP_AXIS)  # noqa: F821
+        ), nn_partitioning.axis_rules(LOGICAL_AXIS_RULES):
+            x_sh = _shard_inputs(x, mesh)
+            out_tr, _ = jax.jit(block_tr.apply)(variables, x_sh)
+
+        out_pj_local = _local_shard(out_pj)
+        out_tr_local = _local_shard(out_tr)
+        diff = float(np.max(np.abs(out_pj_local - out_tr_local)))
+        assert diff < 5e-2, f"forward parity breach: max_abs_diff={diff}"
+
+        grads_pj = _grad_step(block_pj, variables, mesh, x)
+        grads_tr = _grad_step(block_tr, variables, mesh, x)
+        for name in ("gate_kernel", "wi_0", "wi_1", "wo"):
+            g_pj = _local_shard(_unwrap(grads_pj["params"][name]))
+            g_tr = _local_shard(_unwrap(grads_tr["params"][name]))
+            d = float(np.max(np.abs(g_pj - g_tr)))
+            assert d < 5e-2, f"grad parity breach on {name}: max_abs_diff={d}"
diff --git a/transformer_engine/common/triton/permutation.py b/transformer_engine/common/triton/permutation.py
index 75bb85f5ec..f562b5f131 100644
--- a/transformer_engine/common/triton/permutation.py
+++ b/transformer_engine/common/triton/permutation.py
@@ -4,6 +4,8 @@
 
 """Efficient Permutation kernels written with OpenAI Triton."""
 
+import os
+
 import triton
 import triton.language as tl
 
@@ -12,6 +14,45 @@
 from packaging import version
 
 
+_PERMUTATION_AUTOTUNE_BLOCK_SIZES = (64, 128, 256, 512, 1024, 2048, 4096)
+
+
+def _permutation_autotune_configs():
+    """Build the autotune ``configs`` list shared by every permutation
+    Triton kernel below.
+
+    Honours the ``NVTE_TRITON_PERMUTATION_BLOCK_SIZES`` environment
+    variable (comma-separated list of ints) so callers can shrink the
+    autotune space for tests / CI. With a single value the kernel
+    effectively skips autotuning entirely -- useful for the MoE VJP
+    smoke suite, which only cares about correctness and would otherwise
+    pay several minutes of cold-start MLIR->LLVM->PTX->cubin compile per
+    config-per-kernel. The default (unset) preserves the original
+    7-config sweep used in production.
+
+    Raises ``ValueError`` on a malformed env var so silent typos can't
+    accidentally pin everyone to a single suboptimal config in
+    production.
+    """
+    override = os.environ.get("NVTE_TRITON_PERMUTATION_BLOCK_SIZES")
+    if override:
+        try:
+            block_sizes = tuple(int(s) for s in override.split(",") if s.strip())
+        except ValueError as e:
+            raise ValueError(
+                "NVTE_TRITON_PERMUTATION_BLOCK_SIZES must be a comma-"
+                f"separated list of ints, got: {override!r}"
+            ) from e
+        if not block_sizes:
+            raise ValueError(
+                "NVTE_TRITON_PERMUTATION_BLOCK_SIZES is set but parsed empty;"
+                f" raw value was {override!r}"
+            )
+    else:
+        block_sizes = _PERMUTATION_AUTOTUNE_BLOCK_SIZES
+    return [triton.Config({"BLOCK_SIZE": bs}) for bs in block_sizes]
+
+
 # The following three argsort related kernels are adapted from
 # the issue https://github.com/triton-lang/triton/issues/3698
 
@@ -295,15 +336,7 @@ def _permute_kernel(
 
 try:
     _permute_kernel = triton.autotune(
-        configs=[
-            triton.Config({"BLOCK_SIZE": 64}),
-            triton.Config({"BLOCK_SIZE": 128}),
-            triton.Config({"BLOCK_SIZE": 256}),
-            triton.Config({"BLOCK_SIZE": 512}),
-            triton.Config({"BLOCK_SIZE": 1024}),
-            triton.Config({"BLOCK_SIZE": 2048}),
-            triton.Config({"BLOCK_SIZE": 4096}),
-        ],
+        configs=_permutation_autotune_configs(),
         key=["hidden_size"],
     )(_permute_kernel)
 except RuntimeError:
@@ -416,15 +449,7 @@ def _unpermute_kernel(
 
 try:
     _unpermute_kernel = triton.autotune(
-        configs=[
-            triton.Config({"BLOCK_SIZE": 64}),
-            triton.Config({"BLOCK_SIZE": 128}),
-            triton.Config({"BLOCK_SIZE": 256}),
-            triton.Config({"BLOCK_SIZE": 512}),
-            triton.Config({"BLOCK_SIZE": 1024}),
-            triton.Config({"BLOCK_SIZE": 2048}),
-            triton.Config({"BLOCK_SIZE": 4096}),
-        ],
+        configs=_permutation_autotune_configs(),
         key=["hidden_size"],
     )(_unpermute_kernel)
 except RuntimeError:
@@ -525,15 +550,7 @@ def _unpermute_bwd_with_merging_probs_kernel(
 
 try:
     _unpermute_bwd_with_merging_probs_kernel = triton.autotune(
-        configs=[
-            triton.Config({"BLOCK_SIZE": 64}),
-            triton.Config({"BLOCK_SIZE": 128}),
-            triton.Config({"BLOCK_SIZE": 256}),
-            triton.Config({"BLOCK_SIZE": 512}),
-            triton.Config({"BLOCK_SIZE": 1024}),
-            triton.Config({"BLOCK_SIZE": 2048}),
-            triton.Config({"BLOCK_SIZE": 4096}),
-        ],
+        configs=_permutation_autotune_configs(),
         key=["hidden_size"],
     )(_unpermute_bwd_with_merging_probs_kernel)
 except RuntimeError:
@@ -643,15 +660,7 @@ def _sort_chunks_by_map_kernel(
 
 try:
     _sort_chunks_by_map_kernel = triton.autotune(
-        configs=[
-            triton.Config({"BLOCK_SIZE": 64}),
-            triton.Config({"BLOCK_SIZE": 128}),
-            triton.Config({"BLOCK_SIZE": 256}),
-            triton.Config({"BLOCK_SIZE": 512}),
-            triton.Config({"BLOCK_SIZE": 1024}),
-            triton.Config({"BLOCK_SIZE": 2048}),
-            triton.Config({"BLOCK_SIZE": 4096}),
-        ],
+        configs=_permutation_autotune_configs(),
         key=["hidden_size"],
     )(_sort_chunks_by_map_kernel)
 except RuntimeError:
diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py
index 4ff6d07986..94b2de9573 100644
--- a/transformer_engine/jax/cpp_extensions/gemm.py
+++ b/transformer_engine/jax/cpp_extensions/gemm.py
@@ -2024,9 +2024,14 @@ def grouped_gemm_copy_group_sizes(
     return out
 
 
-@cache
 def _should_enforce_v2_grouped_gemm() -> bool:
-    """Read NVTE_JAX_ENFORCE_V2_GROUPED_GEMM once per process (cached)."""
+    """Read NVTE_JAX_ENFORCE_V2_GROUPED_GEMM.
+
+    Not cached so tests can flip the env var with ``monkeypatch.setenv``
+    and have it picked up on the next call. This is called only on
+    grouped-GEMM dispatch (not in any tight loop), so the per-call
+    ``getenv`` cost is negligible.
+    """
     val = os.getenv("NVTE_JAX_ENFORCE_V2_GROUPED_GEMM", "0")
     try:
         return bool(int(val))
diff --git a/transformer_engine/jax/flax/__init__.py b/transformer_engine/jax/flax/__init__.py
index 92a968f061..adf9c8911b 100644
--- a/transformer_engine/jax/flax/__init__.py
+++ b/transformer_engine/jax/flax/__init__.py
@@ -9,6 +9,7 @@
     make_dot_general_cls,
     make_grouped_dense_cls,
 )
+from .moe import _MoEBlock
 from .transformer import extend_logical_axis_rules
 from .transformer import DotProductAttention, MultiHeadAttention, RelativePositionBiases
 from .transformer import TransformerLayer, TransformerLayerType
@@ -18,6 +19,7 @@
     "LayerNorm",
     "LayerNormDenseGeneral",
     "LayerNormMLP",
+    "_MoEBlock",
     "wrap_function_in_te_state_module",
     "make_dot_general_cls",
     "make_grouped_dense_cls",
diff --git a/transformer_engine/jax/flax/moe.py b/transformer_engine/jax/flax/moe.py
new file mode 100644
index 0000000000..f02d6650a0
--- /dev/null
+++ b/transformer_engine/jax/flax/moe.py
@@ -0,0 +1,282 @@
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""Flax Linen MoE block for TransformerEngine JAX.
+
+This module exposes :class:`_MoEBlock`, an experimental Flax Linen layer
+that is a thin wrapper around the framework-agnostic functional MoE entry
+point :func:`transformer_engine.jax.moe.moe`. The wrapper's only job is
+to:
+
+1. Register the gate kernel, per-expert FFN kernels, and optional biases
+   as ``self.param`` slots (with the right
+   :func:`flax.linen.with_logical_partitioning` annotations so JAX's
+   sharding layer FSDPs the params correctly).
+2. Resolve the EP axis name from the active
+   :class:`transformer_engine.jax.sharding.MeshResource`.
+3. Forward all knobs to :func:`moe`.
+
+All routing, dispatch, FFN, combine, and aux-loss logic lives in
+``moe.py`` under a *single* ``jax.custom_vjp`` so future fusions
+(FP8-on-the-wire EP, fused ``ragged_all_to_all + grouped_gemm``, gate +
+route + dispatch fusion) can land without touching this wrapper.
+
+The class is intentionally underscore-prefixed; the public ``MoEBlock``
+alias will be introduced once TE's NCCL-backed EP component (and the
+recipe-driven alignment follow-up) stabilises (target: the TE release
+following the 2.16 code freeze).
+"""
+
+from typing import Any, Callable, NewType, Optional, Tuple, Union
+
+import jax
+import jax.numpy as jnp
+from flax import linen as nn
+from jax.sharding import PartitionSpec as P  # noqa: F401  (re-exported for convenience)
+
+from ..moe import PermutationBackend, moe
+from ..quantize import noop_quantizer_set
+from ..router import ScoreFunction
+from ..sharding import get_active_resource_axis
+from .module import TransformerEngineBase
+
+PRNGKey = Any
+Shape = Tuple[int, ...]
+DType = NewType("DType", jnp.dtype)
+Array = NewType("Array", jnp.ndarray)
+Initializer = Callable[[PRNGKey, Shape, DType], Array]
+
+
+__all__ = ["PermutationBackend", "_MoEBlock"]
+
+
+class _MoEBlock(TransformerEngineBase):
+    """Experimental Flax MoE layer over TransformerEngine.
+
+    See module docstring for the design (this class is a thin Flax
+    wrapper around :func:`transformer_engine.jax.moe.moe`). Constructor
+    knob set kept compatible with the previous bespoke implementation so
+    existing call sites need no changes.
+
+    Parameters
+    ----------
+    num_experts : int
+        Total number of experts. Under EP this must be divisible by the
+        EP mesh axis size.
+    num_experts_per_tok : int
+        Top-k value for routing.
+    intermediate_size : int
+        Hidden dim of the per-expert FFN (the inner ``mlp`` axis).
+    activation_type : str
+        Activation between ``layer_w0 @ wi_0`` and the elementwise
+        product with ``layer_w0 @ wi_1``. Default ``"silu"``.
+
+    score_function : Union[str, ScoreFunction]
+        ``"softmax"`` (default) or ``"sigmoid"`` for the routing scores.
+    use_pre_softmax : bool
+        Apply softmax before topk (vs. after).
+    num_groups, group_topk : Optional[int]
+        Grouped top-k knobs (DeepSeek-style). ``None`` disables grouping.
+    scaling_factor : float
+        Multiplier on the routing weights.
+    use_expert_bias : bool
+        If ``True``, registers a per-expert routing bias (shape ``[E]``).
+        Only meaningful with ``score_function="sigmoid"``; the underlying
+        primitive validates the pairing.
+    aux_loss_coeff : float
+        If ``> 0``, return the MoE auxiliary load-balancing loss scalar
+        in addition to the main output.
+
+    gate_kernel_axes, wi_kernel_axes, wo_kernel_axes, input_axes :
+        Logical sharding axis tuples (consumed by Flax's
+        :func:`with_logical_partitioning` and our internal
+        :func:`with_sharding_constraint_by_logical_axes`).
+    data_parallelism_axes : tuple[str, ...]
+        FSDP axes over which the input *batch* dim is sharded IN
+        ADDITION to the EP axis. Empty (default) means activations are
+        replicated across non-EP axes within an EP group; set e.g.
+        ``("fsdp",)`` for true FSDP-of-batch where each device owns a
+        unique slice of the batch.
+    permutation_backend : PermutationBackend
+        ``PURE_JAX`` (default) or ``TRITON``.
+    _align_size : int
+        Per-expert group-size alignment (``0`` disables; required > 0
+        for quantized grouped GEMM). Internal knob; will be inferred
+        from the active quantization recipe in a follow-up PR.
+
+    dtype : jnp.dtype
+        Compute / parameter dtype.
+    kernel_init, bias_init, expert_bias_init : Initializers.
+    use_bias : bool
+        Register per-expert FFN biases.
+
+    Quantization is currently configured via the standard TE autocast
+    context (``fp8_autocast``/``with_quantizer_set``); per-call
+    quantizer sets can also be passed through ``__call__``'s
+    ``quantizer_sets`` keyword once we stabilise the recipe pipeline.
+    """
+
+    # Architecture
+    num_experts: int = 8
+    num_experts_per_tok: int = 2
+    intermediate_size: int = 2048
+    activation_type: str = "silu"
+
+    # Routing
+    score_function: Union[str, ScoreFunction] = "softmax"
+    use_pre_softmax: bool = False
+    num_groups: Optional[int] = None
+    group_topk: Optional[int] = None
+    scaling_factor: float = 1.0
+    use_expert_bias: bool = False
+    aux_loss_coeff: float = 0.0
+
+    # Sharding (logical axes)
+    gate_kernel_axes: Tuple[Optional[str], ...] = ()
+    wi_kernel_axes: Tuple[Optional[str], ...] = ("exp", "embed", "mlp")
+    wo_kernel_axes: Tuple[Optional[str], ...] = ("exp", "mlp", "embed")
+    input_axes: Tuple[Optional[str], ...] = ()
+
+    # Parallelism
+    data_parallelism_axes: Tuple[str, ...] = ()
+
+    # Permutation
+    permutation_backend: PermutationBackend = PermutationBackend.PURE_JAX
+    _align_size: int = 0
+
+    # Dtypes / init / misc
+    dtype: DType = jnp.float32
+    kernel_init: Optional[Initializer] = None
+    bias_init: Initializer = nn.initializers.zeros
+    expert_bias_init: Initializer = nn.initializers.zeros
+    use_bias: bool = False
+
+    def __post_init__(self):
+        if self.kernel_init is None:
+            object.__setattr__(
+                self,
+                "kernel_init",
+                nn.initializers.variance_scaling(
+                    1.0, "fan_in", "truncated_normal", dtype=self.dtype
+                ),
+            )
+        if not isinstance(self.permutation_backend, PermutationBackend):
+            raise TypeError(
+                "permutation_backend must be a PermutationBackend, got"
+                f" {self.permutation_backend!r}"
+            )
+        super().__post_init__()
+
+    @nn.compact
+    def __call__(self, inputs: Array) -> Tuple[Array, Optional[Array]]:
+        """Run the MoE forward pass.
+
+        Parameters
+        ----------
+        inputs : jnp.ndarray
+            ``[batch, sequence, hidden]``.
+
+        Returns
+        -------
+        output : jnp.ndarray
+            ``[batch, sequence, hidden]``.
+        aux_loss : Optional[jnp.ndarray]
+            Scalar load-balancing loss when ``aux_loss_coeff > 0``,
+            else ``None``.
+        """
+        assert (
+            inputs.ndim == 3
+        ), f"_MoEBlock expects [batch, sequence, hidden] input, got shape {inputs.shape}"
+        _, _, hidden_size = inputs.shape
+
+        # Param registrations -- must run OUTSIDE any JAX transform that
+        # alters the variable scope (e.g. shard_map). The functional
+        # ``moe(...)`` opens its own shard_map internally for the EP
+        # path, so registering params here is correct.
+        gate_kernel = self.param(
+            "gate_kernel",
+            nn.with_logical_partitioning(self.kernel_init, self.gate_kernel_axes),
+            (hidden_size, self.num_experts),
+            self.dtype,
+        )
+        wi_0 = self.param(
+            "wi_0",
+            nn.with_logical_partitioning(self.kernel_init, self.wi_kernel_axes),
+            (self.num_experts, hidden_size, self.intermediate_size),
+            self.dtype,
+        )
+        wi_1 = self.param(
+            "wi_1",
+            nn.with_logical_partitioning(self.kernel_init, self.wi_kernel_axes),
+            (self.num_experts, hidden_size, self.intermediate_size),
+            self.dtype,
+        )
+        wo = self.param(
+            "wo",
+            nn.with_logical_partitioning(self.kernel_init, self.wo_kernel_axes),
+            (self.num_experts, self.intermediate_size, hidden_size),
+            self.dtype,
+        )
+        wi_0_bias = wi_1_bias = wo_bias = None
+        if self.use_bias:
+            wi_0_bias = self.param(
+                "wi_0_bias",
+                nn.with_logical_partitioning(self.bias_init, ("exp", "mlp")),
+                (self.num_experts, self.intermediate_size),
+                self.dtype,
+            )
+            wi_1_bias = self.param(
+                "wi_1_bias",
+                nn.with_logical_partitioning(self.bias_init, ("exp", "mlp")),
+                (self.num_experts, self.intermediate_size),
+                self.dtype,
+            )
+            wo_bias = self.param(
+                "wo_bias",
+                nn.with_logical_partitioning(self.bias_init, ("exp", "embed")),
+                (self.num_experts, hidden_size),
+                self.dtype,
+            )
+        expert_bias = None
+        if self.use_expert_bias:
+            expert_bias = self.param(
+                "expert_bias",
+                nn.with_logical_partitioning(self.expert_bias_init, ("exp",)),
+                (self.num_experts,),
+                self.dtype,
+            )
+
+        ep_axis = get_active_resource_axis("ep_resource")
+
+        return moe(
+            inputs,
+            gate_kernel,
+            wi_0,
+            wi_1,
+            wo,
+            wi_0_bias,
+            wi_1_bias,
+            wo_bias,
+            expert_bias,
+            num_experts=self.num_experts,
+            num_experts_per_tok=self.num_experts_per_tok,
+            activation_type=self.activation_type,
+            score_function=self.score_function,
+            use_pre_softmax=self.use_pre_softmax,
+            num_groups=self.num_groups,
+            group_topk=self.group_topk,
+            scaling_factor=self.scaling_factor,
+            aux_loss_coeff=self.aux_loss_coeff,
+            permutation_backend=self.permutation_backend,
+            align_size=self._align_size,
+            gate_inside_vjp=True,
+            ep_axis=ep_axis,
+            data_parallelism_axes=self.data_parallelism_axes,
+            input_axes=self.input_axes,
+            gate_kernel_axes=self.gate_kernel_axes,
+            wi_kernel_axes=self.wi_kernel_axes,
+            wo_kernel_axes=self.wo_kernel_axes,
+            quantizer_sets=(noop_quantizer_set, noop_quantizer_set, noop_quantizer_set),
+            dtype=self.dtype,
+        )
diff --git a/transformer_engine/jax/moe.py b/transformer_engine/jax/moe.py
new file mode 100644
index 0000000000..6bad1f2a0c
--- /dev/null
+++ b/transformer_engine/jax/moe.py
@@ -0,0 +1,2000 @@
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+"""Functional Mixture-of-Experts (MoE) entry point with a single fused VJP.
+
+This module exposes :func:`moe`, the framework-agnostic flat function that
+implements an entire MoE block (gate -> top-k routing -> token dispatch ->
+per-expert FFN -> token combine, plus optional expert parallelism via a
+shard_map / ragged_all_to_all collective) under a *single*
+``jax.custom_vjp``. It is the moral analog of
+:func:`transformer_engine.jax.layernorm_mlp.layernorm_mlp` for MoE: one
+custom_vjp boundary covers the whole block so future fusions (FP8 over the
+EP wire, fused ``ragged_all_to_all + grouped_gemm``, gate+route+dispatch
+fusion) can land without re-architecting the call site.
+
+Design rationale
+----------------
+
+The earlier MoE block (:class:`transformer_engine.jax.flax.moe._MoEBlock`)
+composed many narrower custom_vjps -- one per :func:`grouped_dense`, one
+per :func:`token_dispatch`, etc. Every nested custom_vjp is a place where
+a quantized :class:`ScaledTensor` cannot survive (JAX requires custom_vjp
+inputs / outputs to be plain ``jnp.ndarray`` ish pytrees). To enable
+end-to-end FP8 flow -- in particular FP8 carried over the EP
+ragged_all_to_all -- the dispatch's quantize, the a2a, the per-expert
+FFN, the inverse a2a, and the combine all have to live inside the same
+VJP. This file collapses them into one.
+
+Implementation conventions
+--------------------------
+
+* No nested ``custom_vjp``. Every primitive's ``_fwd`` and ``_bwd`` is
+  called directly (e.g. :func:`tex.fused_topk_with_score_function_fwd` /
+  ``_bwd``, :func:`unpermute_with_mask_map`,
+  :func:`unpermute_bwd_with_merging_probs`,
+  :func:`sort_chunks_by_map(is_forward=False)`,
+  forward + reverse :func:`jax.lax.ragged_all_to_all`) so the outer
+  ``_moe_bwd_rule`` controls the bwd graph end-to-end without invoking
+  ``jax.vjp`` for re-linearization.
+* The fwd/bwd context (``ctx``) is a plain ``dict`` whose keys depend on
+  the static configuration (permutation backend, EP active or not,
+  presence of biases, aux loss enabled). The ``_moe_fwd_rule`` builds a
+  matching ``ctx_specs`` dict in lockstep when opening the EP shard_map
+  so ``out_specs`` structurally matches the body's return.
+* :func:`_dispatch` is the helper that wraps
+  ``permute -> a2a -> local_permute`` (forward); :func:`_combine` is its
+  inverse. Their ``_bwd`` siblings drive the inverse collectives in the
+  bwd rule. None of these helpers form a custom_vjp boundary.
+"""
+
+from enum import Enum
+from functools import partial
+from typing import Any, Callable, NewType, Optional, Tuple, Union
+
+import jax
+import jax.numpy as jnp
+from jax.sharding import PartitionSpec as P
+
+from . import cpp_extensions as tex
+from .permutation import (
+    PureJaxPermState,
+    compute_ragged_all_to_all_params,
+    compute_reverse_ragged_all_to_all_params,
+    pure_jax_token_combine,
+    pure_jax_token_dispatch,
+    routing_map_to_selected_experts,
+)
+from .quantize import (
+    QuantizerSet,
+    ScaledTensor,
+    TensorUsage,
+    noop_quantizer_set,
+    with_sharding_constraint_by_logical_axes,
+)
+from .router import ScoreFunction, _validate_score_function
+from .sharding import _get_mesh
+from .triton_extensions.permutation import (
+    make_chunk_sort_map,
+    make_row_id_map,
+    permute_with_mask_map,
+    permute_with_mask_map_and_pad,
+    sort_chunks_by_map,
+    unpermute_bwd_with_merging_probs,
+    unpermute_bwd_with_merging_probs_and_unpad,
+    unpermute_with_mask_map,
+    unpermute_with_mask_map_and_unpad,
+)
+from .flax.module import _convert_to_activation_function
+
+PRNGKey = Any
+Shape = Tuple[int, ...]
+DType = NewType("DType", jnp.dtype)
+Array = NewType("Array", jnp.ndarray)
+
+
+__all__ = ["moe", "PermutationBackend"]
+
+
+# =============================================================================
+# Enums
+# =============================================================================
+
+
+class PermutationBackend(Enum):
+    """Token-dispatch / combine backend used by :func:`moe`.
+
+    * ``PURE_JAX``: ``jnp.argsort`` + gather paths compiled as plain XLA;
+      typically faster than ``TRITON`` in current testing because XLA can
+      fuse the ops with surrounding work.
+    * ``TRITON``: TE's fused Triton kernels.
+    """
+
+    PURE_JAX = "pure_jax"
+    TRITON = "triton"
+
+
+# =============================================================================
+# ctx / dispatch-state key conventions
+# =============================================================================
+#
+# Both ``ctx`` (carried fwd_rule -> bwd_rule) and the dispatch state
+# (carried _dispatch -> _combine / _dispatch_bwd / _combine_bwd) are plain
+# python dicts. Using a dict (rather than a flax_struct.dataclass) lets us
+# vary the populated keys with the static config without breaking
+# ``shard_map``'s ``out_specs`` structural match: the spec dict and the
+# value dict are built with the SAME keys via :func:`_build_ctx_specs`.
+#
+# Below is the key glossary so the rest of the file reads cleanly.
+#
+# DispatchState (dict): values are jnp.ndarray unless noted
+#   Always present:
+#     "group_sizes"             [n_groups]   per-expert token counts
+#                                            (n_groups = E for no-EP,
+#                                             E_local for EP)
+#     "ep_active"               bool         (carried as a Python flag,
+#                                             not in the dict; passed
+#                                             alongside)
+#   PURE_JAX backend:
+#     "sorted_indices"          [num_real + padding] argsort indices
+#     "routing_weights"         [num_tokens, topk]   per-token-per-expert weights
+#   TRITON backend:
+#     "row_id_map"              [num_tokens, 2*E + 1]
+#     "pad_offsets"             [E] or None
+#     "merging_probs"           [num_tokens, E]
+#   EP-only:
+#     "all_shards_tokens_per_expert" [num_ep, E]
+#     "local_perm_row_id_map"   [recv_buffer_rows]
+#     "local_perm_inv_row_id_map" [recv_buffer_rows]
+#
+# NOTE: per-shard compile-time-constant shapes (num_real_tokens,
+# padding_size, pre/post_a2a_buffer_shape) are NOT stored in this
+# dict; they are recomputed in _body_fwd/_body_bwd via
+# _compute_static_shape_info and passed as Python ints / int tuples to
+# the dispatch/combine helpers. Storing them in the dict would cause
+# JAX's pytree-flatten across the shard_map boundary to coerce them
+# into JitTracer 0-d arrays, which breaks Python-level control flow
+# (e.g. ``if padding > 0``) and ``jnp.zeros(shape)`` in the bwd.
+#
+# MoECtx (dict): values are jnp.ndarray / ScaledTensor unless noted
+#   Always present:
+#     "x"                       [B, S, H]
+#     "gate_kernel"             [H, E] (only meaningful when gate_inside_vjp=True)
+#     "logits_2d"               [T, E]   T = local-batch * S
+#     "saved_scores"            [T, E]   from fused_topk fwd primitive
+#     "routing_map"             [T, E]
+#     "dispatch"                DispatchState dict
+#     "casted_sorted_x_lhs_trans"   ScaledTensor or ndarray
+#     "casted_wi_0_rhs_trans"   ScaledTensor or ndarray
+#     "casted_wi_1_rhs_trans"   ScaledTensor or ndarray
+#     "layer_w0"                ndarray  (pre-activation)
+#     "layer_w1"                ndarray
+#     "casted_intermediate_lhs_trans" ScaledTensor or ndarray
+#     "casted_wo_rhs_trans"     ScaledTensor or ndarray
+#     "expert_outputs"          ndarray  (FFN output, needed for TRITON
+#                                         combine_bwd's
+#                                         unpermute_bwd_with_merging_probs)
+#     "local_group_sizes"       [n_groups] -- mirrors dispatch.group_sizes
+#                                              but kept here for FFN bwd
+#                                              convenience
+#   Optional:
+#     "expert_bias"             [E]   only when expert_bias was provided
+#     "wi_0_bias_shape"         tuple -- only when bias is used (carried
+#                                        non-diff via static side; here
+#                                        only if needed)
+#     "aux_const_buf"           ndarray  -- only when aux_loss_coeff > 0
+#     "aux_tokens_per_expert"   [E]      -- ditto
+#     "aux_logits_for_score"    [global_T, E] -- ditto, may be the
+#                                                gathered global logits
+#                                                or the local logits
+
+
+# =============================================================================
+# Static shape helper
+# =============================================================================
+#
+# A set of per-shard shape/size values that the dispatch and combine
+# helpers (both fwd and bwd) need. They're all derivable from existing
+# static args, so we recompute them in both ``_body_fwd`` and
+# ``_body_bwd`` and pass them as Python ints / int-tuples through
+# explicit kwargs. We MUST NOT stash them inside the dynamic
+# ``state`` / ``ctx`` dict: when the dict crosses the EP shard_map's
+# out_specs/in_specs boundary, JAX's pytree-flatten coerces any Python
+# int leaves into traced 0-d arrays, which then breaks dependent Python
+# code in the bwd (e.g. ``if padding > 0`` and ``jnp.zeros(shape)``).
+
+
+def _compute_static_shape_info(
+    *,
+    batch_size: int,
+    sequence_length: int,
+    hidden: int,
+    num_experts: int,
+    num_experts_per_tok: int,
+    align_size: int,
+    ep_active: bool,
+    num_ep: int = 1,
+    fsdp_sizes: Tuple[int, ...] = (),
+    recv_buffer_rows: int = 0,
+    batch_is_per_shard: bool = True,
+) -> dict:
+    """Compute per-shard compile-time-constant shape info used by both
+    dispatch/combine fwd and dispatch/combine bwd.
+
+    Returned dict has Python ints / int tuples (NOT jnp arrays) so the
+    caller can pass them as ordinary static keyword args. See the
+    module-level comment above for why this matters.
+
+    ``batch_is_per_shard`` controls whether ``batch_size`` is already
+    sharded (True -- e.g. when this is called from inside a shard_map
+    body, where ``x.shape[0]`` reports the per-shard batch size) or
+    global (False -- e.g. when computing from x.shape outside the
+    shard_map body).
+
+    Keys
+    ----
+    num_real_tokens : int
+        Per-shard count of real (non-padding) permuted tokens, i.e.
+        ``per_shard_num_tokens * num_experts_per_tok``.
+    padding_size : int
+        Per-shard number of alignment-padding tokens appended to the
+        sort buffer (``num_experts * (align_size - 1)`` when
+        ``align_size > 0``, else ``0``). Matches the convention used
+        by ``pure_jax_token_dispatch``.
+    pre_a2a_buffer_shape : tuple[int, int]
+        ``(num_real_tokens + padding_size, hidden)`` -- the per-shard
+        shape of the sorted-inputs buffer that is sent over the EP
+        ragged_all_to_all in the fwd direction.
+    post_a2a_buffer_shape : Optional[tuple[int, int]]
+        ``(recv_buffer_rows, hidden)`` when EP is active, ``None``
+        otherwise.
+    """
+    import math
+
+    if ep_active and not batch_is_per_shard:
+        dp_size = math.prod(fsdp_sizes) if fsdp_sizes else 1
+        per_shard_batch = batch_size // (num_ep * dp_size)
+    else:
+        per_shard_batch = batch_size
+    per_shard_num_tokens = per_shard_batch * sequence_length
+    num_real_tokens = per_shard_num_tokens * num_experts_per_tok
+    padding_size = num_experts * (align_size - 1) if align_size > 0 else 0
+    pre_a2a_buffer_shape = (num_real_tokens + padding_size, hidden)
+    post_a2a_buffer_shape = (recv_buffer_rows, hidden) if ep_active else None
+    return dict(
+        num_real_tokens=num_real_tokens,
+        padding_size=padding_size,
+        pre_a2a_buffer_shape=pre_a2a_buffer_shape,
+        post_a2a_buffer_shape=post_a2a_buffer_shape,
+    )
+
+
+# =============================================================================
+# Dispatch / combine helpers (no VJP boundary -- pure Python)
+# =============================================================================
+
+
+def _dispatch(
+    inputs_2d: jnp.ndarray,
+    sparse_probs: jnp.ndarray,
+    routing_map: jnp.ndarray,
+    *,
+    backend: PermutationBackend,
+    num_experts: int,
+    num_experts_per_tok: int,
+    align_size: int,
+    # EP-only:
+    ep_active: bool,
+    ep_axis: Optional[str],
+    num_ep: int,
+    recv_buffer_rows: int,
+    shard_id: Optional[jnp.ndarray] = None,
+) -> Tuple[jnp.ndarray, dict]:
+    """``permute -> (a2a -> local_permute) iff ep_active``.
+
+    Returns ``(sorted_x, state)`` where ``sorted_x`` has shape
+    ``[buffer_rows, hidden]`` -- ``E`` groups (no-EP) or ``E_local`` groups
+    (EP) -- and ``state`` is a dict carrying everything :func:`_combine`
+    and the bwd helpers need to reverse the operation.
+
+    Bypasses the ``custom_vjp``-wrapped public ``token_dispatch`` /
+    ``pure_jax_token_dispatch`` wrappers (well, mostly: PURE_JAX still
+    composes through ``pure_jax_token_dispatch`` because that helper has
+    no ``custom_vjp`` itself -- only its inner ``_sort_activations`` does,
+    which is fine since we never auto-diff through it from this layer).
+    For TRITON we call the underlying ``permute_with_mask_map`` /
+    ``permute_with_mask_map_and_pad`` primitives directly.
+    """
+    num_tokens, hidden = inputs_2d.shape
+    topk = num_experts_per_tok
+    state: dict = {}
+
+    # ------------------------------------------------------------------
+    # Step 1: global permute (every shard routes its own tokens over the
+    # full expert axis). Backend-specific.
+    # ------------------------------------------------------------------
+    if backend is PermutationBackend.PURE_JAX:
+        selected_experts, routing_weights = routing_map_to_selected_experts(
+            sparse_probs, routing_map, topk
+        )
+        sorted_inputs, perm_state, group_sizes = pure_jax_token_dispatch(
+            inputs_2d,
+            selected_experts,
+            num_experts=num_experts,
+            num_experts_per_tok=topk,
+            align_size=align_size,
+        )
+        # NOTE: ``perm_state.num_real_tokens`` and ``perm_state.padding_size``
+        # are compile-time Python ints; intentionally NOT stored in
+        # ``state`` (would be coerced to JitTracer 0-d arrays under
+        # the EP shard_map's pytree flatten). Recompute via
+        # ``_compute_static_shape_info`` in the bwd / EP-combine
+        # call sites that need them.
+        state["sorted_indices"] = perm_state.sorted_indices
+        state["routing_weights"] = routing_weights
+    else:
+        # TRITON backend -- inline the underlying primitive sequence
+        # (mirrors ``_token_dispatch_fwd_rule`` but exposes the residuals
+        # to our ctx instead of saving them inside another custom_vjp).
+        num_out_tokens = num_tokens * topk
+        row_id_map = make_row_id_map(routing_map, num_tokens, num_experts)
+        tokens_per_expert = jnp.sum(routing_map, axis=0).astype(jnp.int32)
+        if align_size > 0:
+            target_tokens_per_expert = (
+                jnp.ceil(tokens_per_expert / align_size) * align_size
+            ).astype(jnp.int32)
+            pad_lengths = target_tokens_per_expert - tokens_per_expert
+            cum_pad = jnp.cumsum(pad_lengths)
+            pad_offsets = jnp.concatenate([jnp.array([0], dtype=cum_pad.dtype), cum_pad[:-1]])
+            worst_case_out_tokens = (
+                (num_out_tokens + num_experts * (align_size - 1)) // align_size
+            ) * align_size
+            sorted_inputs, _ = permute_with_mask_map_and_pad(
+                inputs_2d,
+                row_id_map,
+                None,
+                pad_offsets,
+                num_tokens,
+                num_experts,
+                worst_case_out_tokens,
+                hidden,
+                align_size=align_size,
+            )
+            group_sizes = target_tokens_per_expert
+        else:
+            sorted_inputs, _ = permute_with_mask_map(
+                inputs_2d,
+                row_id_map,
+                None,
+                num_tokens,
+                num_experts,
+                num_out_tokens,
+                hidden,
+            )
+            pad_offsets = None
+            group_sizes = tokens_per_expert
+        state["row_id_map"] = row_id_map
+        state["pad_offsets"] = pad_offsets
+        state["merging_probs"] = sparse_probs
+
+    state["group_sizes"] = group_sizes
+
+    if not ep_active:
+        return sorted_inputs, state
+
+    # ------------------------------------------------------------------
+    # Step 2 (EP only): all_gather per-expert counts so every shard knows
+    # the [num_ep, num_experts] token-count matrix.
+    # ------------------------------------------------------------------
+    all_shards_tokens_per_expert = jax.lax.all_gather(
+        group_sizes[None, :],
+        axis_name=ep_axis,
+        axis=0,
+        tiled=True,
+    )
+
+    # ------------------------------------------------------------------
+    # Step 3 (EP only): forward ragged_all_to_all over the EP axis.
+    # ------------------------------------------------------------------
+    in_off, send_sz, out_off, recv_sz = compute_ragged_all_to_all_params(
+        all_shards_tokens_per_expert, shard_id, num_ep
+    )
+    pre_a2a_buffer_shape = sorted_inputs.shape
+    post_a2a_buffer_shape = (recv_buffer_rows, hidden)
+    recv_buf = jnp.zeros(post_a2a_buffer_shape, dtype=sorted_inputs.dtype)
+    x_recv = jax.lax.ragged_all_to_all(
+        sorted_inputs, recv_buf, in_off, send_sz, out_off, recv_sz, axis_name=ep_axis
+    )
+
+    # ------------------------------------------------------------------
+    # Step 4 (EP only): local permute -- (source_shard, expert) ->
+    # (expert, shard). Inlined ``local_permute_after_a2a`` so we control
+    # both the row_id_map and its inverse for the bwd.
+    # ------------------------------------------------------------------
+    num_experts_local = num_experts // num_ep
+    local_expert_start = shard_id * num_experts_local
+    local_expert_columns = jax.lax.dynamic_slice(
+        all_shards_tokens_per_expert,
+        start_indices=(0, local_expert_start),
+        slice_sizes=(num_ep, num_experts_local),
+    )
+    split_sizes = local_expert_columns.reshape(-1)  # source-major
+    indices_matrix = jnp.arange(num_ep * num_experts_local, dtype=jnp.int32).reshape(
+        num_ep, num_experts_local
+    )
+    sorted_chunk_indices = indices_matrix.T.reshape(-1)  # source-major -> expert-major
+    num_chunks = num_ep * num_experts_local
+    # Build a SINGLE row_id_map. ``is_forward=True`` permutes
+    # source-major -> expert-major; ``is_forward=False`` is the exact
+    # inverse (this is exactly what ``_sort_chunks_by_index_bwd_rule``
+    # uses on the saved residual). _MoEBlock builds two row_id_maps
+    # only because it calls ``sort_chunks_by_index`` twice -- once in
+    # ``local_permute_after_a2a`` and again in ``local_unpermute_before_a2a``;
+    # each of those wrappers calls ``make_chunk_sort_map`` internally.
+    # Here we share one map across (fwd permute, fwd inverse-permute,
+    # bwd permute, bwd inverse-permute).
+    local_perm_row_id_map = make_chunk_sort_map(
+        split_sizes, sorted_chunk_indices, recv_buffer_rows, num_chunks
+    )
+    sorted_x, _ = sort_chunks_by_map(
+        x_recv, local_perm_row_id_map, None, recv_buffer_rows, hidden, is_forward=True
+    )
+    local_group_sizes = jnp.sum(local_expert_columns, axis=0)
+
+    state["all_shards_tokens_per_expert"] = all_shards_tokens_per_expert
+    state["local_perm_row_id_map"] = local_perm_row_id_map
+    # NOTE: pre_a2a_buffer_shape and post_a2a_buffer_shape are compile-
+    # time int tuples; intentionally NOT stored in ``state`` (would be
+    # coerced to JitTracer 0-d arrays under the EP shard_map's pytree
+    # flatten). Recompute via ``_compute_static_shape_info`` in the
+    # bwd call sites that need them.
+    # For EP, we override ``group_sizes`` to be the per-local-expert
+    # counts (the FFN runs over E_local groups, not E). The original
+    # global ``group_sizes`` lives inside ``all_shards_tokens_per_expert``
+    # if anyone needs it for diagnostics.
+    state["group_sizes"] = local_group_sizes
+
+    return sorted_x, state
+
+
+def _combine(
+    expert_outputs: jnp.ndarray,
+    state: dict,
+    *,
+    backend: PermutationBackend,
+    ep_active: bool,
+    batch_size: int,
+    sequence_length: int,
+    dtype: jnp.dtype,
+    num_experts_per_tok: int,
+    # Per-shard compile-time-constant shape info (Python ints / int tuples).
+    # Computed by _compute_static_shape_info in the caller, passed here
+    # rather than stored in ``state`` to survive shard_map crossings.
+    num_real_tokens: int,
+    padding_size: int,
+    pre_a2a_buffer_shape: Tuple[int, int],
+    # EP-only:
+    ep_axis: Optional[str],
+    shard_id: Optional[jnp.ndarray] = None,
+    num_ep: int = 1,
+) -> jnp.ndarray:
+    """Inverse of :func:`_dispatch`. Returns ``[B, S, H]``."""
+    if ep_active:
+        # Step 1 (EP): inverse local permute. Reuse the SAME row_id_map
+        # built in _dispatch by setting is_forward=False (this is the
+        # exact inverse, identical to what
+        # ``_sort_chunks_by_index_bwd_rule`` does with the saved residual).
+        recv_buffer_rows, hidden = expert_outputs.shape
+        x_send_back, _ = sort_chunks_by_map(
+            expert_outputs,
+            state["local_perm_row_id_map"],
+            None,
+            recv_buffer_rows,
+            hidden,
+            is_forward=False,
+        )
+        # Step 2 (EP): reverse ragged_all_to_all.
+        in_off_r, send_sz_r, out_off_r, recv_sz_r = compute_reverse_ragged_all_to_all_params(
+            state["all_shards_tokens_per_expert"], shard_id, num_ep
+        )
+        send_back_buf = jnp.zeros(pre_a2a_buffer_shape, dtype=expert_outputs.dtype)
+        expert_outputs = jax.lax.ragged_all_to_all(
+            x_send_back,
+            send_back_buf,
+            in_off_r,
+            send_sz_r,
+            out_off_r,
+            recv_sz_r,
+            axis_name=ep_axis,
+        )
+
+    # Step 3: global combine.
+    if backend is PermutationBackend.PURE_JAX:
+        # Reuse the reference pure-jax implementation; it has no
+        # custom_vjp on its outer surface so we can call it freely.
+        perm_state = PureJaxPermState(
+            sorted_indices=state["sorted_indices"],
+            num_real_tokens=num_real_tokens,
+            padding_size=padding_size,
+        )
+        return pure_jax_token_combine(
+            expert_outputs,
+            perm_state,
+            state["routing_weights"],
+            num_experts_per_tok=num_experts_per_tok,
+            batch_size=batch_size,
+            sequence_length=sequence_length,
+        )
+    # TRITON
+    num_tokens = state["row_id_map"].shape[0]
+    num_experts = (state["row_id_map"].shape[1] - 1) // 2
+    hidden = expert_outputs.shape[-1]
+    if state["pad_offsets"] is not None:
+        out_2d, _ = unpermute_with_mask_map_and_unpad(
+            expert_outputs,
+            state["row_id_map"],
+            state["merging_probs"],
+            None,
+            state["pad_offsets"],
+            num_tokens,
+            num_experts,
+            hidden,
+        )
+    else:
+        out_2d, _ = unpermute_with_mask_map(
+            expert_outputs,
+            state["row_id_map"],
+            state["merging_probs"],
+            None,
+            num_tokens,
+            num_experts,
+            hidden,
+        )
+    return out_2d.reshape(batch_size, sequence_length, hidden).astype(dtype)
+
+
+def _combine_bwd(
+    d_output: jnp.ndarray,
+    state: dict,
+    expert_outputs: jnp.ndarray,
+    *,
+    backend: PermutationBackend,
+    ep_active: bool,
+    batch_size: int,
+    sequence_length: int,
+    dtype: jnp.dtype,
+    num_experts: int,
+    num_experts_per_tok: int,
+    # Per-shard compile-time-constant shape info (Python ints / int tuples).
+    # See ``_compute_static_shape_info`` and the note in ``_dispatch``
+    # for why these are kwargs rather than state-dict entries.
+    num_real_tokens: int,
+    padding_size: int,
+    post_a2a_buffer_shape: Optional[Tuple[int, int]],
+    # EP-only:
+    ep_axis: Optional[str],
+    shard_id: Optional[jnp.ndarray] = None,
+    num_ep: int = 1,
+) -> Tuple[jnp.ndarray, Optional[jnp.ndarray]]:
+    """Inverse of :func:`_combine` on the cotangent.
+
+    Returns ``(d_expert_outputs, d_routing_weights_or_merging_probs)``.
+
+    ``expert_outputs`` is the *forward* output of the FFN (same value the
+    fwd handed to :func:`_combine`). It's required by the TRITON
+    combine_bwd kernel; for PURE_JAX we don't need it but accept it for
+    a symmetric signature.
+    """
+    # Step 3 inverse: global combine bwd.
+    d_output_2d = d_output.reshape(-1, d_output.shape[-1])
+    if backend is PermutationBackend.PURE_JAX:
+        # The pure-jax combine is:
+        #   unsort = _sort_activations(expert_outputs, argsort(sorted_indices))
+        #   if pad: unsort = unsort[:num_real]
+        #   reshape -> einsum BKE,BK -> BE -> reshape to BSE
+        # Hand-derive the bwd in plain JAX (no custom_vjp involved):
+        unsort_indices = jnp.argsort(state["sorted_indices"])
+        topk = num_experts_per_tok
+        num_real = num_real_tokens
+        padding = padding_size
+        # Recover the unsorted intermediate that the fwd produced (we
+        # need it for the d_routing_weights pullback). Apply the same
+        # gather the fwd did.
+        unsort_intermediate = expert_outputs[unsort_indices]
+        if padding > 0:
+            unsort_intermediate = unsort_intermediate[:num_real]
+        # Bwd of einsum/reshape:
+        # output[B, E] = sum_K intermediate[B, K, E] * weights[B, K]
+        # d_intermediate[B, K, E] = d_output[B, E] * weights[B, K]
+        # d_weights[B, K]         = sum_E d_output[B, E] * intermediate[B, K, E]
+        rw = state["routing_weights"].reshape(-1, topk)
+        intermediate_3d = unsort_intermediate.reshape(rw.shape[0], topk, -1)
+        rw_cast = rw.astype(intermediate_3d.dtype)
+        d_intermediate_3d = jnp.einsum("BE,BK -> BKE", d_output_2d, rw_cast)
+        d_routing_weights = jnp.einsum("BE,BKE -> BK", d_output_2d, intermediate_3d).astype(
+            state["routing_weights"].dtype
+        )
+        d_routing_weights = d_routing_weights.reshape(state["routing_weights"].shape)
+        d_unsort_intermediate = d_intermediate_3d.reshape(num_real, -1)
+        # Pad back with zeros if the fwd stripped padding.
+        if padding > 0:
+            d_unsort_intermediate = jnp.concatenate(
+                [
+                    d_unsort_intermediate,
+                    jnp.zeros(
+                        (padding, d_unsort_intermediate.shape[-1]),
+                        dtype=d_unsort_intermediate.dtype,
+                    ),
+                ],
+                axis=0,
+            )
+        # Bwd of the gather is gather-by-original-indices:
+        #   sorted = unsort[argsort(sorted_indices)]
+        #   d_sorted = scatter d_unsort via argsort(sorted_indices)
+        #            = d_unsort[sorted_indices]  (gather by original sorted_indices,
+        #              which is the inverse of argsort(sorted_indices)).
+        d_expert_outputs_global = d_unsort_intermediate[state["sorted_indices"]]
+    else:
+        # TRITON combine bwd: requires fwd_input (expert_outputs).
+        num_tokens = state["row_id_map"].shape[0]
+        n_experts = (state["row_id_map"].shape[1] - 1) // 2
+        hidden = d_output_2d.shape[-1]
+        num_out_tokens = expert_outputs.shape[0]
+        if state["pad_offsets"] is not None:
+            d_expert_outputs_global, d_merging_probs = unpermute_bwd_with_merging_probs_and_unpad(
+                d_output_2d,
+                state["row_id_map"],
+                expert_outputs,
+                state["merging_probs"],
+                state["pad_offsets"],
+                num_tokens,
+                n_experts,
+                num_out_tokens,
+                hidden,
+            )
+            # The kernel only writes positions tokens map to; padded
+            # positions may contain NaN. Replace with zeros (matches
+            # ``_token_combine_bwd_rule``).
+            d_expert_outputs_global = jnp.where(
+                jnp.isnan(d_expert_outputs_global), 0.0, d_expert_outputs_global
+            )
+        else:
+            d_expert_outputs_global, d_merging_probs = unpermute_bwd_with_merging_probs(
+                d_output_2d,
+                state["row_id_map"],
+                expert_outputs,
+                state["merging_probs"],
+                num_tokens,
+                n_experts,
+                num_out_tokens,
+                hidden,
+            )
+        d_routing_weights = d_merging_probs
+
+    if not ep_active:
+        return d_expert_outputs_global, d_routing_weights
+
+    # Step 2 (EP) inverse: bwd of reverse ragged_all_to_all is a forward
+    # ragged_all_to_all using the SAME forward parameters (sender /
+    # receiver roles swap from the reverse direction back to forward).
+    in_off_f, send_sz_f, out_off_f, recv_sz_f = compute_ragged_all_to_all_params(
+        state["all_shards_tokens_per_expert"], shard_id, num_ep
+    )
+    recv_buf_for_bwd = jnp.zeros(post_a2a_buffer_shape, dtype=d_expert_outputs_global.dtype)
+    d_x_send_back = jax.lax.ragged_all_to_all(
+        d_expert_outputs_global,
+        recv_buf_for_bwd,
+        in_off_f,
+        send_sz_f,
+        out_off_f,
+        recv_sz_f,
+        axis_name=ep_axis,
+    )
+    # Step 1 (EP) inverse: combine fwd applied is_forward=False; the
+    # bwd is is_forward=True with the SAME row_id_map.
+    recv_buffer_rows, hidden = d_x_send_back.shape
+    d_expert_outputs, _ = sort_chunks_by_map(
+        d_x_send_back,
+        state["local_perm_row_id_map"],
+        None,
+        recv_buffer_rows,
+        hidden,
+        is_forward=True,
+    )
+    return d_expert_outputs, d_routing_weights
+
+
+def _dispatch_bwd(
+    d_sorted_x: jnp.ndarray,
+    state: dict,
+    inputs_2d_shape: Tuple[int, ...],
+    *,
+    backend: PermutationBackend,
+    ep_active: bool,
+    num_experts: int,
+    num_experts_per_tok: int,
+    # Per-shard compile-time-constant shape info (Python ints / int tuples).
+    # See ``_compute_static_shape_info`` and the note in ``_dispatch``
+    # for why these are kwargs rather than state-dict entries.
+    num_real_tokens: int,
+    padding_size: int,
+    pre_a2a_buffer_shape: Tuple[int, int],
+    # EP-only:
+    ep_axis: Optional[str],
+    shard_id: Optional[jnp.ndarray] = None,
+    num_ep: int = 1,
+) -> jnp.ndarray:
+    """Inverse of :func:`_dispatch` on the cotangent. Returns ``d_inputs_2d``.
+
+    The probs path through dispatch is always discarded (PURE_JAX never
+    threads probs through dispatch; TRITON technically does but the
+    caller drops ``permuted_probs``, so its cotangent is structurally
+    zero). The probs gradient instead flows back through
+    :func:`_combine_bwd`.
+    """
+    if ep_active:
+        # Step 4 inverse: dispatch fwd applied is_forward=True; bwd is
+        # is_forward=False with the SAME row_id_map.
+        recv_buffer_rows, hidden = d_sorted_x.shape
+        d_x_recv, _ = sort_chunks_by_map(
+            d_sorted_x,
+            state["local_perm_row_id_map"],
+            None,
+            recv_buffer_rows,
+            hidden,
+            is_forward=False,
+        )
+        # Step 3 inverse: bwd of forward ragged_a2a is the reverse-direction
+        # ragged_a2a using the SAME params with sender/receiver swapped.
+        in_off_r, send_sz_r, out_off_r, recv_sz_r = compute_reverse_ragged_all_to_all_params(
+            state["all_shards_tokens_per_expert"], shard_id, num_ep
+        )
+        recv_buf_pre = jnp.zeros(pre_a2a_buffer_shape, dtype=d_x_recv.dtype)
+        d_sorted_x = jax.lax.ragged_all_to_all(
+            d_x_recv,
+            recv_buf_pre,
+            in_off_r,
+            send_sz_r,
+            out_off_r,
+            recv_sz_r,
+            axis_name=ep_axis,
+        )
+
+    # Step 1 inverse: global permute bwd.
+    if backend is PermutationBackend.PURE_JAX:
+        # Fwd was: replicated = repeat(inputs_2d, topk, axis=0)
+        #          padded = pad(replicated, (0, padding_size))
+        #          sorted = padded[sorted_indices]
+        # Bwd:     d_padded = scatter via sorted_indices
+        #                   = d_sorted[argsort(sorted_indices)]
+        #          d_replicated = d_padded[:num_real]
+        #          d_inputs_2d  = d_replicated.reshape(T, topk, H).sum(axis=1)
+        sorted_indices = state["sorted_indices"]
+        num_real = num_real_tokens
+        padding = padding_size
+        topk = num_experts_per_tok
+        unsort_indices = jnp.argsort(sorted_indices)
+        d_padded = d_sorted_x[unsort_indices]
+        if padding > 0:
+            d_replicated = d_padded[:num_real]
+        else:
+            d_replicated = d_padded
+        num_tokens = inputs_2d_shape[0]
+        hidden = inputs_2d_shape[-1]
+        d_inputs_2d = d_replicated.reshape(num_tokens, topk, hidden).sum(axis=1)
+        return d_inputs_2d
+
+    # TRITON: bwd is unpermute_with_mask_map[_and_unpad].
+    num_tokens = inputs_2d_shape[0]
+    hidden = inputs_2d_shape[-1]
+    if state["pad_offsets"] is not None:
+        d_inputs_2d, _ = unpermute_with_mask_map_and_unpad(
+            d_sorted_x,
+            state["row_id_map"],
+            None,
+            None,
+            state["pad_offsets"],
+            num_tokens,
+            num_experts,
+            hidden,
+        )
+    else:
+        d_inputs_2d, _ = unpermute_with_mask_map(
+            d_sorted_x,
+            state["row_id_map"],
+            None,
+            None,
+            num_tokens,
+            num_experts,
+            hidden,
+        )
+    return d_inputs_2d
+
+
+# =============================================================================
+# Per-shard body
+# =============================================================================
+
+
+def _body_fwd(
+    captured: dict,
+    *,
+    # Statics
+    num_experts: int,
+    num_experts_per_tok: int,
+    activation_type: str,
+    score_function: ScoreFunction,
+    use_pre_softmax: bool,
+    num_groups: Optional[int],
+    group_topk: Optional[int],
+    scaling_factor: float,
+    aux_loss_coeff: float,
+    permutation_backend: PermutationBackend,
+    align_size: int,
+    gate_inside_vjp: bool,
+    quantizer_sets: Tuple[QuantizerSet, QuantizerSet, QuantizerSet],
+    dtype: jnp.dtype,
+    # EP-only statics
+    ep_active: bool,
+    ep_axis: Optional[str],
+    data_parallelism_axes: Tuple[str, ...],
+    fsdp_sizes: Tuple[int, ...],
+    num_ep: int,
+    num_experts_local: int,
+    recv_buffer_rows: int,
+) -> Tuple[jnp.ndarray, jnp.ndarray, dict]:
+    """Per-shard forward body. Returns ``(output, aux_loss, ctx_dict)``.
+
+    ``aux_loss`` is always materialized (zeros scalar when disabled) so
+    the ``shard_map``'s ``out_specs`` has a static structure.
+    """
+    if not gate_inside_vjp:
+        raise NotImplementedError(
+            "gate_inside_vjp=False is deferred to a follow-up PR; for now"
+            " the gate GEMM lives inside the MoE VJP."
+        )
+
+    x = captured["inputs"]
+    gate_kernel = captured["gate_kernel"]
+    wi_0 = captured["wi_0"]
+    wi_1 = captured["wi_1"]
+    wo = captured["wo"]
+    wi_0_bias = captured.get("wi_0_bias")
+    wi_1_bias = captured.get("wi_1_bias")
+    wo_bias = captured.get("wo_bias")
+    expert_bias = captured.get("expert_bias")
+
+    batch_size, sequence_length, hidden = x.shape
+
+    # ---------------- Stage 1: gate ----------------
+    gate_kernel_cast = gate_kernel.astype(x.dtype)
+    gate_logits = jnp.einsum("bsh,he->bse", x, gate_kernel_cast)
+    logits_2d = gate_logits.reshape(-1, num_experts)
+    inputs_2d = x.reshape(-1, hidden)
+
+    # ---------------- Stage 2: routing ----------------
+    # Under EP, expert_bias is sharded P(ep_axis); the router needs the
+    # full E-dim view, so all_gather it.
+    if ep_active and expert_bias is not None:
+        full_expert_bias = jax.lax.all_gather(expert_bias, axis_name=ep_axis, tiled=True)
+    else:
+        full_expert_bias = expert_bias
+    # Pass an empty array sentinel when expert_bias is unused (the
+    # underlying primitive expects a real ndarray, not None).
+    eb_arg = (
+        full_expert_bias if full_expert_bias is not None else jnp.zeros((0,), dtype=jnp.float32)
+    )
+    sparse_probs, routing_map, saved_scores = tex.fused_topk_with_score_function_fwd(
+        logits_2d,
+        topk=num_experts_per_tok,
+        use_pre_softmax=use_pre_softmax,
+        num_groups=-1 if num_groups is None else num_groups,
+        group_topk=-1 if group_topk is None else group_topk,
+        scaling_factor=scaling_factor,
+        score_function=score_function,
+        expert_bias=eb_arg,
+        compute_aux_scores=False,
+    )
+    sparse_probs = sparse_probs.astype(dtype)
+
+    # ---------------- Stage 2b: aux loss ----------------
+    if aux_loss_coeff > 0.0:
+        if ep_active:
+            collective_axes: Any = (
+                ep_axis if not data_parallelism_axes else (ep_axis, *data_parallelism_axes)
+            )
+            global_logits_2d = jax.lax.all_gather(
+                logits_2d, axis_name=collective_axes, axis=0, tiled=True
+            )
+            _, global_routing_map, _ = tex.fused_topk_with_score_function_fwd(
+                global_logits_2d,
+                topk=num_experts_per_tok,
+                use_pre_softmax=use_pre_softmax,
+                num_groups=-1 if num_groups is None else num_groups,
+                group_topk=-1 if group_topk is None else group_topk,
+                scaling_factor=scaling_factor,
+                score_function=score_function,
+                expert_bias=eb_arg,
+                compute_aux_scores=False,
+            )
+            aux_tokens_per_expert = jnp.sum(global_routing_map.astype(jnp.int32), axis=0)
+            aux_logits_for_score = global_logits_2d
+        else:
+            aux_tokens_per_expert = jnp.sum(routing_map.astype(jnp.int32), axis=0)
+            aux_logits_for_score = logits_2d
+        # Aux-side scores: clean per-expert scores (no grouped routing,
+        # no bias). compute_aux_scores=True takes a separate path that
+        # ignores the grouping knobs.
+        aux_probs, _aux_routing_map, aux_saved_scores = tex.fused_topk_with_score_function_fwd(
+            aux_logits_for_score.astype(jnp.float32),
+            topk=num_experts_per_tok,
+            use_pre_softmax=False,
+            num_groups=-1,
+            group_topk=-1,
+            scaling_factor=1.0,
+            score_function=score_function,
+            expert_bias=jnp.zeros((0,), dtype=jnp.float32),
+            compute_aux_scores=True,
+        )
+        aux_loss, aux_const_buf = tex.fused_moe_aux_loss_fwd(
+            aux_probs.astype(jnp.float32),
+            aux_tokens_per_expert.astype(jnp.int32),
+            topk=num_experts_per_tok,
+            coeff=aux_loss_coeff,
+        )
+    else:
+        aux_loss = jnp.zeros((), dtype=dtype)
+        aux_const_buf = None
+        aux_tokens_per_expert = None
+        aux_logits_for_score = None
+        aux_saved_scores = None
+
+    # ---------------- Stage 3: dispatch ----------------
+    shard_id = jax.lax.axis_index(ep_axis) if ep_active else None
+    sorted_x, dispatch_state = _dispatch(
+        inputs_2d,
+        sparse_probs,
+        routing_map,
+        backend=permutation_backend,
+        num_experts=num_experts,
+        num_experts_per_tok=num_experts_per_tok,
+        align_size=align_size,
+        ep_active=ep_active,
+        ep_axis=ep_axis,
+        num_ep=num_ep,
+        recv_buffer_rows=recv_buffer_rows,
+        shard_id=shard_id,
+    )
+    local_group_sizes = dispatch_state["group_sizes"]
+
+    # ---------------- Stage 4: per-expert FFN (inlined) ----------------
+    q_set_w0, q_set_w1, q_set_wo = quantizer_sets
+    if q_set_w0 == noop_quantizer_set:
+        wi_0 = wi_0.astype(sorted_x.dtype)
+    if q_set_w1 == noop_quantizer_set:
+        wi_1 = wi_1.astype(sorted_x.dtype)
+    if q_set_wo == noop_quantizer_set:
+        wo = wo.astype(sorted_x.dtype)
+
+    # GEMM 1: layer_w0 = sorted_x @ wi_0
+    casted_sorted_x_w0 = tex.grouped_quantize(
+        sorted_x, q_set_w0.x, local_group_sizes, flatten_axis=-1
+    )
+    casted_wi_0 = tex.grouped_quantize(wi_0, q_set_w0.kernel, flatten_axis=-1)
+    layer_w0 = tex.grouped_gemm(
+        casted_sorted_x_w0.get_tensor(usage=TensorUsage.LHS),
+        casted_wi_0.get_tensor(usage=TensorUsage.RHS),
+        contracting_dims=((1,), (1,)),
+        bias=wi_0_bias,
+    )
+    casted_sorted_x_lhs_trans = casted_sorted_x_w0.get_tensor(usage=TensorUsage.LHS_TRANS)
+    casted_wi_0_rhs_trans = casted_wi_0.get_tensor(usage=TensorUsage.RHS_TRANS)
+    if isinstance(casted_sorted_x_lhs_trans, ScaledTensor):
+        casted_sorted_x_lhs_trans = casted_sorted_x_lhs_trans.checkpoint(q_set_w0.x)
+    if isinstance(casted_wi_0_rhs_trans, ScaledTensor):
+        casted_wi_0_rhs_trans = casted_wi_0_rhs_trans.checkpoint(q_set_w0.kernel)
+
+    # GEMM 2: layer_w1 = sorted_x @ wi_1
+    casted_sorted_x_w1 = tex.grouped_quantize(
+        sorted_x, q_set_w1.x, local_group_sizes, flatten_axis=-1
+    )
+    casted_wi_1 = tex.grouped_quantize(wi_1, q_set_w1.kernel, flatten_axis=-1)
+    layer_w1 = tex.grouped_gemm(
+        casted_sorted_x_w1.get_tensor(usage=TensorUsage.LHS),
+        casted_wi_1.get_tensor(usage=TensorUsage.RHS),
+        contracting_dims=((1,), (1,)),
+        bias=wi_1_bias,
+    )
+    casted_wi_1_rhs_trans = casted_wi_1.get_tensor(usage=TensorUsage.RHS_TRANS)
+    if isinstance(casted_wi_1_rhs_trans, ScaledTensor):
+        casted_wi_1_rhs_trans = casted_wi_1_rhs_trans.checkpoint(q_set_w1.kernel)
+
+    # Activation: intermediate = act(layer_w0) * layer_w1
+    act_fn = _convert_to_activation_function(activation_type)
+    intermediate = act_fn(layer_w0) * layer_w1
+
+    # GEMM 3: expert_outputs = intermediate @ wo
+    casted_intermediate = tex.grouped_quantize(
+        intermediate, q_set_wo.x, local_group_sizes, flatten_axis=-1
+    )
+    casted_wo = tex.grouped_quantize(wo, q_set_wo.kernel, flatten_axis=-1)
+    expert_outputs = tex.grouped_gemm(
+        casted_intermediate.get_tensor(usage=TensorUsage.LHS),
+        casted_wo.get_tensor(usage=TensorUsage.RHS),
+        contracting_dims=((1,), (1,)),
+        bias=wo_bias,
+    )
+    casted_intermediate_lhs_trans = casted_intermediate.get_tensor(usage=TensorUsage.LHS_TRANS)
+    casted_wo_rhs_trans = casted_wo.get_tensor(usage=TensorUsage.RHS_TRANS)
+    if isinstance(casted_intermediate_lhs_trans, ScaledTensor):
+        casted_intermediate_lhs_trans = casted_intermediate_lhs_trans.checkpoint(q_set_wo.x)
+    if isinstance(casted_wo_rhs_trans, ScaledTensor):
+        casted_wo_rhs_trans = casted_wo_rhs_trans.checkpoint(q_set_wo.kernel)
+
+    # ---------------- Stage 5: combine ----------------
+    # Compute per-shard static shape info once and pass through both
+    # _combine and (later) the bwd helpers via kwargs -- never via the
+    # state dict, which gets pytree-flattened across shard_map and would
+    # coerce Python ints into JitTracer 0-d arrays.
+    _static_shape = _compute_static_shape_info(
+        batch_size=batch_size,
+        sequence_length=sequence_length,
+        hidden=hidden,
+        num_experts=num_experts,
+        num_experts_per_tok=num_experts_per_tok,
+        align_size=align_size,
+        ep_active=ep_active,
+        num_ep=num_ep,
+        fsdp_sizes=fsdp_sizes,
+        recv_buffer_rows=recv_buffer_rows,
+    )
+    output = _combine(
+        expert_outputs,
+        dispatch_state,
+        backend=permutation_backend,
+        ep_active=ep_active,
+        batch_size=batch_size,
+        sequence_length=sequence_length,
+        dtype=dtype,
+        num_experts_per_tok=num_experts_per_tok,
+        num_real_tokens=_static_shape["num_real_tokens"],
+        padding_size=_static_shape["padding_size"],
+        pre_a2a_buffer_shape=_static_shape["pre_a2a_buffer_shape"],
+        ep_axis=ep_axis,
+        shard_id=shard_id,
+        num_ep=num_ep,
+    )
+
+    # ---------------- Build ctx dict ----------------
+    ctx: dict = {
+        "x": x,
+        "gate_kernel": gate_kernel,
+        "logits_2d": logits_2d,
+        "saved_scores": saved_scores,
+        "routing_map": routing_map,
+        "dispatch": dispatch_state,
+        "casted_sorted_x_lhs_trans": casted_sorted_x_lhs_trans,
+        "casted_wi_0_rhs_trans": casted_wi_0_rhs_trans,
+        "casted_wi_1_rhs_trans": casted_wi_1_rhs_trans,
+        "layer_w0": layer_w0,
+        "layer_w1": layer_w1,
+        "casted_intermediate_lhs_trans": casted_intermediate_lhs_trans,
+        "casted_wo_rhs_trans": casted_wo_rhs_trans,
+        "expert_outputs": expert_outputs,
+        "local_group_sizes": local_group_sizes,
+    }
+    if expert_bias is not None:
+        ctx["expert_bias"] = expert_bias
+    if wi_0_bias is not None:
+        ctx["has_wi_bias"] = True  # NOTE: this is python bool; we DON'T store it
+        # (we only store array leaves in ctx; structural flags travel via statics).
+        del ctx["has_wi_bias"]
+    if aux_loss_coeff > 0.0:
+        ctx["aux_const_buf"] = aux_const_buf
+        ctx["aux_tokens_per_expert"] = aux_tokens_per_expert
+        ctx["aux_logits_for_score"] = aux_logits_for_score
+        ctx["aux_saved_scores"] = aux_saved_scores
+
+    return output, aux_loss, ctx
+
+
+def _body_bwd(
+    ctx: dict,
+    dy_pair: Tuple[jnp.ndarray, jnp.ndarray],
+    *,
+    num_experts: int,
+    num_experts_per_tok: int,
+    activation_type: str,
+    score_function: ScoreFunction,
+    use_pre_softmax: bool,
+    num_groups: Optional[int],
+    group_topk: Optional[int],
+    scaling_factor: float,
+    aux_loss_coeff: float,
+    permutation_backend: PermutationBackend,
+    align_size: int,
+    gate_inside_vjp: bool,
+    quantizer_sets: Tuple[QuantizerSet, QuantizerSet, QuantizerSet],
+    dtype: jnp.dtype,
+    ep_active: bool,
+    ep_axis: Optional[str],
+    data_parallelism_axes: Tuple[str, ...],
+    fsdp_sizes: Tuple[int, ...],
+    num_ep: int,
+    num_experts_local: int,
+    recv_buffer_rows: int,
+    # Static side info (kept here rather than inside ctx because they're
+    # python flags / shapes, not array leaves):
+    has_wi_bias: bool,
+    has_wo_bias: bool,
+    has_expert_bias: bool,
+    x_shape: Tuple[int, ...],
+) -> dict:
+    """Per-shard backward body. Returns a dict of grads keyed identically
+    to the ``captured`` dict consumed by :func:`_body_fwd`."""
+    if not gate_inside_vjp:
+        raise NotImplementedError("gate_inside_vjp=False is deferred to a follow-up PR.")
+
+    d_output, d_aux_loss = dy_pair
+    q_set_w0, q_set_w1, q_set_wo = quantizer_sets
+    batch_size, sequence_length, hidden = x_shape
+    shard_id = jax.lax.axis_index(ep_axis) if ep_active else None
+
+    # Recompute per-shard static shape info from existing statics
+    # (Python ints / int tuples). Plumbed via kwargs to _combine_bwd
+    # and _dispatch_bwd -- NOT through the ctx dict, because the
+    # dict gets pytree-flattened across the bwd shard_map's in_specs
+    # and Python ints would be coerced into JitTracer 0-d arrays
+    # (breaking ``if padding > 0`` and ``jnp.zeros(shape)`` callsites).
+    # ``batch_size`` here is the GLOBAL batch size (captured in
+    # ``x_shape`` by the outer fwd rule), hence ``batch_is_per_shard=False``.
+    _static_shape = _compute_static_shape_info(
+        batch_size=batch_size,
+        sequence_length=sequence_length,
+        hidden=hidden,
+        num_experts=num_experts,
+        num_experts_per_tok=num_experts_per_tok,
+        align_size=align_size,
+        ep_active=ep_active,
+        num_ep=num_ep,
+        fsdp_sizes=fsdp_sizes,
+        recv_buffer_rows=recv_buffer_rows,
+        batch_is_per_shard=False,
+    )
+
+    # Compute per-shard input shape: under the EP shard_map body, the
+    # gradient tensors live at per-shard shape, so the dispatch_bwd
+    # reshape target and ``d_x_from_dispatch.reshape(x_shape)`` below
+    # must use the per-shard shape rather than the captured global
+    # ``x_shape``.
+    if ep_active:
+        import math as _math  # local import keeps the no-EP path zero-overhead.
+
+        dp_size = _math.prod(fsdp_sizes) if fsdp_sizes else 1
+        per_shard_batch = batch_size // (num_ep * dp_size)
+        per_shard_x_shape: Tuple[int, ...] = (per_shard_batch, sequence_length, hidden)
+    else:
+        per_shard_x_shape = x_shape
+
+    # ---------------- Combine bwd ----------------
+    d_expert_outputs, d_routing_weights = _combine_bwd(
+        d_output,
+        ctx["dispatch"],
+        ctx["expert_outputs"],
+        backend=permutation_backend,
+        ep_active=ep_active,
+        batch_size=batch_size,
+        sequence_length=sequence_length,
+        dtype=dtype,
+        num_experts=num_experts,
+        num_experts_per_tok=num_experts_per_tok,
+        num_real_tokens=_static_shape["num_real_tokens"],
+        padding_size=_static_shape["padding_size"],
+        post_a2a_buffer_shape=_static_shape["post_a2a_buffer_shape"],
+        ep_axis=ep_axis,
+        shard_id=shard_id,
+        num_ep=num_ep,
+    )
+
+    # ---------------- FFN bwd: GEMM 3 (wo) ----------------
+    casted_d_eo = tex.grouped_quantize(
+        d_expert_outputs, q_set_wo.dgrad, ctx["local_group_sizes"], flatten_axis=-1
+    )
+    d_intermediate = tex.grouped_gemm(
+        casted_d_eo.get_tensor(usage=TensorUsage.LHS),
+        ctx["casted_wo_rhs_trans"],
+        contracting_dims=((1,), (2,)),
+    )
+    d_wo = tex.grouped_gemm(
+        ctx["casted_intermediate_lhs_trans"],
+        casted_d_eo.get_tensor(usage=TensorUsage.RHS),
+        contracting_dims=((0,), (0,)),
+    )
+    d_wo_bias = (
+        tex.grouped_dbias(d_expert_outputs, ctx["local_group_sizes"]) if has_wo_bias else None
+    )
+
+    # ---------------- Activation bwd ----------------
+    # intermediate = act(layer_w0) * layer_w1
+    # d(layer_w0) = vjp(act, layer_w0)(d_intermediate * layer_w1)
+    # d(layer_w1) = d_intermediate * act(layer_w0)
+    act_fn = _convert_to_activation_function(activation_type)
+    act_w0, dact_w0_pullback = jax.vjp(act_fn, ctx["layer_w0"])
+    d_layer_w1 = d_intermediate * act_w0
+    (d_layer_w0,) = dact_w0_pullback(d_intermediate * ctx["layer_w1"])
+
+    # ---------------- FFN bwd: GEMM 2 (wi_1) ----------------
+    casted_d_layer_w1 = tex.grouped_quantize(
+        d_layer_w1, q_set_w1.dgrad, ctx["local_group_sizes"], flatten_axis=-1
+    )
+    d_sorted_x_from_w1 = tex.grouped_gemm(
+        casted_d_layer_w1.get_tensor(usage=TensorUsage.LHS),
+        ctx["casted_wi_1_rhs_trans"],
+        contracting_dims=((1,), (2,)),
+    )
+    d_wi_1 = tex.grouped_gemm(
+        ctx["casted_sorted_x_lhs_trans"],
+        casted_d_layer_w1.get_tensor(usage=TensorUsage.RHS),
+        contracting_dims=((0,), (0,)),
+    )
+    d_wi_1_bias = tex.grouped_dbias(d_layer_w1, ctx["local_group_sizes"]) if has_wi_bias else None
+
+    # ---------------- FFN bwd: GEMM 1 (wi_0) ----------------
+    casted_d_layer_w0 = tex.grouped_quantize(
+        d_layer_w0, q_set_w0.dgrad, ctx["local_group_sizes"], flatten_axis=-1
+    )
+    d_sorted_x_from_w0 = tex.grouped_gemm(
+        casted_d_layer_w0.get_tensor(usage=TensorUsage.LHS),
+        ctx["casted_wi_0_rhs_trans"],
+        contracting_dims=((1,), (2,)),
+    )
+    d_wi_0 = tex.grouped_gemm(
+        ctx["casted_sorted_x_lhs_trans"],
+        casted_d_layer_w0.get_tensor(usage=TensorUsage.RHS),
+        contracting_dims=((0,), (0,)),
+    )
+    d_wi_0_bias = tex.grouped_dbias(d_layer_w0, ctx["local_group_sizes"]) if has_wi_bias else None
+
+    d_sorted_x = d_sorted_x_from_w0 + d_sorted_x_from_w1
+
+    # ---------------- Dispatch bwd ----------------
+    inputs_2d_shape = (per_shard_x_shape[0] * per_shard_x_shape[1], hidden)
+    d_inputs_2d = _dispatch_bwd(
+        d_sorted_x,
+        ctx["dispatch"],
+        inputs_2d_shape=inputs_2d_shape,
+        backend=permutation_backend,
+        ep_active=ep_active,
+        num_experts=num_experts,
+        num_experts_per_tok=num_experts_per_tok,
+        num_real_tokens=_static_shape["num_real_tokens"],
+        padding_size=_static_shape["padding_size"],
+        pre_a2a_buffer_shape=_static_shape["pre_a2a_buffer_shape"],
+        ep_axis=ep_axis,
+        shard_id=shard_id,
+        num_ep=num_ep,
+    )
+    d_x_from_dispatch = d_inputs_2d.reshape(per_shard_x_shape)
+
+    # ---------------- Routing bwd ----------------
+    # The probs cotangent comes from _combine_bwd. For PURE_JAX it's the
+    # cotangent of routing_weights (post-routing_map_to_selected_experts);
+    # we need to bridge back to sparse_probs. For TRITON it's already the
+    # cotangent of merging_probs == sparse_probs.
+    if d_routing_weights is not None:
+        if permutation_backend is PermutationBackend.PURE_JAX:
+            # routing_map_to_selected_experts:
+            #   selected_experts = argsort(routing_map)[..., -topk:]
+            #   weights = take_along_axis(sparse_probs, selected_experts, axis=-1)
+            # routing_map is bool (non-diff); the gradient of weights
+            # w.r.t. sparse_probs is a scatter-into-zero along the
+            # selected_experts indices.
+            selected_experts = jnp.argsort(ctx["routing_map"], axis=-1)[..., -num_experts_per_tok:]
+            d_sparse_probs = jnp.zeros_like(ctx["saved_scores"]).astype(d_routing_weights.dtype)
+            d_sparse_probs = jnp.take_along_axis(d_sparse_probs, selected_experts, axis=-1)
+            # Actually scatter: build via jnp.zeros + .at[].set
+            d_sparse_probs = jnp.zeros(ctx["routing_map"].shape, dtype=d_routing_weights.dtype)
+            d_sparse_probs = d_sparse_probs.at[
+                jnp.arange(ctx["routing_map"].shape[0])[:, None], selected_experts
+            ].set(d_routing_weights)
+        else:
+            d_sparse_probs = d_routing_weights.astype(jnp.float32)
+    else:
+        d_sparse_probs = jnp.zeros(ctx["routing_map"].shape, dtype=jnp.float32)
+
+    # Topk bwd primitive: returns d_logits (no d_expert_bias).
+    d_logits_2d_main = tex.fused_topk_with_score_function_bwd(
+        ctx["routing_map"],
+        ctx["saved_scores"],
+        d_sparse_probs.astype(ctx["saved_scores"].dtype),
+        topk=num_experts_per_tok,
+        use_pre_softmax=use_pre_softmax,
+        scaling_factor=scaling_factor,
+        score_function=score_function,
+        compute_aux_scores=False,
+    )
+
+    # ---------------- Aux loss bwd ----------------
+    if aux_loss_coeff > 0.0:
+        # Step 1: aux_loss bwd -> d_aux_probs
+        aux_num_tokens = ctx["aux_logits_for_score"].shape[0]
+        d_aux_probs = tex.fused_moe_aux_loss_bwd(
+            ctx["aux_const_buf"],
+            ctx["aux_tokens_per_expert"].astype(jnp.int32),
+            d_aux_loss.reshape(()),
+            num_tokens=aux_num_tokens,
+        )
+        # Step 2: aux-side topk bwd (compute_aux_scores=True path).
+        # The routing_map argument is ignored in this branch (the kernel
+        # uses saved_scores); pass any shape-correct integer tensor.
+        d_aux_logits = tex.fused_topk_with_score_function_bwd(
+            jnp.zeros(ctx["aux_logits_for_score"].shape, dtype=jnp.bool_),
+            ctx["aux_saved_scores"],
+            d_aux_probs.astype(ctx["aux_saved_scores"].dtype),
+            topk=num_experts_per_tok,
+            use_pre_softmax=False,
+            scaling_factor=1.0,
+            score_function=score_function,
+            compute_aux_scores=True,
+        )
+        # Step 3: under EP the aux logits were all_gathered along
+        # ``(ep_axis, *data_parallelism_axes)`` (the latter being FSDP
+        # axes that shard the batch). The bwd is the inverse of that
+        # multi-axis tiled all_gather: ``dynamic_slice`` to pick out
+        # this shard's local rows from the global cotangent.
+        #
+        # JAX's convention for tiled ``all_gather(axis_name=(a, b, ...))``
+        # is row-major over the tuple: the shard at mesh position
+        # ``(i_a, i_b, ...)`` writes to rows
+        # ``[(i_a * size_b * ... + i_b * ... + ...) * local_T :
+        #    + local_T)``. We invert that by computing the same flat
+        # index here and slicing.
+        if ep_active:
+            local_T_aux = ctx["logits_2d"].shape[0]
+            flat_shard = shard_id  # ep is the outermost axis in the gather tuple
+            for ax, sz in zip(data_parallelism_axes, fsdp_sizes):
+                flat_shard = flat_shard * sz + jax.lax.axis_index(ax)
+            d_aux_logits_local = jax.lax.dynamic_slice(
+                d_aux_logits.astype(ctx["logits_2d"].dtype),
+                start_indices=(flat_shard * local_T_aux, 0),
+                slice_sizes=(local_T_aux, num_experts),
+            )
+        else:
+            d_aux_logits_local = d_aux_logits.astype(d_logits_2d_main.dtype)
+        d_logits_2d = d_logits_2d_main + d_aux_logits_local.astype(d_logits_2d_main.dtype)
+    else:
+        d_logits_2d = d_logits_2d_main
+
+    # ---------------- Gate bwd ----------------
+    d_gate_logits = d_logits_2d.reshape(per_shard_x_shape[0], per_shard_x_shape[1], num_experts)
+    gate_kernel_cast = ctx["gate_kernel"].astype(ctx["x"].dtype)
+    d_x_from_gate = jnp.einsum("bse,he->bsh", d_gate_logits, gate_kernel_cast)
+    d_gate_kernel = jnp.einsum("bsh,bse->he", ctx["x"], d_gate_logits).astype(
+        ctx["gate_kernel"].dtype
+    )
+    d_x = d_x_from_gate + d_x_from_dispatch
+
+    grads: dict = {
+        "inputs": d_x,
+        "gate_kernel": d_gate_kernel,
+        "wi_0": d_wi_0,
+        "wi_1": d_wi_1,
+        "wo": d_wo,
+    }
+    if has_wi_bias:
+        grads["wi_0_bias"] = d_wi_0_bias
+        grads["wi_1_bias"] = d_wi_1_bias
+    if has_wo_bias:
+        grads["wo_bias"] = d_wo_bias
+    if has_expert_bias:
+        # expert_bias has no gradient through topk (the topk bwd returns
+        # None for it). Emit a structural zero so the outer rule has
+        # something to package.
+        grads["expert_bias"] = jnp.zeros_like(ctx["expert_bias"])
+    return grads
+
+
+# =============================================================================
+# Spec builders for shard_map (lockstep with ctx_dict / captured_dict)
+# =============================================================================
+
+
+def _build_in_specs(
+    ep_axis: str,
+    batch_pspec_axis: Any,
+    *,
+    has_bias: bool,
+    has_expert_bias: bool,
+) -> dict:
+    """Build the ``in_specs`` dict for the EP fwd shard_map."""
+    specs: dict = {
+        "inputs": P(batch_pspec_axis, None, None),
+        "gate_kernel": P(),
+        "wi_0": P(ep_axis, None, None),
+        "wi_1": P(ep_axis, None, None),
+        "wo": P(ep_axis, None, None),
+    }
+    if has_bias:
+        for name in ("wi_0_bias", "wi_1_bias", "wo_bias"):
+            specs[name] = P(ep_axis, None)
+    if has_expert_bias:
+        specs["expert_bias"] = P(ep_axis)
+    return specs
+
+
+def _build_dispatch_specs(
+    ep_axis: str,
+    *,
+    backend: PermutationBackend,
+    ep_active: bool,
+) -> dict:
+    """Build the spec dict for a DispatchState dict returned by
+    :func:`_dispatch` from inside a shard_map. Keys must match what
+    :func:`_dispatch` actually populates for the given (backend, ep_active)."""
+    specs: dict = {"group_sizes": P()}
+    if backend is PermutationBackend.PURE_JAX:
+        specs["sorted_indices"] = P()
+        specs["routing_weights"] = P()
+    else:
+        specs["row_id_map"] = P()
+        specs["pad_offsets"] = P()
+        specs["merging_probs"] = P()
+    if ep_active:
+        specs["all_shards_tokens_per_expert"] = P()
+        specs["local_perm_row_id_map"] = P()
+    # NOTE: per-shard compile-time-constant shape info
+    # (num_real_tokens, padding_size, pre/post_a2a_buffer_shape)
+    # is intentionally NOT in the state dict; see _compute_static_shape_info.
+    return specs
+
+
+def _build_ctx_specs(
+    ep_axis: str,
+    batch_pspec_axis: Any,
+    *,
+    backend: PermutationBackend,
+    ep_active: bool,
+    has_bias: bool,
+    has_expert_bias: bool,
+    aux_loss_enabled: bool,
+) -> dict:
+    """Build the spec dict for the ``ctx`` returned by :func:`_body_fwd`."""
+    specs: dict = {
+        # Per-shard local activations along the batch axis.
+        "x": P(batch_pspec_axis, None, None),
+        "gate_kernel": P(),
+        "logits_2d": P(batch_pspec_axis, None),
+        "saved_scores": P(batch_pspec_axis, None),
+        "routing_map": P(batch_pspec_axis, None),
+        "dispatch": _build_dispatch_specs(ep_axis, backend=backend, ep_active=ep_active),
+        # FFN residuals: the LHS_TRANS / RHS_TRANS variants of
+        # grouped_quantize have leading "rows"/"experts" dims that are
+        # already shard-local (post-dispatch). Use P(ep_axis,...) on
+        # leading dim; that works whether the leaf is a plain ndarray
+        # or a ScaledTensor (shard_map applies the spec leaf-wise to
+        # the registered ScaledTensor pytree).
+        "casted_sorted_x_lhs_trans": P(),
+        "casted_wi_0_rhs_trans": P(ep_axis, None, None),
+        "casted_wi_1_rhs_trans": P(ep_axis, None, None),
+        "layer_w0": P(),
+        "layer_w1": P(),
+        "casted_intermediate_lhs_trans": P(),
+        "casted_wo_rhs_trans": P(ep_axis, None, None),
+        "expert_outputs": P(),
+        "local_group_sizes": P(),
+    }
+    if has_expert_bias:
+        specs["expert_bias"] = P(ep_axis)
+    if aux_loss_enabled:
+        specs["aux_const_buf"] = P()
+        specs["aux_tokens_per_expert"] = P()
+        specs["aux_logits_for_score"] = P()
+        specs["aux_saved_scores"] = P()
+    return specs
+
+
+def _build_grads_specs(
+    ep_axis: str,
+    batch_pspec_axis: Any,
+    *,
+    has_bias: bool,
+    has_expert_bias: bool,
+) -> dict:
+    """Spec dict for the grads dict returned by :func:`_body_bwd`."""
+    return _build_in_specs(
+        ep_axis,
+        batch_pspec_axis,
+        has_bias=has_bias,
+        has_expert_bias=has_expert_bias,
+    )
+
+
+# =============================================================================
+# Top-level VJP rules
+# =============================================================================
+
+
+def _moe_fwd_rule(
+    # IMPORTANT — calling convention for jax.custom_vjp fwd rule.
+    #
+    # JAX uses ``_argnums_partial`` (jax/_src/api_util.py) when wiring up
+    # the fwd rule. That helper preserves the ORIGINAL positional order
+    # of the decorated function: dyn (= diff) args sit at their original
+    # positions and static (= nondiff) args fill the remaining slots in
+    # nondiff_argnums order. So the fwd rule MUST take args in the
+    # SAME positional order as ``_moe`` -- diff first (positions 0..8),
+    # then nondiff (positions 9..28), all POSITIONAL (no ``*,`` -- they
+    # arrive as positional, not as kwargs).
+    #
+    # NOTE: this is the OPPOSITE convention from ``_moe_bwd_rule``, which
+    # uses ``prepend_static_args`` -- there the static args come FIRST,
+    # followed by ``ctx`` and ``dy_pair``.
+    x,
+    gate_kernel,
+    wi_0,
+    wi_1,
+    wo,
+    wi_0_bias,
+    wi_1_bias,
+    wo_bias,
+    expert_bias,
+    num_experts,
+    num_experts_per_tok,
+    activation_type,
+    score_function,
+    use_pre_softmax,
+    num_groups,
+    group_topk,
+    scaling_factor,
+    aux_loss_coeff,
+    permutation_backend,
+    align_size,
+    gate_inside_vjp,
+    ep_axis,
+    data_parallelism_axes,
+    input_axes,
+    gate_kernel_axes,
+    wi_kernel_axes,
+    wo_kernel_axes,
+    quantizer_sets,
+    dtype,
+):
+    x = with_sharding_constraint_by_logical_axes(x, input_axes)
+    ep_active = ep_axis is not None
+    body_kwargs = dict(
+        num_experts=num_experts,
+        num_experts_per_tok=num_experts_per_tok,
+        activation_type=activation_type,
+        score_function=score_function,
+        use_pre_softmax=use_pre_softmax,
+        num_groups=num_groups,
+        group_topk=group_topk,
+        scaling_factor=scaling_factor,
+        aux_loss_coeff=aux_loss_coeff,
+        permutation_backend=permutation_backend,
+        align_size=align_size,
+        gate_inside_vjp=gate_inside_vjp,
+        quantizer_sets=quantizer_sets,
+        dtype=dtype,
+        ep_axis=ep_axis,
+        data_parallelism_axes=data_parallelism_axes,
+    )
+    captured: dict = {
+        "inputs": x,
+        "gate_kernel": gate_kernel,
+        "wi_0": wi_0,
+        "wi_1": wi_1,
+        "wo": wo,
+    }
+    has_bias = wi_0_bias is not None
+    has_expert_bias = expert_bias is not None
+    if has_bias:
+        captured["wi_0_bias"] = wi_0_bias
+        captured["wi_1_bias"] = wi_1_bias
+        captured["wo_bias"] = wo_bias
+    if has_expert_bias:
+        captured["expert_bias"] = expert_bias
+
+    if not ep_active:
+        output, aux_loss, ctx = _body_fwd(
+            captured,
+            **body_kwargs,
+            ep_active=False,
+            fsdp_sizes=(),
+            num_ep=1,
+            num_experts_local=num_experts,
+            recv_buffer_rows=0,
+        )
+        # Carry static side info into ctx for the bwd rule (as Python
+        # objects on the dict; not part of the tree pytree leaves).
+        ctx["__static__"] = dict(
+            has_wi_bias=has_bias,
+            has_wo_bias=has_bias,
+            has_expert_bias=has_expert_bias,
+            x_shape=x.shape,
+            num_experts_local=num_experts,
+            recv_buffer_rows=0,
+        )
+        return (output, aux_loss), ctx
+
+    # ---------------- EP path ----------------
+    from jax.experimental.shard_map import shard_map
+
+    mesh = _get_mesh()
+    if mesh is None or mesh.empty:
+        raise ValueError("moe(...) requires an active jax.sharding.Mesh when ep_axis is set.")
+    num_ep = mesh.shape[ep_axis]
+    if num_experts % num_ep != 0:
+        raise ValueError(f"num_experts={num_experts} must be divisible by EP size={num_ep}")
+    num_experts_local = num_experts // num_ep
+
+    if not data_parallelism_axes:
+        batch_pspec_axis: Any = ep_axis
+    else:
+        batch_pspec_axis = (ep_axis, *data_parallelism_axes)
+    dp_size = 1
+    for ax in data_parallelism_axes:
+        dp_size *= mesh.shape[ax]
+
+    global_batch_size, sequence_length, _hidden = x.shape
+    topk = num_experts_per_tok
+    if global_batch_size % (num_ep * dp_size) != 0:
+        raise ValueError(f"batch={global_batch_size} not divisible by ep*dp={num_ep * dp_size}")
+    recv_buffer_rows = (global_batch_size // dp_size) * sequence_length * topk
+    if align_size > 0:
+        recv_buffer_rows += num_experts * (align_size - 1)
+
+    in_specs = _build_in_specs(
+        ep_axis,
+        batch_pspec_axis,
+        has_bias=has_bias,
+        has_expert_bias=has_expert_bias,
+    )
+    output_spec = P(batch_pspec_axis, None, None)
+    aux_spec = P()
+    ctx_spec = _build_ctx_specs(
+        ep_axis,
+        batch_pspec_axis,
+        backend=permutation_backend,
+        ep_active=True,
+        has_bias=has_bias,
+        has_expert_bias=has_expert_bias,
+        aux_loss_enabled=(aux_loss_coeff > 0.0),
+    )
+
+    _fsdp_sizes: Tuple[int, ...] = tuple(mesh.shape[ax] for ax in data_parallelism_axes)
+
+    def _shardmap_body(captured_local):
+        return _body_fwd(
+            captured_local,
+            **body_kwargs,
+            ep_active=True,
+            fsdp_sizes=_fsdp_sizes,
+            num_ep=num_ep,
+            num_experts_local=num_experts_local,
+            recv_buffer_rows=recv_buffer_rows,
+        )
+
+    output, aux_loss, ctx = shard_map(
+        _shardmap_body,
+        mesh=mesh,
+        in_specs=(in_specs,),
+        out_specs=(output_spec, aux_spec, ctx_spec),
+        check_rep=False,
+    )(captured)
+    ctx["__static__"] = dict(
+        has_wi_bias=has_bias,
+        has_wo_bias=has_bias,
+        has_expert_bias=has_expert_bias,
+        x_shape=x.shape,
+        num_experts_local=num_experts_local,
+        recv_buffer_rows=recv_buffer_rows,
+    )
+    return (output, aux_loss), ctx
+
+
+def _moe_bwd_rule(
+    num_experts,
+    num_experts_per_tok,
+    activation_type,
+    score_function,
+    use_pre_softmax,
+    num_groups,
+    group_topk,
+    scaling_factor,
+    aux_loss_coeff,
+    permutation_backend,
+    align_size,
+    gate_inside_vjp,
+    ep_axis,
+    data_parallelism_axes,
+    input_axes,
+    gate_kernel_axes,
+    wi_kernel_axes,
+    wo_kernel_axes,
+    quantizer_sets,
+    dtype,
+    ctx,
+    dy_pair,
+):
+    static = ctx.pop("__static__")
+    has_wi_bias = static["has_wi_bias"]
+    has_wo_bias = static["has_wo_bias"]
+    has_expert_bias = static["has_expert_bias"]
+    x_shape = static["x_shape"]
+    num_experts_local = static["num_experts_local"]
+    recv_buffer_rows = static["recv_buffer_rows"]
+
+    ep_active = ep_axis is not None
+    mesh = _get_mesh() if ep_active else None
+    fsdp_sizes: Tuple[int, ...] = (
+        tuple(mesh.shape[ax] for ax in data_parallelism_axes) if ep_active else ()
+    )
+    body_kwargs = dict(
+        num_experts=num_experts,
+        num_experts_per_tok=num_experts_per_tok,
+        activation_type=activation_type,
+        score_function=score_function,
+        use_pre_softmax=use_pre_softmax,
+        num_groups=num_groups,
+        group_topk=group_topk,
+        scaling_factor=scaling_factor,
+        aux_loss_coeff=aux_loss_coeff,
+        permutation_backend=permutation_backend,
+        align_size=align_size,
+        gate_inside_vjp=gate_inside_vjp,
+        quantizer_sets=quantizer_sets,
+        dtype=dtype,
+        ep_axis=ep_axis,
+        data_parallelism_axes=data_parallelism_axes,
+        fsdp_sizes=fsdp_sizes,
+        num_ep=1 if not ep_active else mesh.shape[ep_axis],
+        num_experts_local=num_experts_local,
+        recv_buffer_rows=recv_buffer_rows,
+        has_wi_bias=has_wi_bias,
+        has_wo_bias=has_wo_bias,
+        has_expert_bias=has_expert_bias,
+        x_shape=x_shape,
+    )
+
+    if not ep_active:
+        grads = _body_bwd(ctx, dy_pair, ep_active=False, **body_kwargs)
+        # Apply sharding constraints on grads.
+        grads["gate_kernel"] = with_sharding_constraint_by_logical_axes(
+            grads["gate_kernel"], gate_kernel_axes
+        )
+        grads["wi_0"] = with_sharding_constraint_by_logical_axes(grads["wi_0"], wi_kernel_axes)
+        grads["wi_1"] = with_sharding_constraint_by_logical_axes(grads["wi_1"], wi_kernel_axes)
+        grads["wo"] = with_sharding_constraint_by_logical_axes(grads["wo"], wo_kernel_axes)
+        grads["inputs"] = with_sharding_constraint_by_logical_axes(grads["inputs"], input_axes)
+        return _grads_dict_to_tuple(grads, has_wi_bias, has_wo_bias, has_expert_bias)
+
+    from jax.experimental.shard_map import shard_map
+
+    if not data_parallelism_axes:
+        batch_pspec_axis: Any = ep_axis
+    else:
+        batch_pspec_axis = (ep_axis, *data_parallelism_axes)
+    ctx_spec = _build_ctx_specs(
+        ep_axis,
+        batch_pspec_axis,
+        backend=permutation_backend,
+        ep_active=True,
+        has_bias=has_wi_bias,
+        has_expert_bias=has_expert_bias,
+        aux_loss_enabled=(aux_loss_coeff > 0.0),
+    )
+    dy_specs = (P(batch_pspec_axis, None, None), P())
+    grads_spec = _build_grads_specs(
+        ep_axis, batch_pspec_axis, has_bias=has_wi_bias, has_expert_bias=has_expert_bias
+    )
+
+    def _bwd_body(ctx_local, dy_local):
+        return _body_bwd(ctx_local, dy_local, ep_active=True, **body_kwargs)
+
+    grads = shard_map(
+        _bwd_body,
+        mesh=mesh,
+        in_specs=(ctx_spec, dy_specs),
+        out_specs=grads_spec,
+        check_rep=False,
+    )(ctx, dy_pair)
+
+    grads["gate_kernel"] = with_sharding_constraint_by_logical_axes(
+        grads["gate_kernel"], gate_kernel_axes
+    )
+    grads["wi_0"] = with_sharding_constraint_by_logical_axes(grads["wi_0"], wi_kernel_axes)
+    grads["wi_1"] = with_sharding_constraint_by_logical_axes(grads["wi_1"], wi_kernel_axes)
+    grads["wo"] = with_sharding_constraint_by_logical_axes(grads["wo"], wo_kernel_axes)
+    grads["inputs"] = with_sharding_constraint_by_logical_axes(grads["inputs"], input_axes)
+    return _grads_dict_to_tuple(grads, has_wi_bias, has_wo_bias, has_expert_bias)
+
+
+def _grads_dict_to_tuple(
+    grads: dict, has_wi_bias: bool, has_wo_bias: bool, has_expert_bias: bool
+) -> Tuple:
+    """Pack the body_bwd's grads dict into the positional tuple JAX expects."""
+    return (
+        grads["inputs"],
+        grads["gate_kernel"],
+        grads["wi_0"],
+        grads["wi_1"],
+        grads["wo"],
+        grads.get("wi_0_bias") if has_wi_bias else None,
+        grads.get("wi_1_bias") if has_wi_bias else None,
+        grads.get("wo_bias") if has_wo_bias else None,
+        grads.get("expert_bias") if has_expert_bias else None,
+    )
+
+
+# =============================================================================
+# custom_vjp + public entry
+# =============================================================================
+
+
+@partial(jax.custom_vjp, nondiff_argnums=tuple(range(9, 29)))
+def _moe(
+    x,
+    gate_kernel,
+    wi_0,
+    wi_1,
+    wo,
+    wi_0_bias,
+    wi_1_bias,
+    wo_bias,
+    expert_bias,
+    num_experts,
+    num_experts_per_tok,
+    activation_type,
+    score_function,
+    use_pre_softmax,
+    num_groups,
+    group_topk,
+    scaling_factor,
+    aux_loss_coeff,
+    permutation_backend,
+    align_size,
+    gate_inside_vjp,
+    ep_axis,
+    data_parallelism_axes,
+    input_axes,
+    gate_kernel_axes,
+    wi_kernel_axes,
+    wo_kernel_axes,
+    quantizer_sets,
+    dtype,
+):
+    # Call in `_moe`'s own signature order to match what JAX will pass
+    # the fwd rule via ``_argnums_partial``. See the comment block at
+    # the top of ``_moe_fwd_rule`` for why this differs from
+    # ``_moe_bwd_rule``'s convention.
+    output_pair, _ = _moe_fwd_rule(
+        x,
+        gate_kernel,
+        wi_0,
+        wi_1,
+        wo,
+        wi_0_bias,
+        wi_1_bias,
+        wo_bias,
+        expert_bias,
+        num_experts,
+        num_experts_per_tok,
+        activation_type,
+        score_function,
+        use_pre_softmax,
+        num_groups,
+        group_topk,
+        scaling_factor,
+        aux_loss_coeff,
+        permutation_backend,
+        align_size,
+        gate_inside_vjp,
+        ep_axis,
+        data_parallelism_axes,
+        input_axes,
+        gate_kernel_axes,
+        wi_kernel_axes,
+        wo_kernel_axes,
+        quantizer_sets,
+        dtype,
+    )
+    return output_pair
+
+
+_moe.defvjp(_moe_fwd_rule, _moe_bwd_rule)
+
+
+def moe(
+    x: jnp.ndarray,
+    gate_kernel: jnp.ndarray,
+    wi_0: jnp.ndarray,
+    wi_1: jnp.ndarray,
+    wo: jnp.ndarray,
+    wi_0_bias: Optional[jnp.ndarray] = None,
+    wi_1_bias: Optional[jnp.ndarray] = None,
+    wo_bias: Optional[jnp.ndarray] = None,
+    expert_bias: Optional[jnp.ndarray] = None,
+    *,
+    # Architecture
+    num_experts: int,
+    num_experts_per_tok: int,
+    activation_type: str = "silu",
+    # Routing
+    score_function: Union[str, ScoreFunction] = "softmax",
+    use_pre_softmax: bool = False,
+    num_groups: Optional[int] = None,
+    group_topk: Optional[int] = None,
+    scaling_factor: float = 1.0,
+    aux_loss_coeff: float = 0.0,
+    # Permutation
+    permutation_backend: PermutationBackend = PermutationBackend.PURE_JAX,
+    align_size: int = 0,
+    # Gate placement (Phuong: "perhaps as an option")
+    gate_inside_vjp: bool = True,
+    # Parallelism (resolved by caller from MeshResource)
+    ep_axis: Optional[str] = None,
+    data_parallelism_axes: Tuple[str, ...] = (),
+    # Logical axes for sharding constraints
+    input_axes: Tuple[Optional[str], ...] = (),
+    gate_kernel_axes: Tuple[Optional[str], ...] = (),
+    wi_kernel_axes: Tuple[Optional[str], ...] = ("exp", "embed", "mlp"),
+    wo_kernel_axes: Tuple[Optional[str], ...] = ("exp", "mlp", "embed"),
+    # Quantization
+    quantizer_sets: Tuple[QuantizerSet, QuantizerSet, QuantizerSet] = (
+        noop_quantizer_set,
+        noop_quantizer_set,
+        noop_quantizer_set,
+    ),
+    dtype: jnp.dtype = jnp.float32,
+) -> Tuple[jnp.ndarray, Optional[jnp.ndarray]]:
+    """Run a full MoE block under a single fused custom_vjp.
+
+    Parameters and return are documented at the call site of
+    ``_MoEBlock.__call__``. See module docstring for design rationale.
+    """
+    if not isinstance(permutation_backend, PermutationBackend):
+        raise TypeError(
+            f"permutation_backend must be a PermutationBackend, got {permutation_backend!r}"
+        )
+    # Normalize string score_function ("softmax" / "sigmoid") to the
+    # ScoreFunction enum once here. The underlying primitive
+    # ``tex.fused_topk_with_score_function_fwd`` expects an int-coercible
+    # value (the enum has integer .value), and the public router wrapper
+    # we bypass also normalizes here.
+    score_function = _validate_score_function(score_function)
+
+    output, aux_loss = _moe(
+        x,
+        gate_kernel,
+        wi_0,
+        wi_1,
+        wo,
+        wi_0_bias,
+        wi_1_bias,
+        wo_bias,
+        expert_bias,
+        num_experts=num_experts,
+        num_experts_per_tok=num_experts_per_tok,
+        activation_type=activation_type,
+        score_function=score_function,
+        use_pre_softmax=use_pre_softmax,
+        num_groups=num_groups,
+        group_topk=group_topk,
+        scaling_factor=scaling_factor,
+        aux_loss_coeff=aux_loss_coeff,
+        permutation_backend=permutation_backend,
+        align_size=align_size,
+        gate_inside_vjp=gate_inside_vjp,
+        ep_axis=ep_axis,
+        data_parallelism_axes=data_parallelism_axes,
+        input_axes=input_axes,
+        gate_kernel_axes=gate_kernel_axes,
+        wi_kernel_axes=wi_kernel_axes,
+        wo_kernel_axes=wo_kernel_axes,
+        quantizer_sets=quantizer_sets,
+        dtype=dtype,
+    )
+    if aux_loss_coeff <= 0.0:
+        aux_loss = None
+    return output, aux_loss
diff --git a/transformer_engine/jax/permutation.py b/transformer_engine/jax/permutation.py
index 81972aac0f..157575a441 100644
--- a/transformer_engine/jax/permutation.py
+++ b/transformer_engine/jax/permutation.py
@@ -7,6 +7,19 @@
 This module provides high-level token dispatch and combine operations for
 Mixture of Experts (MoE) models with proper automatic differentiation support.
 
+Two backends are offered:
+
+* Triton-backed ``token_dispatch`` / ``token_combine`` - uses the
+  Triton kernels in ``transformer_engine.jax.triton_extensions.permutation``.
+* Pure-JAX ``pure_jax_token_dispatch`` / ``pure_jax_token_combine`` - uses
+  only ``jnp.argsort`` + gather and is therefore compiled as plain XLA.
+  Despite the name, this path is often *faster* than the Triton kernels in
+  current testing because XLA can fuse the ops with surrounding work.
+
+Both backends support optional alignment padding (``align_size > 0``) so each
+expert's group size is a multiple of ``align_size``, which is required for
+quantized grouped GEMMs.
+
 Token Dispatch (Permute):
     - Forward: Permute tokens according to routing map (scatter to experts)
     - Backward: Unpermute gradients (gather from experts)
@@ -17,7 +30,7 @@
 """
 
 from functools import partial
-from typing import Optional, Tuple
+from typing import NamedTuple, Optional, Tuple
 
 import jax
 import jax.numpy as jnp
@@ -38,6 +51,15 @@
     "token_dispatch",
     "token_combine",
     "sort_chunks_by_index",
+    "pure_jax_token_dispatch",
+    "pure_jax_token_combine",
+    "PureJaxPermState",
+    # Ragged-all-to-all expert-parallelism helpers
+    "compute_ragged_all_to_all_params",
+    "compute_reverse_ragged_all_to_all_params",
+    "local_permute_after_a2a",
+    "local_unpermute_before_a2a",
+    "routing_map_to_selected_experts",
 ]
 
 
@@ -655,3 +677,642 @@ def _sort_chunks_by_index_bwd_rule(
 
 
 _sort_chunks_by_index.defvjp(_sort_chunks_by_index_fwd_rule, _sort_chunks_by_index_bwd_rule)
+
+
+# =============================================================================
+# Pure-JAX token dispatch / combine
+# =============================================================================
+#
+# The following implementations use only ``jnp.argsort`` + gather and compile
+# to plain XLA. They are a drop-in alternative to ``token_dispatch`` /
+# ``token_combine`` above, differing only in input/output conventions (the
+# Triton path takes ``routing_map`` and ``sparse_probs`` over all experts; the
+# pure-JAX path takes dense ``selected_experts`` and per-token ``weights`` of
+# shape ``[..., topk]``).
+#
+# Note: despite Triton being fused and pure-JAX being a sequence of XLA ops,
+# the pure-JAX backend is often *faster* in current testing because XLA can
+# fuse these ops into the surrounding work.
+
+
+# -----------------------------------------------------------------------------
+# Custom-VJP argsort-based gather.
+#
+# ``inputs[sort_indices]`` has a known inverse: ``output[argsort(sort_indices)]``.
+# Using a custom VJP lets the backward pass exploit that inverse instead of
+# relying on the compiler to discover it from the scatter-style default
+# gradient of a gather, which is typically less efficient.
+
+
+@jax.custom_vjp
+def _sort_activations(inputs: jax.Array, sort_indices: jax.Array) -> jax.Array:
+    """Sort ``inputs`` along the leading dim by ``sort_indices``."""
+    assert (
+        inputs.shape[0] == sort_indices.shape[0]
+    ), f"inputs.shape[0]={inputs.shape[0]} must match sort_indices.shape[0]={sort_indices.shape[0]}"
+    with jax.named_scope("pure_jax_sort_activations"):
+        return inputs[sort_indices, ...]
+
+
+def _sort_activations_fwd(
+    inputs: jax.Array, sort_indices: jax.Array
+) -> Tuple[jax.Array, jax.Array]:
+    return _sort_activations(inputs, sort_indices), sort_indices
+
+
+def _sort_activations_bwd(residuals: jax.Array, grads: jax.Array) -> Tuple[jax.Array, None]:
+    sort_indices = residuals
+    # Inverse permutation: gather-by-argsort undoes the forward gather.
+    return _sort_activations(grads, jnp.argsort(sort_indices)), None
+
+
+_sort_activations.defvjp(_sort_activations_fwd, _sort_activations_bwd)
+
+
+def routing_map_to_selected_experts(
+    sparse_probs: jnp.ndarray,
+    routing_map: jnp.ndarray,
+    topk: int,
+) -> Tuple[jnp.ndarray, jnp.ndarray]:
+    """Convert ``(sparse_probs, routing_map)`` from TE's fused router to the
+    ``(selected_experts, weights)`` format consumed by
+    :func:`pure_jax_token_dispatch`.
+
+    ``routing_map`` is a boolean mask of shape ``[num_tokens, num_experts]``
+    with exactly ``topk`` ``True`` positions per row.
+    """
+    # Argsort on a bool tensor places ``True`` rows last (False=0 < True=1),
+    # so the last ``topk`` indices are the selected expert IDs.
+    selected_experts = jnp.argsort(routing_map, axis=-1)[..., -topk:]
+    weights = jnp.take_along_axis(sparse_probs, selected_experts, axis=-1)
+    return selected_experts, weights
+
+
+# -----------------------------------------------------------------------------
+# Permutation state carried from dispatch to combine.
+
+
+class PureJaxPermState(NamedTuple):
+    """Opaque state produced by :func:`pure_jax_token_dispatch`.
+
+    Attributes
+    ----------
+    sorted_indices : jnp.ndarray
+        The argsort indices used in the forward sort. Needed to reverse the
+        permutation in :func:`pure_jax_token_combine`. Shape
+        ``[num_real_tokens + padding_size]``.
+    num_real_tokens : int
+        Number of real (non-padding) permuted tokens, i.e.
+        ``batch_size * sequence_length * num_experts_per_tok``. Compile-time
+        constant.
+    padding_size : int
+        Number of alignment-padding tokens appended to the sort buffer. Equals
+        ``num_experts * (align_size - 1)`` when ``align_size > 0``, else ``0``.
+        Compile-time constant.
+    """
+
+    sorted_indices: jax.Array
+    num_real_tokens: int
+    padding_size: int
+
+
+# -----------------------------------------------------------------------------
+# Dispatch (permute)
+
+
+def pure_jax_token_dispatch(
+    inputs: jnp.ndarray,
+    selected_experts: jnp.ndarray,
+    num_experts: int,
+    num_experts_per_tok: int,
+    align_size: int = 0,
+    roll_to_expert_id: Optional[int] = None,
+) -> Tuple[jnp.ndarray, PureJaxPermState, jnp.ndarray]:
+    """Pure-JAX ``argsort``-based token dispatch.
+
+    Parameters
+    ----------
+    inputs : jnp.ndarray
+        Input tensor of shape ``[num_tokens, hidden_size]`` (or
+        ``[batch, seq, hidden]``; it will be flattened).
+    selected_experts : jnp.ndarray
+        Per-token expert IDs, shape ``[num_tokens, num_experts_per_tok]`` (or
+        ``[batch, seq, num_experts_per_tok]``). Integer dtype.
+    num_experts : int
+        Total number of experts.
+    num_experts_per_tok : int
+        Top-k. Must equal ``selected_experts.shape[-1]``.
+    align_size : int, default 0
+        Alignment for each expert's group size. ``0`` disables padding; a value
+        ``> 0`` appends a static-size padding buffer so each resulting group
+        size is a multiple of ``align_size`` (required for quantized grouped
+        GEMM).
+    roll_to_expert_id : Optional[int]
+        If provided, rotates expert IDs by ``-roll_to_expert_id`` modulo
+        ``num_experts`` before the sort (ring-of-experts EP). The returned
+        ``group_sizes`` is rolled to match.
+
+    Returns
+    -------
+    sorted_inputs : jnp.ndarray
+        Permuted tokens grouped by expert, shape
+        ``[num_real_tokens + padding_size, hidden_size]``.
+    perm_state : PureJaxPermState
+        State needed by :func:`pure_jax_token_combine`.
+    group_sizes : jnp.ndarray
+        Token count per expert, shape ``[num_experts]``. Each entry is a
+        multiple of ``align_size`` when ``align_size > 0``.
+    """
+    assert num_experts_per_tok == selected_experts.shape[-1], (
+        f"num_experts_per_tok={num_experts_per_tok} must match"
+        f" selected_experts.shape[-1]={selected_experts.shape[-1]}"
+    )
+    assert align_size >= 0, f"align_size must be >= 0, got {align_size}"
+
+    hidden_size = inputs.shape[-1]
+    inputs_2d = inputs.reshape(-1, hidden_size)
+    num_tokens = inputs_2d.shape[0]
+    num_real_tokens = num_tokens * num_experts_per_tok
+
+    flatten_selected_experts = jnp.ravel(selected_experts)
+
+    if align_size > 0:
+        # Per-expert token count, and how many extra tokens each expert needs
+        # to become aligned to ``align_size``. Using
+        # ``(align - count % align) % align`` gives 0 (not ``align``) when
+        # already aligned, so we never exceed the per-expert slot capacity of
+        # ``align_size - 1``.
+        token_count_per_expert = jnp.bincount(flatten_selected_experts, length=num_experts)
+        padding_tokens_required_per_expert = (
+            align_size - (token_count_per_expert % align_size)
+        ) % align_size
+
+        # Build a static-size padding buffer of shape
+        # ``[num_experts * (align_size - 1)]``. Each expert ``i`` owns a slot
+        # of ``align_size - 1`` positions (worst-case padding, which occurs
+        # when ``token_count[i] % align_size == 1``). Within slot ``i``,
+        # positions ``[0, padding_needed)`` are assigned expert ``i`` and act
+        # as real padding; the rest are assigned to ``num_experts - 1`` as
+        # overflow placeholders that keep the buffer statically sized for JIT.
+        max_padding_per_expert = align_size - 1
+        max_total_padding_size = num_experts * max_padding_per_expert
+        positions = jnp.arange(max_total_padding_size)
+        expert_for_pos = positions // max_padding_per_expert
+        offset_in_slot = positions % max_padding_per_expert
+        padding_needed = padding_tokens_required_per_expert[expert_for_pos]
+        flatten_padding_selected_experts = jnp.where(
+            offset_in_slot < padding_needed,
+            expert_for_pos,
+            num_experts - 1,
+        )
+
+        flatten_selected_experts = jnp.concatenate(
+            [flatten_selected_experts, flatten_padding_selected_experts], axis=0
+        )
+
+        if roll_to_expert_id is not None:
+            flatten_selected_experts = (flatten_selected_experts - roll_to_expert_id) % num_experts
+
+        sorted_selected_experts = jnp.argsort(flatten_selected_experts)
+
+        replicated_inputs_2d = jnp.repeat(inputs_2d, num_experts_per_tok, axis=0)
+        # Pad inputs with zeros so the sort operand shape matches the expanded
+        # selected-experts vector.
+        replicated_inputs_2d = jnp.pad(
+            replicated_inputs_2d,
+            pad_width=((0, max_total_padding_size), (0, 0)),
+            mode="constant",
+            constant_values=0.0,
+        )
+
+        sorted_inputs = _sort_activations(replicated_inputs_2d, sorted_selected_experts)
+
+        # Compute ``group_sizes`` directly from counts rather than via
+        # ``bincount(flatten_selected_experts)``: the overflow placeholder
+        # tokens would inflate ``group_sizes[num_experts - 1]``, breaking the
+        # alignment guarantee. Direct computation gives each expert exactly
+        # ``ceil(count / align) * align`` tokens.
+        group_sizes = token_count_per_expert + padding_tokens_required_per_expert
+
+        if roll_to_expert_id is not None:
+            group_sizes = jnp.roll(group_sizes, -roll_to_expert_id)
+
+        padding_size = max_total_padding_size
+    else:
+        if roll_to_expert_id is not None:
+            flatten_selected_experts = (flatten_selected_experts - roll_to_expert_id) % num_experts
+
+        sorted_selected_experts = jnp.argsort(flatten_selected_experts)
+
+        replicated_inputs_2d = jnp.repeat(inputs_2d, num_experts_per_tok, axis=0)
+        sorted_inputs = _sort_activations(replicated_inputs_2d, sorted_selected_experts)
+
+        group_sizes = jnp.bincount(flatten_selected_experts, length=num_experts)
+        if roll_to_expert_id is not None:
+            group_sizes = jnp.roll(group_sizes, -roll_to_expert_id)
+
+        padding_size = 0
+
+    perm_state = PureJaxPermState(
+        sorted_indices=sorted_selected_experts,
+        num_real_tokens=num_real_tokens,
+        padding_size=padding_size,
+    )
+    return sorted_inputs, perm_state, group_sizes
+
+
+# -----------------------------------------------------------------------------
+# Combine (unpermute + weighted sum)
+
+
+def pure_jax_token_combine(
+    expert_outputs: jnp.ndarray,
+    perm_state: PureJaxPermState,
+    routing_weights: jnp.ndarray,
+    num_experts_per_tok: int,
+    batch_size: int,
+    sequence_length: int,
+) -> jnp.ndarray:
+    """Pure-JAX ``argsort``-based token combine.
+
+    Reverses the permutation performed by :func:`pure_jax_token_dispatch`,
+    strips any alignment-padding rows appended during dispatch, and applies a
+    per-token weighted sum across the top-k experts.
+
+    Parameters
+    ----------
+    expert_outputs : jnp.ndarray
+        Output of the expert FFN, shape
+        ``[num_real_tokens + padding_size, hidden_size]``.
+    perm_state : PureJaxPermState
+        State returned by :func:`pure_jax_token_dispatch`.
+    routing_weights : jnp.ndarray
+        Top-k routing weights, shape ``[batch*seq, num_experts_per_tok]``
+        (or broadcastable to it after a ``reshape``).
+    num_experts_per_tok : int
+        Top-k.
+    batch_size : int
+        Original batch size.
+    sequence_length : int
+        Original sequence length.
+
+    Returns
+    -------
+    output : jnp.ndarray
+        Combined output tensor of shape ``[batch_size, sequence_length, hidden_size]``.
+    """
+    # Reverse the permutation: ``output[argsort(sorted_indices)]`` undoes
+    # ``input[sorted_indices]``.
+    unsort_intermediate = _sort_activations(
+        expert_outputs,
+        jnp.argsort(perm_state.sorted_indices),
+    )
+
+    # Strip alignment padding tokens appended during dispatch. After unsorting,
+    # the first ``num_real_tokens`` rows hold the real per-(token, top-k)
+    # outputs; any trailing rows are padding placeholders (zeros) and must be
+    # discarded before the reshape below.
+    if perm_state.padding_size > 0:
+        unsort_intermediate = unsort_intermediate[: perm_state.num_real_tokens]
+
+    hidden_size = unsort_intermediate.shape[-1]
+    reshaped_weights = jnp.reshape(routing_weights, (-1, num_experts_per_tok))
+    reshaped_intermediate = jnp.reshape(
+        unsort_intermediate, (reshaped_weights.shape[0], num_experts_per_tok, hidden_size)
+    )
+
+    # Cast weights to match intermediate dtype (weighted sum happens in
+    # intermediate dtype; callers can upcast before calling if higher
+    # precision weight-sum is desired).
+    reshaped_weights = reshaped_weights.astype(reshaped_intermediate.dtype)
+    with jax.named_scope("pure_jax_weight_sum"):
+        output = jnp.einsum(
+            "BKE,BK -> BE",
+            reshaped_intermediate,
+            reshaped_weights,
+        )
+    return output.reshape(batch_size, sequence_length, hidden_size)
+
+
+# =============================================================================
+# Ragged-all-to-all expert-parallelism helpers
+# =============================================================================
+#
+# These helpers support the ragged-all-to-all (A2A / A2Av) EP strategy used by
+# :class:`transformer_engine.jax.flax._MoEBlock`. The forward EP path looks
+# like::
+#
+#     route -> global_permute -> AG(group_sizes, ep)
+#                             -> ragged_all_to_all(fwd, ep)
+#                             -> local_permute_after_a2a
+#                             -> grouped_dense x3 + activation
+#                             -> local_unpermute_before_a2a
+#                             -> ragged_all_to_all(reverse, ep)
+#                             -> global_combine
+#
+# The two ``compute_*_ragged_all_to_all_params`` functions translate
+# ``all_shards_tokens_per_expert`` (an EP-axis ``all_gather`` of each shard's
+# global ``group_sizes``) into the four ``ragged_all_to_all`` arguments
+# (``input_offsets``, ``send_sizes``, ``output_offsets``, ``recv_sizes``).
+# ``shard_id`` may be a traced value (e.g. from :func:`jax.lax.axis_index`),
+# which is why every slice into ``all_shards_tokens_per_expert`` uses
+# :func:`jax.lax.dynamic_slice`.
+#
+# These functions are pure JAX (no MaxText / TE dependencies) and equivalent
+# to :func:`maxtext.layers.te_permutation.compute_ragged_all_to_all_params`
+# / :func:`compute_reverse_ragged_all_to_all_params`.
+
+
+def compute_ragged_all_to_all_params(
+    all_shards_tokens_per_expert: jnp.ndarray,
+    shard_id: jnp.ndarray,
+    num_expert_shards: int,
+) -> Tuple[jnp.ndarray, jnp.ndarray, jnp.ndarray, jnp.ndarray]:
+    """Forward-direction ragged_all_to_all parameters.
+
+    Computes the four index/size arrays that :func:`jax.lax.ragged_all_to_all`
+    consumes for the **forward** EP shuffle, where each shard sends its
+    expert-grouped tokens to the shard that owns those experts.
+
+    Parameters
+    ----------
+    all_shards_tokens_per_expert : jnp.ndarray
+        Per-shard, per-expert token counts gathered across the EP axis. Shape
+        ``[num_expert_shards, num_experts]`` and integer dtype.
+    shard_id : jnp.ndarray
+        Index of the current shard along the EP axis (typically
+        :func:`jax.lax.axis_index` of the EP axis). Must be a 0-d integer.
+    num_expert_shards : int
+        Static EP-axis size. Must match
+        ``all_shards_tokens_per_expert.shape[0]``.
+
+    Returns
+    -------
+    input_offsets : jnp.ndarray
+        Shape ``[num_expert_shards]``. Cumulative ``send_sizes`` (with a
+        leading 0) -- where in the local source buffer each destination
+        shard's chunk begins.
+    send_sizes : jnp.ndarray
+        Shape ``[num_expert_shards]``. ``send_sizes[i]`` is the number of
+        tokens this shard sends to shard ``i`` (= the sum of token counts
+        for the experts owned by shard ``i``).
+    output_offsets : jnp.ndarray
+        Shape ``[num_expert_shards]``. ``output_offsets[i]`` is the row in
+        shard ``i``'s receive buffer where this shard's contribution should
+        land. Sender-side semantics, per :func:`jax.lax.ragged_all_to_all`.
+    recv_sizes : jnp.ndarray
+        Shape ``[num_expert_shards]``. ``recv_sizes[i]`` is the number of
+        tokens shard ``i`` sends to this shard.
+    """
+    num_experts = all_shards_tokens_per_expert.shape[1]
+    assert (
+        num_experts % num_expert_shards == 0
+    ), f"num_experts={num_experts} must be divisible by num_expert_shards={num_expert_shards}"
+    local_expert_size = num_experts // num_expert_shards
+
+    # This shard's row of the gathered table, reshaped so axis 0 indexes the
+    # destination shard and axis 1 indexes its local experts.
+    local_tokens_per_expert = jax.lax.dynamic_slice(
+        all_shards_tokens_per_expert,
+        start_indices=(shard_id, 0),
+        slice_sizes=(1, num_experts),
+    ).squeeze(0)
+    local_reshaped = local_tokens_per_expert.reshape(num_expert_shards, local_expert_size)
+
+    # send_sizes[i] = sum of token counts for shard i's experts in our buffer.
+    send_sizes = jnp.sum(local_reshaped, axis=1)
+    input_offsets = jnp.concatenate(
+        [
+            jnp.array([0], dtype=send_sizes.dtype),
+            jnp.cumsum(send_sizes)[:-1],
+        ]
+    )
+
+    # recv_sizes[i] = how many tokens shard i sends to this shard, i.e. the
+    # sum across our local-expert columns of shard i's row.
+    local_expert_start = shard_id * local_expert_size
+    local_expert_columns = jax.lax.dynamic_slice(
+        all_shards_tokens_per_expert,
+        start_indices=(0, local_expert_start),
+        slice_sizes=(num_expert_shards, local_expert_size),
+    )
+    recv_sizes = jnp.sum(local_expert_columns, axis=1)
+
+    # output_offsets uses sender-side semantics for ragged_all_to_all:
+    # output_offsets[j] = row in shard j's buffer where THIS shard's chunk
+    # should be placed. That's the cumulative sum (over source shards 0..j-1)
+    # of how many tokens those earlier source shards already sent to shard j.
+    sends_to_target = jnp.sum(
+        all_shards_tokens_per_expert.reshape(
+            num_expert_shards, num_expert_shards, local_expert_size
+        ),
+        axis=2,
+    )  # [src_shard, dst_shard]
+    zero_row = jnp.zeros((1, num_expert_shards), dtype=sends_to_target.dtype)
+    cumulated = jnp.cumsum(
+        jnp.concatenate([zero_row, sends_to_target], axis=0),
+        axis=0,
+        dtype=sends_to_target.dtype,
+    )  # [src_shard + 1, dst_shard]; row r = total sent by sources 0..r-1
+    output_offsets = jax.lax.dynamic_slice(
+        cumulated,
+        start_indices=(shard_id, 0),
+        slice_sizes=(1, num_expert_shards),
+    ).squeeze(0)
+
+    return input_offsets, send_sizes, output_offsets, recv_sizes
+
+
+def compute_reverse_ragged_all_to_all_params(
+    all_shards_tokens_per_expert: jnp.ndarray,
+    shard_id: jnp.ndarray,
+    num_expert_shards: int,
+) -> Tuple[jnp.ndarray, jnp.ndarray, jnp.ndarray, jnp.ndarray]:
+    """Reverse-direction ragged_all_to_all parameters.
+
+    Mirror of :func:`compute_ragged_all_to_all_params` for the **reverse**
+    EP shuffle that returns expert outputs to their source shards. The
+    sender / receiver roles are swapped: what we received in the forward
+    shuffle we now send back, and vice versa.
+
+    Parameters and shapes are identical to
+    :func:`compute_ragged_all_to_all_params`.
+    """
+    num_experts = all_shards_tokens_per_expert.shape[1]
+    assert (
+        num_experts % num_expert_shards == 0
+    ), f"num_experts={num_experts} must be divisible by num_expert_shards={num_expert_shards}"
+    local_expert_size = num_experts // num_expert_shards
+
+    local_expert_start = shard_id * local_expert_size
+
+    # In reverse, what we received becomes what we send. send_sizes[i] is how
+    # many tokens we send back to source shard i (= what shard i originally
+    # sent us, summed across our local experts).
+    local_expert_columns = jax.lax.dynamic_slice(
+        all_shards_tokens_per_expert,
+        start_indices=(0, local_expert_start),
+        slice_sizes=(num_expert_shards, local_expert_size),
+    )
+    send_sizes = jnp.sum(local_expert_columns, axis=1)
+    input_offsets = jnp.concatenate(
+        [
+            jnp.array([0], dtype=send_sizes.dtype),
+            jnp.cumsum(send_sizes)[:-1],
+        ]
+    )
+
+    # recv_sizes[i] = how many tokens we receive back from shard i (= what
+    # we originally sent to shard i in the forward).
+    local_tokens_per_expert = jax.lax.dynamic_slice(
+        all_shards_tokens_per_expert,
+        start_indices=(shard_id, 0),
+        slice_sizes=(1, num_experts),
+    ).squeeze(0)
+    local_reshaped = local_tokens_per_expert.reshape(num_expert_shards, local_expert_size)
+    recv_sizes = jnp.sum(local_reshaped, axis=1)
+
+    # output_offsets: the reverse sends-to-target matrix is the transpose of
+    # the forward one (row i = what shard i sends in reverse = what shard i
+    # received in forward). Cumsum down source-shard axis, then index our row.
+    fwd_sends_to = jnp.sum(
+        all_shards_tokens_per_expert.reshape(
+            num_expert_shards, num_expert_shards, local_expert_size
+        ),
+        axis=2,
+    )  # forward: [src, dst]
+    rev_sends_to = jnp.transpose(fwd_sends_to)  # reverse: [src, dst]
+    zero_row = jnp.zeros((1, num_expert_shards), dtype=rev_sends_to.dtype)
+    rev_cumulated = jnp.cumsum(
+        jnp.concatenate([zero_row, rev_sends_to], axis=0),
+        axis=0,
+        dtype=rev_sends_to.dtype,
+    )
+    output_offsets = jax.lax.dynamic_slice(
+        rev_cumulated,
+        start_indices=(shard_id, 0),
+        slice_sizes=(1, num_expert_shards),
+    ).squeeze(0)
+
+    return input_offsets, send_sizes, output_offsets, recv_sizes
+
+
+# -----------------------------------------------------------------------------
+# Local permute / unpermute
+# -----------------------------------------------------------------------------
+#
+# After the forward ragged_all_to_all the receive buffer is laid out as
+# ``[from_shard_0_chunk | from_shard_1_chunk | ... ]`` and within each chunk
+# tokens are sorted by local-expert id. To feed ``grouped_dense`` we want
+# ``[expert_0_block | expert_1_block | ... ]`` where each expert's block
+# contains tokens from every source shard. ``local_permute_after_a2a``
+# performs that reorder; ``local_unpermute_before_a2a`` undoes it before the
+# reverse ragged_all_to_all.
+#
+# Implementation uses :func:`sort_chunks_by_index`, which is Triton-backed
+# (see ``transformer_engine.jax.triton_extensions.permutation``) and has a
+# paired custom-VJP backward. There is no pure-JAX alternative here -- the
+# global :func:`pure_jax_token_dispatch` / :func:`token_dispatch` choice is
+# unaffected by this; only the (small) post-A2A chunk reorder uses Triton
+# unconditionally.
+
+
+def local_permute_after_a2a(
+    x_recv: jnp.ndarray,
+    all_shards_tokens_per_expert: jnp.ndarray,
+    shard_id: jnp.ndarray,
+    num_expert_shards: int,
+) -> Tuple[jnp.ndarray, jnp.ndarray, dict]:
+    """Reorder tokens received via ragged_all_to_all so each local expert's
+    tokens are contiguous.
+
+    This is the EP-side complement to the global :func:`token_dispatch` /
+    :func:`pure_jax_token_dispatch`. Internally uses
+    :func:`sort_chunks_by_index` (Triton-backed) for both the forward sort
+    and -- via :func:`local_unpermute_before_a2a` -- the inverse.
+
+    Parameters
+    ----------
+    x_recv : jnp.ndarray
+        Output of the forward ``ragged_all_to_all`` of shape
+        ``[buffer_size, hidden_size]``. Layout: source-shard major, then
+        local-expert id within each source chunk.
+    all_shards_tokens_per_expert : jnp.ndarray
+        Per-shard, per-expert token counts of shape
+        ``[num_expert_shards, num_experts]``.
+    shard_id : jnp.ndarray
+        Current EP shard index (typically a traced
+        :func:`jax.lax.axis_index`).
+    num_expert_shards : int
+        Static EP-axis size.
+
+    Returns
+    -------
+    sorted_x : jnp.ndarray
+        Tokens reordered into expert-major layout. Same shape as ``x_recv``.
+    local_group_sizes : jnp.ndarray
+        Per-local-expert token counts of shape ``[local_expert_size]``.
+    state : dict
+        Opaque state for :func:`local_unpermute_before_a2a`.
+    """
+    num_experts = all_shards_tokens_per_expert.shape[1]
+    assert (
+        num_experts % num_expert_shards == 0
+    ), f"num_experts={num_experts} must be divisible by num_expert_shards={num_expert_shards}"
+    local_expert_size = num_experts // num_expert_shards
+    local_expert_start = shard_id * local_expert_size
+    local_expert_columns = jax.lax.dynamic_slice(
+        all_shards_tokens_per_expert,
+        start_indices=(0, local_expert_start),
+        slice_sizes=(num_expert_shards, local_expert_size),
+    )
+
+    # Flat sizes in source-major order, matching the receive buffer layout:
+    # [(s0,e0), (s0,e1), ..., (s1,e0), (s1,e1), ...]
+    split_sizes = local_expert_columns.reshape(-1)
+
+    # Permutation that maps source-major -> expert-major:
+    #   original index = s * E_local + e
+    #   target   index = e * num_shards + s
+    indices_matrix = jnp.arange(num_expert_shards * local_expert_size, dtype=jnp.int32).reshape(
+        num_expert_shards, local_expert_size
+    )
+    sorted_chunk_indices = indices_matrix.T.reshape(-1)
+
+    sorted_x, _ = sort_chunks_by_index(x_recv, split_sizes, sorted_chunk_indices)
+    sorted_split_sizes = split_sizes[sorted_chunk_indices]
+    inverse_chunk_indices = jnp.argsort(sorted_chunk_indices)
+    local_group_sizes = jnp.sum(local_expert_columns, axis=0)
+    state = {
+        "sorted_split_sizes": sorted_split_sizes,
+        "inverse_chunk_indices": inverse_chunk_indices,
+    }
+    return sorted_x, local_group_sizes, state
+
+
+def local_unpermute_before_a2a(
+    expert_outputs: jnp.ndarray,
+    state: dict,
+) -> jnp.ndarray:
+    """Inverse of :func:`local_permute_after_a2a`.
+
+    Parameters
+    ----------
+    expert_outputs : jnp.ndarray
+        Output of the local expert FFN of shape ``[buffer_size, hidden_size]``,
+        in expert-major layout.
+    state : dict
+        Opaque state returned by :func:`local_permute_after_a2a`.
+
+    Returns
+    -------
+    unsorted_x : jnp.ndarray
+        Tokens reordered back into source-shard-major layout, ready for the
+        reverse ``ragged_all_to_all``. Same shape as ``expert_outputs``.
+    """
+    out, _ = sort_chunks_by_index(
+        expert_outputs,
+        state["sorted_split_sizes"],
+        state["inverse_chunk_indices"],
+    )
+    return out
diff --git a/transformer_engine/jax/sharding.py b/transformer_engine/jax/sharding.py
index 9b13412c14..182a4a2e00 100644
--- a/transformer_engine/jax/sharding.py
+++ b/transformer_engine/jax/sharding.py
@@ -332,6 +332,7 @@ class MeshResource:
         fsdp_resource: Axis name for full-sharded data parallelism, default is None
         pp_resource: Axis name for pipeline parallelism (layer sharding), default is None
         cp_resource: Axis name for context parallelism (sequence sharding), default is None
+        ep_resource: Axis name for expert parallelism (MoE expert sharding), default is None
     """
 
     dp_resource: str = None
@@ -340,6 +341,7 @@ class MeshResource:
     fsdp_resource: str = None
     pp_resource: str = None
     cp_resource: str = None
+    ep_resource: str = None
 
 
 _GLOBAL_MESH_RESOURCE = None
@@ -379,6 +381,38 @@ def global_mesh_resource() -> MeshResource:
     return _GLOBAL_MESH_RESOURCE
 
 
+def get_active_resource_axis(resource_name: str) -> Optional[str]:
+    """Resolve a :class:`MeshResource` attribute to its mesh axis name,
+    or return ``None`` if that resource is not active.
+
+    "Active" means all three are true:
+
+    * a physical mesh is set (``is_mesh_available()``),
+    * the ``MeshResource`` attribute is non-``None``,
+    * the corresponding mesh axis has more than 1 device.
+
+    Mirrors the three-step ``is_X_enabled`` idiom in
+    :func:`get_sharding_map_logic_axis_to_mesh_axis` but returns the
+    axis name itself (or ``None``) so callers can use it directly in
+    collectives / ``shard_map`` specs.
+
+    Args:
+        resource_name: Attribute name on :class:`MeshResource`, e.g.
+            ``"fsdp_resource"`` or ``"ep_resource"``.
+
+    Returns:
+        The mesh axis name when active, else ``None``.
+    """
+    if not is_mesh_available():
+        return None
+    if _GLOBAL_MESH_RESOURCE is None:
+        return None
+    axis = getattr(_GLOBAL_MESH_RESOURCE, resource_name)
+    if axis is None or get_mesh_axis_size(axis) <= 1:
+        return None
+    return axis
+
+
 def all_reduce_sum_along_dp_fsdp(x: jnp.array, mesh: jax.sharding.Mesh):
     """Perform all-reduce sum operation along data parallelism and FSDP axes.