AztecProtocol · AztecBot · May 30, 2026 · May 30, 2026 · May 30, 2026 · May 30, 2026
diff --git a/barretenberg/ts/dev/msm-webgpu/main.ts b/barretenberg/ts/dev/msm-webgpu/main.ts
diff --git a/barretenberg/ts/dev/msm-webgpu/scripts/run-browserstack.mjs b/barretenberg/ts/dev/msm-webgpu/scripts/run-browserstack.mjs
@@ -60,6 +60,8 @@ const { values: argv } = parseArgs({
     "skip-tunnel": { type: "boolean", default: false },
     "list-targets": { type: "boolean", default: false },
     autorun: { type: "string", default: "msm-cross-check" },
+    accum: { type: "string" },
+    extra: { type: "string" },
     "emit-body-only": { type: "boolean", default: false },
     "external-worker-id-file": { type: "string" },
     help: { type: "boolean", default: false },
@@ -417,7 +419,13 @@ async function main() {
   qp.set("autorun", argv.autorun);
   qp.set("logn", String(argv.n ?? "16"));
   if (argv.reps) qp.set("reps", String(argv.reps));
-  const pageUrl = `${baseUrl}${pageMap[argv.page]}?${qp.toString()}`;
+  if (argv.accum) qp.set("accum", String(argv.accum));
+  let extraQs = "";
+  if (argv.extra) {
+    // Raw extra query params (e.g. "inv=loop&s=4"), appended verbatim.
+    extraQs = (argv.extra.startsWith("&") ? "" : "&") + String(argv.extra);
+  }
+  const pageUrl = `${baseUrl}${pageMap[argv.page]}?${qp.toString()}${extraQs}`;
   err(`page URL: ${pageUrl}`);
 
   // Generate a runId on the client side (the page makes its own random

diff --git a/barretenberg/ts/src/msm_webgpu/COOP_WALKER_DESIGN.md b/barretenberg/ts/src/msm_webgpu/COOP_WALKER_DESIGN.md
@@ -0,0 +1,135 @@
+# Cooperative-inversion bucket accumulator ("coop-walker")
+
+Re-architecture of the MSM bucket-accumulate stage for laptop + mobile GPUs.
+Grounded in the measured ground truth that the stream-walker accumulate kernel
+is **memory-bound / occupancy-limited**, not inversion-bound.
+
+## Measured starting point (not re-derived)
+
+- The stream-walker accumulate kernel is extremely memory-bound on real
+  hardware. safegcd inversion *looks* like ~47% of the walker wall only
+  because memory stalls dilate it — the identical safegcd is <30% of MsmV2,
+  which is not memory-starved. Lever = **hide memory latency**
+  (occupancy / coalescing / fewer dependent gathers), not cheaper inversion.
+- The stream-walker is per-thread bucket-monotonic. Each thread serially
+  walks a contiguous bucket range carrying **S** independent slot accumulators
+  in private registers, and stages forward-prefix products through a
+  `var<workgroup> pref_scratch` sized `TPB*S*2` vec4 = **16 KB at TPB=64,S=8**.
+  That long per-thread serial dependency chain + the 16 KB workgroup footprint
+  are why occupancy is low and memory latency is not hidden.
+- Mobile reality: 16 KB workgroup memory (Mali) / 32 KB (Apple, Adreno);
+  only 10 storage buffers per stage on many mobile adapters; Android Chrome
+  has no timestamp-query (wall-time only).
+
+## Why the walker is occupancy-starved
+
+Two coupled costs both scale with **S** (slots per thread):
+
+1. **Register pressure.** Per slot the walker keeps `acc_x[8] + acc_y[8]`
+   (16 u32) plus 8 bookkeeping `array<u32,S>` (cursor, bucket_end,
+   task_end_sort, task_end_cur, cur_sorted, cur_bucket, is_first, slot_done,
+   split_start). At S=8 that is ~150+ live registers per invocation → few
+   resident invocations → memory latency is exposed.
+2. **Workgroup memory.** `pref_scratch = TPB*S*2` vec4 = 16 KB at TPB=64,S=8.
+   On Mali (16 KB total shared) this caps the core to **one resident
+   workgroup**. No second workgroup means barriers and dependent gathers in
+   the resident workgroup stall the whole core.
+
+Shrinking S (the sibling "S-sweep") trades inversion amortization for
+occupancy but leaves the *structure* — long per-thread serial chain, per-slot
+carried state — intact. This design changes the structure instead.
+
+## The structural change: share one inversion across the workgroup
+
+Set **slots-per-thread = 1**. Each thread is a plain serial walker over one
+contiguous slice of the sorted bucket stream (reusing the existing
+`thread_cuts` partition unchanged). The batched-inversion that made affine
+adds cheap is moved from *per-thread over S slots* to *per-workgroup over TPB
+threads*:
+
+- Each round, every active thread produces exactly one `dx` for its pending
+  affine add (a retired thread contributes `dx = R`, Montgomery one, which is
+  inert).
+- The workgroup computes the batch inverse of the TPB `dx` values
+  cooperatively: an exclusive **prefix-product scan** and an exclusive
+  **suffix-product scan** in workgroup memory, then a **single** safegcd
+  inversion of the workgroup-wide product (one thread), then
+  `inv_dx_t = inv_total * pre[t] * suf[t]`.
+- Each thread applies its affine add with its `inv_dx_t` and advances.
+
+### What this buys, on every axis the ground truth cares about
+
+| Axis | stream-walker (TPB=64,S=8) | coop-walker (TPB=64,S=1) |
+|---|---|---|
+| Live registers / invocation | ~150+ (scales with S) | ~20 (one accumulator) |
+| Workgroup memory | 16 KB (`TPB*S*2` vec4) | ~6 KB (dx + pre + suf, `3*TPB*2` vec4) |
+| Independent adds in flight / round | S=8 per thread | TPB=64 per workgroup |
+| safegcd inversions | ≈ total_adds / S | ≈ total_adds / TPB (**~8× fewer**) |
+| Mali resident workgroups / core | 1 (16 KB cap) | ≥2 (6 KB) |
+
+Lower registers + lower workgroup memory → **higher occupancy** → more
+resident workgroups to hide memory latency (the MsmV2 win) while still
+**streaming** each point from global memory exactly once (the walker memory
+footprint — no pair-tree materialization). The cooperative scan adds
+`2*log2(TPB)` barriers per round, but with high occupancy those barriers are
+hidden by sibling workgroups — exactly the latency-hiding regime MsmV2 proves
+is reachable on this hardware.
+
+Fewer total inversions (~8×) is a bonus, not the point: the wall is memory,
+and a shorter per-invocation serial chain with far more resident invocations
+is what hides it.
+
+## I/O contract (drop-in for the existing pipeline)
+
+The coop kernel replaces only the `stream_walker` accumulate dispatch. It
+reuses the entire surrounding pipeline (decompose → transpose → planner →
+reduce → `walker_partials_index` → `walker_combine`) and keeps the exact same
+output contract:
+
+- A bucket fully owned within one thread's range → full EC sum written to
+  `bucket_sums[bucket_id]`, no partial.
+- A bucket split across a thread boundary → each thread writes its piece's
+  partial-sum to a unique slot (`2*t+0` split-start suffix, `2*t+1` task-end
+  prefix) with `partial_dest[slot] = bucket_id`; `walker_combine` sums them.
+- Unused partial slots → `partial_dest = NO_BUCKET`.
+
+Because there is no S sub-split, the coop kernel emits **fewer** partials than
+the walker (boundaries only at thread cuts, not task cuts), which also reduces
+exposure to the known `walker_combine` `dx==0` incomplete-affine-add bug.
+
+## Status
+
+- [x] Headless-SwiftShader GPU-vs-noble cross-check harness
+      (`autorun=msm-noble`), GREEN at logn=8 and logn=10 for walker and coop.
+- [x] coop-walker kernel + host wiring (selectable via `accum` knob, with the
+      inversion-granularity knob `G`).
+- [x] cross-check coop at logn 8/10, multiple configs incl. `accum:'auto'`.
+- [x] BrowserStack real-hardware A/B vs the stream-walker on real Apple M2,
+      Adreno (S25 Ultra), and Mali (Pixel 9 Pro XL). See PR #23739 for the
+      tables.
+
+### Measured outcome (the design's prediction was half-right)
+
+The occupancy thesis holds **on Adreno**: coop at **G=1** (each thread inverts
+its own dx — no workgroup memory, no in-loop barriers, one accumulator/thread,
+maximal occupancy) is **1.67–2.05× faster than the stream-walker** on a Galaxy
+S25 Ultra across logN 12/14/16. The win comes purely from occupancy hiding
+memory latency — and it does so *despite* G=1 doing ~S× MORE safegcd inversions
+than the walker's S-wide batch (the design's "fewer inversions" via the
+workgroup scan is irrelevant; the scan, G=TPB, is in fact the slowest coop mode
+and triggers a device-lost on Adreno at logN≥14).
+
+It does **not** generalise: on Mali (Pixel 9 Pro XL) coop G=1 wins only at logN
+12 and regresses at 14/16, and on Apple M2 it loses above logN 12 — neither
+hides G=1's extra inversions the way Adreno does. So `accum:'auto'` selects
+coop G=1 **only on Adreno/Qualcomm** and keeps the walker everywhere else.
+
+## Alternatives considered (documented, not pursued first)
+
+- **Stage points in bucket-sorted order** to remove the two-hop dependent
+  gather (`l0_index[cursor]` → `point_x[2*pt]`) from the hot loop and coalesce
+  reads. Rejected as the first move because the staging buffer (~n·64 B) adds
+  memory and a full extra streaming pass on an already memory-bound kernel;
+  worth revisiting as a workgroup-memory tile rather than a global buffer.
+- **Drop Montgomery form** for the modest muls/element. Orthogonal to the
+  occupancy problem; not the lever.
diff --git a/barretenberg/ts/src/msm_webgpu/cuzk/gpu.ts b/barretenberg/ts/src/msm_webgpu/cuzk/gpu.ts
@@ -48,9 +48,28 @@ export const get_device = async (): Promise<GPUDevice> => {
 
   const device = await adapter.requestDevice({ requiredFeatures, requiredLimits });
   const grantedLimits = device.limits as unknown as Record<string, number>;
+  // Stash the adapter's info for the per-device kernel selection in
+  // `resolveAccum` (the bucket-accumulate kernel choice keys off
+  // vendor/architecture). Newer Chrome exposes a read-only `device.adapterInfo`
+  // getter (which resolveAccum reads first); on engines without it we keep a
+  // copy under a private key. Never assign to `adapterInfo` — it is getter-only
+  // where present and assignment throws.
+  const adapterInfo =
+    (adapter as unknown as { info?: GPUAdapterInfo }).info ??
+    (typeof (adapter as unknown as { requestAdapterInfo?: () => Promise<GPUAdapterInfo> }).requestAdapterInfo === 'function'
+      ? await (adapter as unknown as { requestAdapterInfo: () => Promise<GPUAdapterInfo> }).requestAdapterInfo()
+      : undefined);
+  if (adapterInfo) {
+    try {
+      (device as unknown as { __adapterInfo?: GPUAdapterInfo }).__adapterInfo = adapterInfo;
+    } catch {
+      // ignore — resolveAccum falls back to the native device.adapterInfo getter
+    }
+  }
   console.log(
     `[gpu] requested maxComputeWorkgroupStorageSize=${wgStorageMax}B,` +
-      ` granted=${grantedLimits['maxComputeWorkgroupStorageSize']}B`,
+      ` granted=${grantedLimits['maxComputeWorkgroupStorageSize']}B` +
+      ` adapter="${adapterInfo?.vendor ?? ''}/${adapterInfo?.architecture ?? ''}"`,
   );
   return device;
 };

diff --git a/barretenberg/ts/src/msm_webgpu/cuzk/shader_manager.ts b/barretenberg/ts/src/msm_webgpu/cuzk/shader_manager.ts
@@ -20,6 +20,7 @@ import {
   ba_partial_sum as ba_partial_sum_shader,
   // Stream-walker (STREAM_WALKER_PLAN.md §6, plus C's KNOB 2 variant).
   ba_planner_partition_task as ba_planner_partition_task_shader,
+  ba_coop_walker as ba_coop_walker_shader,
   ba_stream_walker as ba_stream_walker_shader,
   ba_walker_combine as ba_walker_combine_shader,
   ba_walker_partials_index as ba_walker_partials_index_shader,
@@ -929,6 +930,48 @@ ${packLines.join('\n')}
     );
   }
 
+  // Cooperative-inversion bucket accumulator (coop-walker). Drop-in for the
+  // stream_walker dispatch (same bind group + indirect args): one task per
+  // thread, with the batched inversion shared across the workgroup via a
+  // prefix/suffix product scan + a single safegcd inversion per round.
+  public gen_ba_coop_walker_shader(
+    workgroup_size: number,
+    s: number,
+    variant: 'loop' | 'pk' = 'pk',
+    g: number = workgroup_size,
+  ): string {
+    const dec = this.decoupledPackUnpackWgsl();
+    const inverse_funcs = by_inverse_loop_funcs;
+    const inv_fn = variant === 'pk' ? 'fr_inv_by_loop_pk' : 'fr_inv_by_loop';
+    const { p8_consts, r8_csv, f8_words } = this.f8Context();
+    // Inversion granularity: G==TPB -> cooperative prefix/suffix scan;
+    // 1<G<TPB -> per-group serial Montgomery batch inversion; G==1 -> each
+    // thread inverts its own dx (no workgroup memory, no barriers).
+    const gClamped = Math.max(1, Math.min(g, workgroup_size));
+    const coop_local = gClamped === 1;
+    const coop_scan = gClamped >= workgroup_size;
+    const coop_group = !coop_local && !coop_scan;
+    return mustache.render(
+      ba_coop_walker_shader,
+      {
+        workgroup_size, s, inv_fn, g: gClamped,
+        coop_local, coop_scan, coop_group,
+        p8_consts, r8_csv, f8_words,
+        word_size: this.word_size, num_words: this.num_words, n0: this.n0,
+        p_limbs: this.p_limbs, r_limbs: this.r_limbs, r_cubed_limbs: this.r_cubed_limbs,
+        p_minus_2_limbs: this.p_minus_2_limbs, mask: this.mask,
+        two_pow_word_size: this.two_pow_word_size, p_inv_mod_2w: this.p_inv_mod_2w,
+        p_inv_by_a_lo: this.p_inv_by_a_lo,
+        dec_unpack: dec.unpack, dec_pack: dec.pack, recompile: this.recompile,
+      },
+      {
+        structs, bigint_funcs,
+        montgomery_product_funcs: this.mont_product_src,
+        field_funcs, field8_funcs, fr_pow_funcs, bigint_by_funcs, inverse_funcs,
+      },
+    );
+  }
+
   // Stream-walker partials indexer (task #19): one thread per partial slot.
   // Builds a per-bucket linked list in (bucket_head, nodes_slot, nodes_next)
   // using atomicCompareExchange — replaces walker_combine's O(num_dense ×

diff --git a/barretenberg/ts/src/msm_webgpu/msm_v2.ts b/barretenberg/ts/src/msm_webgpu/msm_v2.ts
@@ -64,6 +64,26 @@ export interface MsmConfig {
   profile?: boolean;
   /** Phase-2 hook — Jacobian-crossover threshold. Accepted but inert in Phase 1. */
   jacobianCrossover?: number;
+  /**
+   * Bucket-accumulate kernel.
+   * - 'walker' = per-thread S-slot stream-walker (each thread inverts its own
+   *   S-wide batch in parallel with every other thread).
+   * - 'coop' = cooperative-inversion accumulator (one task per thread, a single
+   *   safegcd inversion shared across the workgroup via a prefix/suffix scan).
+   * - 'auto' (default) = pick per device: 'coop' on memory-/register-starved
+   *   mobile GPUs (Adreno/Mali), 'walker' on cache-rich desktop GPUs. See
+   *   {@link resolveAccum}.
+   * Drop-in: all reuse the same bind group, indirect dispatch, and combine path.
+   */
+  accum?: 'walker' | 'coop' | 'auto';
+  /**
+   * Cooperative-inversion granularity for `accum: 'coop'`: number of threads
+   * that share ONE batched inversion. `WALKER_TPB` (default) = workgroup-wide
+   * prefix/suffix scan; `1 < G < TPB` = per-group serial Montgomery batch
+   * inversion (TPB/G concurrent inversions); `1` = each thread inverts its own
+   * dx (no workgroup memory, no barriers). Must divide the workgroup size.
+   */
+  coopG?: number;
   /**
    * Discarded warm-up `run()`s in `create()` — they ramp the GPU clock and pay
    * the shader-JIT / command-buffer cold start before the first timed run.
@@ -406,6 +426,55 @@ function pickReduceWg(c: number): number {
   return c <= 9 ? 32 : c <= 12 ? 64 : 128;
 }
 
+// `auto` selects the cooperative-inversion accumulator with G=1 only on Adreno
+// (Qualcomm) GPUs, where it is measured fastest; everything else uses the
+// walker. The choice is data-driven, not a blanket "mobile → coop":
+//   - Adreno (Galaxy S25 Ultra, Chrome 145): coop G=1 runs 1.67–2.05× faster
+//     than the stream-walker across logN 12/14/16, speedup decaying monotonically
+//     with G (g8≈1.5×, g16≈1.2–1.3×, g32≈parity). G=1 = each thread inverts its
+//     own dx: no workgroup memory, no in-loop barriers, one accumulator/thread,
+//     so occupancy is maximal — which on Adreno hides the cost of doing ~S× more
+//     safegcd inversions than the walker's S-wide batch. The workgroup-scan
+//     default (G=TPB) is the WORST coop mode there and triggered a device-lost
+//     at logN≥14.
+//   - Mali (Pixel 9 Pro XL, Chrome 145): coop G=1 wins only at logN 12 (1.31×)
+//     and REGRESSES at logN 14/16 (0.87×, 0.79×) — Mali does not hide G=1's ~S×
+//     extra inversions. So Mali stays on the walker.
+//   - Cache-rich desktop GPUs (Apple M2): the walker wins outright. Walker is
+//     never selected against here.
+// G=1 is the only coop granularity `auto` ever ships; the workgroup-scan default
+// (G=TPB=64) is kept only for explicit `accum:'coop'` benchmarking.
+const COOP_G_AUTO = 1;
+const COOP_G_DEFAULT = 64;
+
+// True for the GPU family where coop G=1 is measured fastest (Adreno/Qualcomm).
+// Newer Chrome exposes a read-only `device.adapterInfo` getter; engines without
+// it get the copy stashed by `get_device` under `__adapterInfo`.
+function coopWinsHere(device: GPUDevice): { coopWins: boolean; hay: string } {
+  const info = ((device as unknown as { adapterInfo?: GPUAdapterInfo }).adapterInfo ??
+    (device as unknown as { __adapterInfo?: GPUAdapterInfo }).__adapterInfo ??
+    {}) as Partial<GPUAdapterInfo>;
+  const hay = `${info.vendor ?? ''} ${info.architecture ?? ''} ${info.device ?? ''} ${info.description ?? ''}`.toLowerCase();
+  return { coopWins: /adreno|qualcomm|snapdragon/.test(hay), hay };
+}
+
+// Resolve the bucket-accumulate kernel + inversion granularity for this device.
+// Explicit `accum` is honoured (G defaults to the scan unless overridden); `auto`
+// (the default) picks coop G=1 on Adreno (measured 1.67–2.05× over the walker)
+// and the walker everywhere else. An explicit `coopG` always wins.
+function resolveAccum(
+  requestedAccum: 'walker' | 'coop' | 'auto' | undefined,
+  requestedG: number | undefined,
+  device: GPUDevice,
+): { accum: 'walker' | 'coop'; coopG: number } {
+  if (requestedAccum === 'walker') return { accum: 'walker', coopG: requestedG ?? COOP_G_DEFAULT };
+  if (requestedAccum === 'coop') return { accum: 'coop', coopG: requestedG ?? COOP_G_DEFAULT };
+  const { coopWins, hay } = coopWinsHere(device);
+  const accum: 'walker' | 'coop' = coopWins ? 'coop' : 'walker';
+  console.log(`[MsmV2] accum=auto -> '${accum}'${accum === 'coop' ? ` G=${requestedG ?? COOP_G_AUTO}` : ''} (adapter: "${hay.trim()}")`);
+  return { accum, coopG: requestedG ?? COOP_G_AUTO };
+}
+
 // Per-level GPU dispatch wiring for one prepared scalar set.
 interface LevelBind {
   plannerABind: GPUBindGroup;
@@ -1265,6 +1334,10 @@ export class MsmV2 {
   private streamWalkerPipe!: GPUComputePipeline;
   private streamWalkerLayout!: GPUBindGroupLayout;
   private streamWalkerBind!: GPUBindGroup;
+  // Cooperative-inversion accumulator (reuses streamWalkerLayout + bind).
+  private coopWalkerPipe!: GPUComputePipeline;
+  private accum: 'walker' | 'coop' = 'walker';
+  private coopG = 64;
   private walkerCombinePipe!: GPUComputePipeline;
   private walkerCombineLayout!: GPUBindGroupLayout;
   private walkerCombineBind!: GPUBindGroup;
@@ -1384,6 +1457,9 @@ export class MsmV2 {
     m.invVariant = config?.invVariant ?? DEFAULT_INV_VARIANT;
     m.addsub = config?.addsub ?? 'native';
     m.jacobianCrossover = config?.jacobianCrossover ?? 0;
+    const resolved = resolveAccum(config?.accum, config?.coopG, device);
+    m.accum = resolved.accum;
+    m.coopG = resolved.coopG;
     m.combineOnHost = config?.combineOnHost ?? true;
     const wantProfile = config?.profile ?? false;
     m.profile = wantProfile && device.features.has('timestamp-query');
@@ -1626,6 +1702,13 @@ export class MsmV2 {
     m.streamWalkerPipe = await compile(
       sm.gen_ba_stream_walker_shader(WALKER_TPB, STREAM_S, INV_VARIANT),
       `stream-walker`, m.streamWalkerLayout);
+    // coop-walker shares the indirect-dispatch grain (ceil(num_active/TPB))
+    // and the stream-walker bind group; only compiled when selected.
+    if (m.accum === 'coop') {
+      m.coopWalkerPipe = await compile(
+        sm.gen_ba_coop_walker_shader(WALKER_TPB, STREAM_S, INV_VARIANT, m.coopG),
+        `coop-walker`, m.streamWalkerLayout);
+    }
     m.walkerCombinePipe = await compile(
       sm.gen_ba_walker_combine_shader(STREAM_S, INV_VARIANT),
       `walker-combine`, m.walkerCombineLayout);
@@ -2369,7 +2452,8 @@ export class MsmV2 {
       // partition_task wrote the walker's indirect args to planner_meta[15..17]
       // (= byte offset 60 = 15 * 4).
       setPhase('stream_walker');
-      indirectDispatch(this.streamWalkerPipe, this.streamWalkerBind, spMeta, 15 * 4);
+      const accumPipe = this.accum === 'coop' ? this.coopWalkerPipe : this.streamWalkerPipe;
+      indirectDispatch(accumPipe, this.streamWalkerBind, spMeta, 15 * 4);
       // Task #19: per-bucket linked-list index (atomic CAS pass over
       // partial_dest) replaces walker_combine's O(num_dense × M_partials)
       // scan with O(M_partials) indexing + O(num_partials_per_bucket) walks.