From cca31292cfed59388173b0eda74fb4093e8b34cb Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sat, 16 May 2026 13:03:28 +0200
Subject: [PATCH 1/6] Run RISC-V tests with multiple RVV QEMU configurations

Since RISC-V allows hardware implementations to use different vector
lengths (similar to ARM SVE), we want to make sure that we test
multiple configurations. Luckily, QEMU lets us simply set a
vlen=<128,256,512,...> parameter in QEMU_CPU to emulate different
vector lengths.
---
 .github/workflows/_test_riscv.yml | 11 +++++++++--
 .github/workflows/riscv64.yml     | 14 ++++++++++++++
 examples/riscv/run.sh             |  4 ++++
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_test_riscv.yml b/.github/workflows/_test_riscv.yml
index da19dfc9bda..e7a93a36d10 100644
--- a/.github/workflows/_test_riscv.yml
+++ b/.github/workflows/_test_riscv.yml
@@ -27,6 +27,10 @@ on:
         required: false
         type: boolean
         default: false
+      qemu-cpu:
+        description: 'Configuration(s) for the CPU to emulate with QEMU, expecting a JSON array'
+        required: true
+        type: string
       gcc-version:
         description: 'The version of GCC to use'
         required: false
@@ -52,5 +56,8 @@ jobs:
         source .ci/scripts/utils.sh
         install_executorch "--use-pt-pinned-commit"
 
-        export GCC_VERSION=${{ inputs.gcc-version }}
-        bash .ci/scripts/test_riscv_qemu.sh --model="${{ inputs.model }}" ${{ inputs.xnnpack && '--xnnpack' || '' }} ${{ inputs.quantize && '--quantize' || '' }}
+        echo '${{ inputs.qemu-cpu }}' | jq -r '.[]' | while IFS= read -r qemu_cpu; do
+          export QEMU_CPU="${qemu_cpu}"
+          export GCC_VERSION=${{ inputs.gcc-version }}
+          bash .ci/scripts/test_riscv_qemu.sh --model="${{ inputs.model }}" ${{ inputs.xnnpack && '--xnnpack' || '' }} ${{ inputs.quantize && '--quantize' || '' }}
+        done
diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml
index 9823db09cc1..d7beae7dc8e 100644
--- a/.github/workflows/riscv64.yml
+++ b/.github/workflows/riscv64.yml
@@ -50,6 +50,20 @@ jobs:
       model: ${{ matrix.model }}
       xnnpack: ${{ matrix.xnnpack }}
       quantize: ${{ matrix.quantize }}
+      # With XNNPACK, test multiple RVV vector lengths; otherwise run with RVV disabled (v=false)
+      qemu-cpu: >-
+        ${{
+          case(
+            matrix.xnnpack, '[
+              "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=128,elen=64,vext_spec=v1.0",
+              "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=256,elen=64,vext_spec=v1.0",
+              "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=512,elen=64,vext_spec=v1.0"
+            ]',
+            '[
+              "rv64,zba=true,zbb=true,zbs=true,v=false"
+            ]'
+          )
+        }}
       # XNNPACK requires GCC 14+
       gcc-version: ${{ matrix.xnnpack && 14 || 11 }}
       docker-image: ${{ matrix.xnnpack && 'ci-image:executorch-ubuntu-24.04-gcc14' || 'ci-image:executorch-ubuntu-22.04-gcc11' }}
diff --git a/examples/riscv/run.sh b/examples/riscv/run.sh
index 644944ab8a4..d6e86031ac9 100755
--- a/examples/riscv/run.sh
+++ b/examples/riscv/run.sh
@@ -111,6 +111,10 @@ hash "${qemu}" 2>/dev/null || {
 # linker (ld-linux-riscv64-lp64d.so.1) referenced in the ELF resolves.
 export QEMU_LD_PREFIX="${QEMU_LD_PREFIX:-/usr/riscv64-linux-gnu}"
 
+if [[ -n "${QEMU_CPU+x}" ]]; then
+    echo "[run.sh] QEMU_CPU=${QEMU_CPU}"
+fi
+
 log_file=$(mktemp)
 trap 'rm -f "${log_file}"' EXIT
 

From 7eba60a0d4d28ba95b7c6d818f39d20e17a3bb04 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sat, 16 May 2026 13:29:15 +0200
Subject: [PATCH 2/6] Add XNNPACK coverage instrumentation for riscv64

---
 .ci/scripts/test_riscv_qemu.sh         |  15 +-
 .github/workflows/_test_riscv.yml      |  28 ++-
 backends/xnnpack/CMakeLists.txt        |   6 +
 examples/riscv/etdump_summary.py       | 228 +++++++++++++++++++++++++
 examples/riscv/run.sh                  |  25 ++-
 tools/cmake/preset/riscv64_linux.cmake |  12 ++
 6 files changed, 306 insertions(+), 8 deletions(-)
 create mode 100644 examples/riscv/etdump_summary.py

diff --git a/.ci/scripts/test_riscv_qemu.sh b/.ci/scripts/test_riscv_qemu.sh
index 0d8b2815f74..d1998561553 100755
--- a/.ci/scripts/test_riscv_qemu.sh
+++ b/.ci/scripts/test_riscv_qemu.sh
@@ -18,15 +18,18 @@ model="add"
 xnnpack=false
 quantize=false
 verbose=false
+verbose_xnnpack=false
 
 usage() {
     cat <<EOF
 Usage: $(basename "$0") [options]
 Options:
-  --model=<NAME>  Which model to export and run (default: add)
-  --xnnpack       Enable the XNNPACK backend (AOT partitioner + runtime)
-  --quantize      Produce an 8-bit quantized model
-  -h, --help      Show this help
+  --model=<NAME>     Which model to export and run (default: add)
+  --xnnpack          Enable the XNNPACK backend (AOT partitioner + runtime)
+  --quantize         Produce an 8-bit quantized model
+  --verbose          Enable XNNPACK partitioner DEBUG logging and dump the lowered graph
+  --verbose-xnnpack  Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch
+  -h, --help         Show this help
 EOF
 }
 
@@ -36,6 +39,7 @@ for arg in "$@"; do
         --xnnpack) xnnpack=true ;;
         --quantize) quantize=true ;;
         --verbose) verbose=true ;;
+        --verbose-xnnpack) verbose_xnnpack=true ;;
         -h|--help) usage; exit 0 ;;
         *) echo "Unknown option: $arg" >&2; usage; exit 1 ;;
     esac
@@ -51,6 +55,9 @@ fi
 if ${verbose}; then
     run_extra_args+=(--verbose)
 fi
+if ${verbose_xnnpack}; then
+    run_extra_args+=(--verbose-xnnpack)
+fi
 
 bash "${et_root_dir}/examples/riscv/setup.sh"
 bash "${et_root_dir}/examples/riscv/run.sh" --model="${model}" "${run_extra_args[@]}"
diff --git a/.github/workflows/_test_riscv.yml b/.github/workflows/_test_riscv.yml
index e7a93a36d10..4be2732016e 100644
--- a/.github/workflows/_test_riscv.yml
+++ b/.github/workflows/_test_riscv.yml
@@ -59,5 +59,31 @@ jobs:
         echo '${{ inputs.qemu-cpu }}' | jq -r '.[]' | while IFS= read -r qemu_cpu; do
           export QEMU_CPU="${qemu_cpu}"
           export GCC_VERSION=${{ inputs.gcc-version }}
-          bash .ci/scripts/test_riscv_qemu.sh --model="${{ inputs.model }}" ${{ inputs.xnnpack && '--xnnpack' || '' }} ${{ inputs.quantize && '--quantize' || '' }}
+          bash .ci/scripts/test_riscv_qemu.sh \
+            --model="${{ inputs.model }}" \
+            ${{ inputs.xnnpack && '--xnnpack --verbose-xnnpack' || '' }} \
+            ${{ inputs.quantize && '--quantize' || '' }}
+
+          # Append a markdown report (rows sorted by sum_ms) to the job summary; the JSON only exists for --verbose-xnnpack runs.
+          etdump_json="riscv_test/${{ inputs.model }}_riscv.etdump.json"
+          if [[ -f "${etdump_json}" ]]; then
+            {
+              echo "### Model=${{ inputs.model }} XNNPACK=${{ inputs.xnnpack }} Quantize=${{ inputs.quantize }} QEMU_CPU='${QEMU_CPU}'"
+              jq -r '
+                def r3: (. * 1000 | round) / 1000;
+                ["Section","Op","Count","Sum (ms)","Avg (ms)","Max (ms)","Microkernels"],
+                ["---","---","---","---","---","---","---"],
+                ( [ (.ops[]       | . + {section: "ops"}),
+                    (.framework[] | . + {section: "framework"}) ]
+                  | sort_by(-.sum_ms) | .[]
+                  | [.section, .op, .count, (.sum_ms|r3), (.avg_ms|r3), (.max_ms|r3), ((.kernels // []) | join(", "))] )
+                | "| " + (map(tostring) | join(" | ")) + " |"
+              ' "${etdump_json}"
+              printf '\n%s\n\n' "<details><summary>Registered XNNPACK microkernels</summary>"
+              jq -r '.registered_kernels[] | "- `" + . + "`"' "${etdump_json}"
+              printf '\n%s\n\n' "</details>"
+            } >> "${GITHUB_STEP_SUMMARY}"
+          else
+            echo "No etdump summary at ${etdump_json}; skipping the step-summary report"
+          fi
         done
diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt
index 1b46c993b17..cd0d945a84f 100644
--- a/backends/xnnpack/CMakeLists.txt
+++ b/backends/xnnpack/CMakeLists.txt
@@ -169,6 +169,12 @@ install(
   EXPORT ExecuTorchTargets
   DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
+if(DEFINED EXECUTORCH_XNNPACK_LOG_LEVEL)
+  target_compile_definitions(
+    xnnpack-logging PUBLIC XNN_LOG_LEVEL=${EXECUTORCH_XNNPACK_LOG_LEVEL}
+  )
+endif()
+
 if(BUILD_TESTING)
   add_subdirectory(test)
 endif()
diff --git a/examples/riscv/etdump_summary.py b/examples/riscv/etdump_summary.py
new file mode 100644
index 00000000000..e4fc5a61d7e
--- /dev/null
+++ b/examples/riscv/etdump_summary.py
@@ -0,0 +1,228 @@
+# Copyright 2026 The ExecuTorch Authors.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Render a per-XNNPACK-op summary from an ETDump file."""
+
+import argparse
+import json
+import re
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+from executorch.devtools import Inspector
+
+
+# "Convolution (NHWC, F32) IGEMM #3" -> ("Convolution (NHWC, F32) IGEMM", 3)
+_SEQ_RE = re.compile(r"^(.*?)\s+#(\d+)$")
+
+# Wrappers around per-op events; kept separate to avoid double-counting children.
+FRAMEWORK_EVENTS = frozenset(
+    {
+        "Method::execute",
+        "Method::init",
+        "Program::load_method",
+        "DELEGATE_CALL",
+        "OPERATOR_CALL",
+    }
+)
+
+_REG_LOG_RE = re.compile(r"Note \(XNNPACK\):.*microkernel '([^']+)'")
+
+
+def parse_run_log(path: Path):
+    """Return the sorted, unique microkernel names logged in ``path``."""
+    names = set()
+    # Undecodable bytes are dropped (errors="ignore") rather than aborting the scan.
+    with open(path, errors="ignore") as log:
+        for hit in filter(None, map(_REG_LOG_RE.search, log)):
+            names.add(hit.group(1))
+    return sorted(names)
+
+
+# Two-source mapping from an ETDump op name to a symbol-substring pattern.
+# When the operator type uses xnn_microkernel_type_default, runtime.c does NOT
+# append a category suffix, so we fall back to matching on the base op name.
+_OP_NAME_RE = re.compile(r"^(.*?)\s*\(([^)]*)\)\s*(.*)$")
+_DTYPE_TOKENS = frozenset(
+    {
+        "F32",
+        "F16",
+        "QS8",
+        "QU8",
+        "QC8",
+        "QC4",
+        "QD8",
+        "QC8W",
+        "QC4W",
+        "X8",
+        "X16",
+        "X24",
+        "X32",
+        "X64",
+    }
+)
+# Infix between the kind token and `_ukernel_`: zero or more `<word>_`
+# segments (e.g. `_gemm_ukernel_`, `_gemm_minmax_ukernel_`,
+# `_gemm_minmax_fp32_ukernel_`, ...).
+_INFIX = r"(?:[a-z0-9]+_)*"
+_KIND_PATTERN = {
+    # Microkernel categories appended by runtime.c (xnn_microkernel_type_to_string).
+    "GEMM": r"_gemm_" + _INFIX + r"ukernel_",
+    "IGEMM": r"_igemm_" + _INFIX + r"ukernel_",
+    "DWConv": r"_dwconv_" + _INFIX + r"ukernel_",
+    "Transpose": r"_transposec?_" + _INFIX + r"ukernel_",
+    "Reduce": r"_(?:rsum|rmax|rminmax|rdmax|rdsum)_" + _INFIX + r"ukernel_",
+    "Reduce2": r"_(?:rdmax|rdsum)_" + _INFIX + r"ukernel_",
+    "VMulCAddC": r"_vmulcaddc_" + _INFIX + r"ukernel_",
+    "Average Pooling": r"_(?:avgpool|gavgpool)_" + _INFIX + r"ukernel_",
+    "Pixelwise Average Pooling": r"_pavgpool_" + _INFIX + r"ukernel_",
+    "Conv2D HWC2CHW": r"_conv_hwc2chw_" + _INFIX + r"ukernel_",
+    "SPMM": r"_spmm_" + _INFIX + r"ukernel_",
+    "Subconv2D": r"_subconv2d_" + _INFIX + r"ukernel_",
+    # Base op names (default microkernel type, no category suffix in the ETDump name).
+    "Add": r"_v(?:add|addc)_" + _INFIX + r"ukernel_",
+    "Subtract": r"_v(?:sub|subc|rsubc)_" + _INFIX + r"ukernel_",
+    "Multiply": r"_v(?:mul|mulc)_" + _INFIX + r"ukernel_",
+    "Divide": r"_v(?:div|divc|rdivc)_" + _INFIX + r"ukernel_",
+    "Maximum": r"_v(?:max|maxc)_" + _INFIX + r"ukernel_",
+    "Minimum": r"_v(?:min|minc)_" + _INFIX + r"ukernel_",
+    "Clamp": r"_vclamp_" + _INFIX + r"ukernel_",
+    "Sigmoid": r"_vsigmoid_" + _INFIX + r"ukernel_",
+    "Tanh": r"_vtanh_" + _INFIX + r"ukernel_",
+    "Negate": r"_vneg_" + _INFIX + r"ukernel_",
+    "Abs": r"_vabs_" + _INFIX + r"ukernel_",
+    "Square": r"_vsqr_" + _INFIX + r"ukernel_",
+    "Square Root": r"_vsqrt_" + _INFIX + r"ukernel_",
+    "Reciprocal Square Root": r"_vrsqrt_" + _INFIX + r"ukernel_",
+    "Convert": r"_vcvt_" + _INFIX + r"ukernel_",
+    "Copy": r"_(?:copy|memcpy)_" + _INFIX + r"ukernel_",
+    "Constant Pad": r"_xx_pad_" + _INFIX + r"ukernel_",
+    "Softmax": r"_(?:raddstoreexpminusmax|rmax)_" + _INFIX + r"ukernel_",
+    "Max Pooling": r"_maxpool_" + _INFIX + r"ukernel_",
+}
+
+
+def op_kernels(op_name, kernels):
+    """Return the ``kernels`` symbols matching ``op_name``'s kind and dtype tokens.
+
+    Names that _OP_NAME_RE cannot parse, or whose kind is unknown, yield ``[]``.
+    """
+    parsed = _OP_NAME_RE.match(op_name)
+    if parsed is None:
+        return []
+    base, tail = parsed.group(1).strip(), parsed.group(3).strip()
+    # The kind suffix after the parentheses wins; otherwise use the base op name.
+    key = next((k for k in (tail, base) if k in _KIND_PATTERN), None)
+    if key is None:
+        return []
+    kind_re = re.compile(_KIND_PATTERN[key])
+    tokens = (tok.strip() for tok in parsed.group(2).split(","))
+    dtypes = [tok.lower() for tok in tokens if tok in _DTYPE_TOKENS]
+    return [s for s in kernels if kind_re.search(s) and all(d in s for d in dtypes)]
+
+
+def aggregate(etdump_path: Path):
+    """Bucket ETDump events by base name; return (per_op, framework)."""
+    inspector = Inspector(etdump_path=str(etdump_path))
+    per_op = defaultdict(lambda: {"count": 0, "raw": []})
+    framework = defaultdict(lambda: {"count": 0, "raw": []})
+    for event in (e for blk in inspector.event_blocks for e in blk.events):
+        seq = _SEQ_RE.match(event.name or "")
+        name = seq.group(1) if seq else (event.name or "<unnamed>")
+        stats = (framework if name in FRAMEWORK_EVENTS else per_op)[name]
+        stats["count"] += 1
+        stats["raw"].extend(event.perf_data.raw if event.perf_data else [])
+    return per_op, framework
+
+
+def render(per_op, framework, etdump_path, kernels):
+    """Print the timing tables and return (op_rows, fw_rows, ops_total)."""
+
+    def to_row(name, stats):
+        # Fold one bucket's raw samples into a row; empty samples give 0.0 avg/max.
+        samples = stats["raw"]
+        total = sum(samples)
+        return {
+            "op": name,
+            "count": stats["count"],
+            "sum_ms": total,
+            "avg_ms": (total / len(samples)) if samples else 0.0,
+            "max_ms": max(samples) if samples else 0.0,
+            "kernels": op_kernels(name, kernels) if kernels else [],
+        }
+
+    def rows_of(buckets):
+        unsorted = [to_row(name, stats) for name, stats in buckets.items()]
+        return sorted(unsorted, key=lambda row: row["sum_ms"], reverse=True)
+
+    def fmt_table(label, rows, total):
+        print(f"\n[etdump_summary] {label}  total={total:.3f} ms")
+        print(
+            f"{'%':>5}  {'sum_ms':>10}  {'count':>6}  {'avg_ms':>10}  {'max_ms':>10}  op"
+        )
+        for row in rows:
+            share = (row["sum_ms"] / total * 100.0) if total else 0.0
+            print(
+                f"{share:5.1f}  {row['sum_ms']:10.3f}  {row['count']:6d}  "
+                f"{row['avg_ms']:10.3f}  {row['max_ms']:10.3f}  {row['op']}"
+            )
+
+    op_rows, fw_rows = rows_of(per_op), rows_of(framework)
+    ops_total = sum(row["sum_ms"] for row in op_rows)
+    fw_total = sum(row["sum_ms"] for row in fw_rows)
+
+    print(f"[etdump_summary] {etdump_path}")
+    fmt_table(f"XNNPACK ops ({len(op_rows)} unique)", op_rows, ops_total)
+    fmt_table(f"Framework wrappers ({len(fw_rows)})", fw_rows, fw_total)
+    if kernels:
+        print(f"\n[etdump_summary] Registered XNNPACK microkernels ({len(kernels)}):")
+        for sym in kernels:
+            print(f"  {sym}")
+
+    return op_rows, fw_rows, ops_total
+
+
+def main() -> None:
+    """CLI entry point: print the summary and optionally write it as JSON."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("etdump", type=Path)
+    cli.add_argument("--run-log", type=Path, default=None)
+    cli.add_argument("--json", type=Path, default=None)
+    args = cli.parse_args()
+
+    # Exit with status 1 when an input named on the command line is missing.
+    if not args.etdump.exists():
+        print(f"[etdump_summary] missing {args.etdump}", file=sys.stderr)
+        sys.exit(1)
+    if args.run_log is not None and not args.run_log.exists():
+        print(f"[etdump_summary] missing run log {args.run_log}", file=sys.stderr)
+        sys.exit(1)
+
+    # Without --run-log no microkernels are known, so rows carry empty kernel lists.
+    kernels = parse_run_log(args.run_log) if args.run_log is not None else []
+    per_op, framework = aggregate(args.etdump)
+    op_rows, fw_rows, ops_total = render(per_op, framework, args.etdump, kernels)
+
+    if args.json is None:
+        return
+
+    report = {
+        "etdump": str(args.etdump),
+        "run_log": str(args.run_log) if args.run_log else None,
+        "ops_total_ms": ops_total,
+        "registered_kernels": kernels,
+        "ops": op_rows,
+        "framework": fw_rows,
+    }
+
+    # Create the --json target's parent directories on demand before writing.
+    args.json.parent.mkdir(parents=True, exist_ok=True)
+    args.json.write_text(json.dumps(report, indent=2))
+    print(f"[etdump_summary] wrote {args.json}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/riscv/run.sh b/examples/riscv/run.sh
index d6e86031ac9..916284cb73c 100755
--- a/examples/riscv/run.sh
+++ b/examples/riscv/run.sh
@@ -24,6 +24,7 @@ model="add"
 xnnpack=false
 quantize=false
 verbose=false
+verbose_xnnpack=false
 
 usage() {
     cat <<EOF
@@ -33,6 +34,7 @@ Options:
   --xnnpack               Enable the XNNPACK backend (AOT partitioner + runtime)
   --quantize              Produce an 8-bit quantized model
   --verbose               Enable XNNPACK partitioner DEBUG logging and dump the lowered graph
+  --verbose-xnnpack       Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch at runtime
   --build_only            Only export and cross-compile; do not invoke QEMU
   --build_dir=<DIR>       CMake build directory (default: ${build_dir})
   --output_dir=<DIR>      Directory for the exported .bpte (default: ${output_dir})
@@ -48,6 +50,7 @@ for arg in "$@"; do
         --xnnpack) xnnpack=true ;;
         --quantize) quantize=true ;;
         --verbose) verbose=true ;;
+        --verbose-xnnpack) verbose_xnnpack=true ;;
         --build_only) build_only=true ;;
         --build_dir=*) build_dir="${arg#*=}" ;;
         --output_dir=*) output_dir="${arg#*=}" ;;
@@ -79,6 +82,9 @@ cmake_extra_args=()
 if ${xnnpack}; then
     cmake_extra_args+=(-DEXECUTORCH_BUILD_XNNPACK=ON)
 fi
+if ${verbose_xnnpack}; then
+    cmake_extra_args+=(-DEXECUTORCH_XNNPACK_LOG_LEVEL=4 -DEXECUTORCH_BUILD_RISCV_ETDUMP=ON)
+fi
 cmake -S "${et_root_dir}" -B "${build_dir}" \
     --preset riscv64-linux \
     "${cmake_extra_args[@]}" \
@@ -115,13 +121,20 @@ if [[ -n "${QEMU_CPU+x}" ]]; then
     echo "[run.sh] QEMU_CPU=${QEMU_CPU}"
 fi
 
-log_file=$(mktemp)
-trap 'rm -f "${log_file}"' EXIT
-
 runner_extra_args=()
 if ${quantize}; then
     runner_extra_args+=(--bundleio_rtol=0.1 --bundleio_atol=0.25)
 fi
+etdump_path=""
+if ${verbose_xnnpack}; then
+    etdump_path="${output_dir}/${model}_riscv.etdump"
+    # Also drop the previous summary so a stale report can never be picked up.
+    rm -f -- "${etdump_path}" "${etdump_path}.json"
+    runner_extra_args+=(--etdump_path="${etdump_path}")
+fi
+# Kept in output_dir: etdump_summary.py reads the XNN_LOG_LEVEL=4 registrations.
+log_file="${output_dir}/${model}_riscv.run.log"
+rm -f -- "${log_file}"
 
 set +e
 timeout --signal=KILL "${qemu_timeout}" "${qemu}" "${runner}" \
@@ -133,6 +146,12 @@ set -e
 
 echo "[run.sh] qemu exit status: ${qemu_status}"
 
+if [[ -n "${etdump_path}" && -f "${etdump_path}" ]]; then
+    # Best-effort: report a summary failure but still reach the Bundled I/O check.
+    python "${script_dir}/etdump_summary.py" "${etdump_path}" --run-log "${log_file}" \
+        --json "${etdump_path}.json" || echo "[run.sh] etdump summary failed (ignored)" >&2
+fi
+
 if grep -q "Test_result: PASS" "${log_file}"; then
     echo "[run.sh] Bundled I/O check PASSED"
     exit 0
diff --git a/tools/cmake/preset/riscv64_linux.cmake b/tools/cmake/preset/riscv64_linux.cmake
index c094534b594..87894b63088 100644
--- a/tools/cmake/preset/riscv64_linux.cmake
+++ b/tools/cmake/preset/riscv64_linux.cmake
@@ -10,6 +10,18 @@ set_overridable_option(EXECUTORCH_BUILD_DEVTOOLS ON)
 set_overridable_option(EXECUTORCH_ENABLE_BUNDLE_IO ON)
 set_overridable_option(EXECUTORCH_ENABLE_LOGGING ON)
 
+define_overridable_option(
+  EXECUTORCH_BUILD_RISCV_ETDUMP "Build etdump support for RISC-V" BOOL OFF
+)
+
+if("${EXECUTORCH_BUILD_RISCV_ETDUMP}")
+  set(EXECUTORCH_BUILD_DEVTOOLS ON)
+  set(EXECUTORCH_ENABLE_EVENT_TRACER ON)
+  set(FLATCC_ALLOW_WERROR OFF)
+else()
+  set(EXECUTORCH_ENABLE_EVENT_TRACER OFF)
+endif()
+
 if(EXECUTORCH_BUILD_XNNPACK)
   if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_C_COMPILER_VERSION VERSION_LESS 14)
     message(FATAL_ERROR "XNNPACK requires GCC 14+ on riscv64")

From 2c8507d44c542756fbc80f5b684c981710627d43 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Wed, 20 May 2026 20:49:21 +0200
Subject: [PATCH 3/6] Align RISC-V workflow display name to others

---
 .github/workflows/riscv64.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml
index d7beae7dc8e..ddb1955ece2 100644
--- a/.github/workflows/riscv64.yml
+++ b/.github/workflows/riscv64.yml
@@ -1,4 +1,4 @@
-name: RISC-V
+name: Test RISC-V Backend
 
 on:
   push:

From 4e1355dfb0bd7df5e5d096aa979fd45589196a95 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Thu, 21 May 2026 09:33:08 +0200
Subject: [PATCH 4/6] Always use executorch-ubuntu-24.04-gcc14, newer QEMU is
 needed for RISC-V testing

---
 .github/workflows/_test_riscv.yml | 8 ++------
 .github/workflows/riscv64.yml     | 3 ---
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/_test_riscv.yml b/.github/workflows/_test_riscv.yml
index 4be2732016e..163ede72ab2 100644
--- a/.github/workflows/_test_riscv.yml
+++ b/.github/workflows/_test_riscv.yml
@@ -31,10 +31,6 @@ on:
         description: 'Configuration(s) for the CPU to emulate with QEMU, expecting a JSON array'
         required: true
         type: string
-      gcc-version:
-        description: 'The version of GCC to use'
-        required: false
-        type: number
       docker-image:
         description: 'The docker image to use for this job'
         required: false
@@ -45,7 +41,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.2xlarge
-      docker-image: ${{ inputs.docker-image || 'ci-image:executorch-ubuntu-22.04-gcc11' }}
+      docker-image: ci-image:executorch-ubuntu-24.04-gcc14
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: ${{ inputs.timeout }}
@@ -58,7 +54,7 @@ jobs:
 
         echo '${{ inputs.qemu-cpu }}' | jq -r '.[]' | while IFS= read -r qemu_cpu; do
           export QEMU_CPU="${qemu_cpu}"
-          export GCC_VERSION=${{ inputs.gcc-version }}
+          export GCC_VERSION=14
           bash .ci/scripts/test_riscv_qemu.sh \
             --model="${{ inputs.model }}" \
             ${{ inputs.xnnpack && '--xnnpack --verbose-xnnpack' || '' }} \
diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml
index ddb1955ece2..14b9ad62047 100644
--- a/.github/workflows/riscv64.yml
+++ b/.github/workflows/riscv64.yml
@@ -64,6 +64,3 @@ jobs:
             ]'
           )
         }}
-      # XNNPACK requires GCC 14+
-      gcc-version: ${{ matrix.xnnpack && 14 || 11 }}
-      docker-image: ${{ matrix.xnnpack && 'ci-image:executorch-ubuntu-24.04-gcc14' || 'ci-image:executorch-ubuntu-22.04-gcc11' }}

From 239fe1b942a181aa8d0d057d68ab89b2ba2e6eb5 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Thu, 21 May 2026 09:36:19 +0200
Subject: [PATCH 5/6] Rename --verbose to --debug-xnnpack

It's more aligned with the intent
---
 .ci/scripts/test_riscv_qemu.sh | 10 +++++-----
 examples/riscv/aot_riscv.py    |  8 ++++----
 examples/riscv/run.sh          | 10 +++++-----
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/.ci/scripts/test_riscv_qemu.sh b/.ci/scripts/test_riscv_qemu.sh
index d1998561553..2842542aa3a 100755
--- a/.ci/scripts/test_riscv_qemu.sh
+++ b/.ci/scripts/test_riscv_qemu.sh
@@ -17,8 +17,8 @@ et_root_dir=$(realpath "${script_dir}/../..")
 model="add"
 xnnpack=false
 quantize=false
-verbose=false
 verbose_xnnpack=false
+debug_xnnpack=false
 
 usage() {
     cat <<EOF
@@ -27,8 +27,8 @@ Options:
   --model=<NAME>     Which model to export and run (default: add)
   --xnnpack          Enable the XNNPACK backend (AOT partitioner + runtime)
   --quantize         Produce an 8-bit quantized model
-  --verbose          Enable XNNPACK partitioner DEBUG logging and dump the lowered graph
   --verbose-xnnpack  Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch
+  --debug-xnnpack    Enable XNNPACK partitioner DEBUG logging and dump the lowered graph
   -h, --help         Show this help
 EOF
 }
@@ -38,7 +38,7 @@ for arg in "$@"; do
         --model=*) model="${arg#*=}" ;;
         --xnnpack) xnnpack=true ;;
         --quantize) quantize=true ;;
-        --verbose) verbose=true ;;
+        --debug-xnnpack) debug_xnnpack=true ;;
         --verbose-xnnpack) verbose_xnnpack=true ;;
         -h|--help) usage; exit 0 ;;
         *) echo "Unknown option: $arg" >&2; usage; exit 1 ;;
@@ -52,8 +52,8 @@ fi
 if ${quantize}; then
     run_extra_args+=(--quantize)
 fi
-if ${verbose}; then
-    run_extra_args+=(--verbose)
+if ${debug_xnnpack}; then
+    run_extra_args+=(--debug-xnnpack)
 fi
 if ${verbose_xnnpack}; then
     run_extra_args+=(--verbose-xnnpack)
diff --git a/examples/riscv/aot_riscv.py b/examples/riscv/aot_riscv.py
index 22e8b31df73..529e2b1e767 100644
--- a/examples/riscv/aot_riscv.py
+++ b/examples/riscv/aot_riscv.py
@@ -148,13 +148,13 @@ def main() -> None:
         help="Produce an 8-bit quantized model",
     )
     parser.add_argument(
-        "--verbose",
+        "--debug-xnnpack",
         action="store_true",
         help="Enable XNNPACK partitioner DEBUG logging and dump the lowered graph",
     )
     args = parser.parse_args()
 
-    if args.verbose:
+    if args.debug_xnnpack:
         logging.basicConfig(level=logging.DEBUG)
 
     if args.output is None:
@@ -181,7 +181,7 @@ def main() -> None:
             XnnpackPartitioner,
         )
 
-        partitioners.append(XnnpackPartitioner(verbose=args.verbose))
+        partitioners.append(XnnpackPartitioner(verbose=args.debug_xnnpack))
 
     compile_config = None
     if args.quantize:
@@ -202,7 +202,7 @@ def main() -> None:
         f"quantize={args.quantize} delegated_nodes={delegated}"
     )
 
-    if args.verbose:
+    if args.debug_xnnpack:
         from executorch.exir.backend.utils import print_delegated_graph
 
         print_delegated_graph(edge.exported_program().graph_module)
diff --git a/examples/riscv/run.sh b/examples/riscv/run.sh
index 916284cb73c..2c207816bfc 100755
--- a/examples/riscv/run.sh
+++ b/examples/riscv/run.sh
@@ -23,7 +23,7 @@ qemu_timeout="600"
 model="add"
 xnnpack=false
 quantize=false
-verbose=false
+debug_xnnpack=false
 verbose_xnnpack=false
 
 usage() {
@@ -33,8 +33,8 @@ Options:
   --model=<NAME>          Which model to export and run (default: ${model})
   --xnnpack               Enable the XNNPACK backend (AOT partitioner + runtime)
   --quantize              Produce an 8-bit quantized model
-  --verbose               Enable XNNPACK partitioner DEBUG logging and dump the lowered graph
   --verbose-xnnpack       Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch at runtime
+  --debug-xnnpack         Enable XNNPACK partitioner DEBUG logging and dump the lowered graph
   --build_only            Only export and cross-compile; do not invoke QEMU
   --build_dir=<DIR>       CMake build directory (default: ${build_dir})
   --output_dir=<DIR>      Directory for the exported .bpte (default: ${output_dir})
@@ -49,7 +49,7 @@ for arg in "$@"; do
         --model=*) model="${arg#*=}" ;;
         --xnnpack) xnnpack=true ;;
         --quantize) quantize=true ;;
-        --verbose) verbose=true ;;
+        --debug-xnnpack) debug_xnnpack=true ;;
         --verbose-xnnpack) verbose_xnnpack=true ;;
         --build_only) build_only=true ;;
         --build_dir=*) build_dir="${arg#*=}" ;;
@@ -72,8 +72,8 @@ fi
 if ${quantize}; then
     aot_extra_args+=(--quantize)
 fi
-if ${verbose}; then
-    aot_extra_args+=(--verbose)
+if ${debug_xnnpack}; then
+    aot_extra_args+=(--debug-xnnpack)
 fi
 python "${script_dir}/aot_riscv.py" --model "${model}" "${aot_extra_args[@]}" --output "${bpte_path}"
 

From e17eca91aea1cbf1b6527b1fbfe73c1ee030c2b2 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Thu, 21 May 2026 10:39:37 +0200
Subject: [PATCH 6/6] Fix possible issues with 'echo | jq | while read' failure
 in pipes

---
 .github/workflows/_test_riscv.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/_test_riscv.yml b/.github/workflows/_test_riscv.yml
index 163ede72ab2..e3b049bd614 100644
--- a/.github/workflows/_test_riscv.yml
+++ b/.github/workflows/_test_riscv.yml
@@ -52,6 +52,9 @@ jobs:
         source .ci/scripts/utils.sh
         install_executorch "--use-pt-pinned-commit"
 
+        # Allows failure in `echo | jq | while read` pipeline to bubble up and fail the workflow
+        set -o pipefail
+
         echo '${{ inputs.qemu-cpu }}' | jq -r '.[]' | while IFS= read -r qemu_cpu; do
           export QEMU_CPU="${qemu_cpu}"
           export GCC_VERSION=14