diff --git a/.ci/scripts/test_riscv_qemu.sh b/.ci/scripts/test_riscv_qemu.sh index 0d8b2815f74..2842542aa3a 100755 --- a/.ci/scripts/test_riscv_qemu.sh +++ b/.ci/scripts/test_riscv_qemu.sh @@ -17,16 +17,19 @@ et_root_dir=$(realpath "${script_dir}/../..") model="add" xnnpack=false quantize=false -verbose=false +verbose_xnnpack=false +debug_xnnpack=false usage() { cat < Which model to export and run (default: add) - --xnnpack Enable the XNNPACK backend (AOT partitioner + runtime) - --quantize Produce an 8-bit quantized model - -h, --help Show this help + --model= Which model to export and run (default: add) + --xnnpack Enable the XNNPACK backend (AOT partitioner + runtime) + --quantize Produce an 8-bit quantized model + --verbose-xnnpack Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch + --debug-xnnpack Enable XNNPACK partitioner DEBUG logging and dump the lowered graph + -h, --help Show this help EOF } @@ -35,7 +38,8 @@ for arg in "$@"; do --model=*) model="${arg#*=}" ;; --xnnpack) xnnpack=true ;; --quantize) quantize=true ;; - --verbose) verbose=true ;; + --debug-xnnpack) debug_xnnpack=true ;; + --verbose-xnnpack) verbose_xnnpack=true ;; -h|--help) usage; exit 0 ;; *) echo "Unknown option: $arg" >&2; usage; exit 1 ;; esac @@ -48,8 +52,11 @@ fi if ${quantize}; then run_extra_args+=(--quantize) fi -if ${verbose}; then - run_extra_args+=(--verbose) +if ${debug_xnnpack}; then + run_extra_args+=(--debug-xnnpack) +fi +if ${verbose_xnnpack}; then + run_extra_args+=(--verbose-xnnpack) fi bash "${et_root_dir}/examples/riscv/setup.sh" diff --git a/.github/workflows/_test_riscv.yml b/.github/workflows/_test_riscv.yml index da19dfc9bda..223a146e3d8 100644 --- a/.github/workflows/_test_riscv.yml +++ b/.github/workflows/_test_riscv.yml @@ -27,10 +27,10 @@ on: required: false type: boolean default: false - gcc-version: - description: 'The version of GCC to use' - required: false - type: number + qemu-cpu: + description: 'Configuration(s) for the CPU to emulate with 
QEMU, expecting a JSON array' + required: true + type: string docker-image: description: 'The docker image to use for this job' required: false @@ -41,7 +41,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.2xlarge - docker-image: ${{ inputs.docker-image || 'ci-image:executorch-ubuntu-22.04-gcc11' }} + docker-image: ci-image:executorch-ubuntu-24.04-gcc14 submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: ${{ inputs.timeout }} @@ -52,5 +52,40 @@ jobs: source .ci/scripts/utils.sh install_executorch "--use-pt-pinned-commit" - export GCC_VERSION=${{ inputs.gcc-version }} - bash .ci/scripts/test_riscv_qemu.sh --model="${{ inputs.model }}" ${{ inputs.xnnpack && '--xnnpack' || '' }} ${{ inputs.quantize && '--quantize' || '' }} + # Allows failure in `echo | jq | while read` pipeline to bubble up and fail the workflow + set -o pipefail + + echo '${{ inputs.qemu-cpu }}' | jq -r '.[]' | while IFS= read -r qemu_cpu; do + export QEMU_CPU="${qemu_cpu}" + export GCC_VERSION=14 + bash .ci/scripts/test_riscv_qemu.sh \ + --model="${{ inputs.model }}" \ + ${{ inputs.xnnpack && '--xnnpack --verbose-xnnpack' || '' }} \ + ${{ inputs.quantize && '--quantize' || '' }} + + # We only generate riscv_test/${{ inputs.model }}_riscv.etdump.json from `--verbose-xnnpack`. + if ${{ inputs.xnnpack }}; then + # Generate markdown table from riscv_test/${{ inputs.model }}_riscv.etdump.json, sorted by sum_ms + ( + etdump_json="riscv_test/${{ inputs.model }}_riscv.etdump.json" + echo "### Model=${{ inputs.model }} XNNPACK=${{ inputs.xnnpack }} Quantize=${{ inputs.quantize }} QEMU_CPU='${QEMU_CPU}'" + jq -r ' + def r3: (. * 1000 | round) / 1000; + ["Section","Op","Count","Sum (ms)","Avg (ms)","Max (ms)","Microkernels"], + ["---","---","---","---","---","---","---"], + ( [ (.ops[] | . + {section: "ops"}), + (.framework[] | . 
+ {section: "framework"}) ] + | sort_by(-.sum_ms) | .[] + | [.section, .op, .count, (.sum_ms|r3), (.avg_ms|r3), (.max_ms|r3), ((.kernels // []) | join(", "))] ) + | "| " + (map(tostring) | join(" | ")) + " |" + ' "${etdump_json}" + echo + echo "
<details><summary>Registered XNNPACK microkernels</summary>" + echo + jq -r '.registered_kernels[] | "- `" + . + "`"' "${etdump_json}" + echo + echo "</details>
" + echo + ) >> $GITHUB_STEP_SUMMARY + fi + done diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml index 9823db09cc1..14b9ad62047 100644 --- a/.github/workflows/riscv64.yml +++ b/.github/workflows/riscv64.yml @@ -1,4 +1,4 @@ -name: RISC-V +name: Test RISC-V Backend on: push: @@ -50,6 +50,17 @@ jobs: model: ${{ matrix.model }} xnnpack: ${{ matrix.xnnpack }} quantize: ${{ matrix.quantize }} - # XNNPACK requires GCC 14+ - gcc-version: ${{ matrix.xnnpack && 14 || 11 }} - docker-image: ${{ matrix.xnnpack && 'ci-image:executorch-ubuntu-24.04-gcc14' || 'ci-image:executorch-ubuntu-22.04-gcc11' }} + # If XNNPACK, test with multiple RVV length, disabled otherwise + qemu-cpu: >- + ${{ + case( + matrix.xnnpack, '[ + "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=128,elen=64,vext_spec=v1.0", + "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=256,elen=64,vext_spec=v1.0", + "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=512,elen=64,vext_spec=v1.0" + ]', + '[ + "rv64,zba=true,zbb=true,zbs=true,v=false" + ]' + ) + }} diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 1b46c993b17..cd0d945a84f 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -169,6 +169,12 @@ install( EXPORT ExecuTorchTargets DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +if(DEFINED EXECUTORCH_XNNPACK_LOG_LEVEL) + target_compile_definitions( + xnnpack-logging PUBLIC XNN_LOG_LEVEL=${EXECUTORCH_XNNPACK_LOG_LEVEL} + ) +endif() + if(BUILD_TESTING) add_subdirectory(test) endif() diff --git a/examples/riscv/aot_riscv.py b/examples/riscv/aot_riscv.py index 22e8b31df73..529e2b1e767 100644 --- a/examples/riscv/aot_riscv.py +++ b/examples/riscv/aot_riscv.py @@ -148,13 +148,13 @@ def main() -> None: help="Produce an 8-bit quantized model", ) parser.add_argument( - "--verbose", + "--debug-xnnpack", action="store_true", help="Enable XNNPACK partitioner DEBUG logging and dump the lowered graph", ) args = parser.parse_args() - if args.verbose: + if 
args.debug_xnnpack: logging.basicConfig(level=logging.DEBUG) if args.output is None: @@ -181,7 +181,7 @@ def main() -> None: XnnpackPartitioner, ) - partitioners.append(XnnpackPartitioner(verbose=args.verbose)) + partitioners.append(XnnpackPartitioner(verbose=args.debug_xnnpack)) compile_config = None if args.quantize: @@ -202,7 +202,7 @@ def main() -> None: f"quantize={args.quantize} delegated_nodes={delegated}" ) - if args.verbose: + if args.debug_xnnpack: from executorch.exir.backend.utils import print_delegated_graph print_delegated_graph(edge.exported_program().graph_module) diff --git a/examples/riscv/etdump_summary.py b/examples/riscv/etdump_summary.py new file mode 100644 index 00000000000..e4fc5a61d7e --- /dev/null +++ b/examples/riscv/etdump_summary.py @@ -0,0 +1,228 @@ +# Copyright 2026 The ExecuTorch Authors. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Render a per-XNNPACK-op summary from an ETDump file.""" + +import argparse +import json +import re +import sys +from collections import defaultdict +from pathlib import Path + +from executorch.devtools import Inspector + + +# "Convolution (NHWC, F32) IGEMM #3" -> ("Convolution (NHWC, F32) IGEMM", 3) +_SEQ_RE = re.compile(r"^(.*?)\s+#(\d+)$") + +# Wrappers around per-op events; kept separate to avoid double-counting children. +FRAMEWORK_EVENTS = frozenset( + { + "Method::execute", + "Method::init", + "Program::load_method", + "DELEGATE_CALL", + "OPERATOR_CALL", + } +) + +_REG_LOG_RE = re.compile(r"Note \(XNNPACK\):.*microkernel '([^']+)'") + + +def parse_run_log(path: Path): + syms = set() + with open(path, errors="ignore") as f: + for line in f: + m = _REG_LOG_RE.search(line) + if m: + syms.add(m.group(1)) + return sorted(syms) + + +# Two-source mapping from an ETDump op name to a symbol-substring pattern. 
+# When the operator type uses xnn_microkernel_type_default, runtime.c does NOT +# append a category suffix, so we fall back to matching on the base op name. +_OP_NAME_RE = re.compile(r"^(.*?)\s*\(([^)]*)\)\s*(.*)$") +_DTYPE_TOKENS = frozenset( + { + "F32", + "F16", + "QS8", + "QU8", + "QC8", + "QC4", + "QD8", + "QC8W", + "QC4W", + "X8", + "X16", + "X24", + "X32", + "X64", + } +) +# Infix between the kind token and `_ukernel_`: zero or more `_` +# segments (e.g. `_gemm_ukernel_`, `_gemm_minmax_ukernel_`, +# `_gemm_minmax_fp32_ukernel_`, ...). +_INFIX = r"(?:[a-z0-9]+_)*" +_KIND_PATTERN = { + # Microkernel categories appended by runtime.c (xnn_microkernel_type_to_string). + "GEMM": r"_gemm_" + _INFIX + r"ukernel_", + "IGEMM": r"_igemm_" + _INFIX + r"ukernel_", + "DWConv": r"_dwconv_" + _INFIX + r"ukernel_", + "Transpose": r"_transposec?_" + _INFIX + r"ukernel_", + "Reduce": r"_(?:rsum|rmax|rminmax|rdmax|rdsum)_" + _INFIX + r"ukernel_", + "Reduce2": r"_(?:rdmax|rdsum)_" + _INFIX + r"ukernel_", + "VMulCAddC": r"_vmulcaddc_" + _INFIX + r"ukernel_", + "Average Pooling": r"_(?:avgpool|gavgpool)_" + _INFIX + r"ukernel_", + "Pixelwise Average Pooling": r"_pavgpool_" + _INFIX + r"ukernel_", + "Conv2D HWC2CHW": r"_conv_hwc2chw_" + _INFIX + r"ukernel_", + "SPMM": r"_spmm_" + _INFIX + r"ukernel_", + "Subconv2D": r"_subconv2d_" + _INFIX + r"ukernel_", + # Base op names (default microkernel type, no category suffix in the ETDump name). 
+ "Add": r"_v(?:add|addc)_" + _INFIX + r"ukernel_", + "Subtract": r"_v(?:sub|subc|rsubc)_" + _INFIX + r"ukernel_", + "Multiply": r"_v(?:mul|mulc)_" + _INFIX + r"ukernel_", + "Divide": r"_v(?:div|divc|rdivc)_" + _INFIX + r"ukernel_", + "Maximum": r"_v(?:max|maxc)_" + _INFIX + r"ukernel_", + "Minimum": r"_v(?:min|minc)_" + _INFIX + r"ukernel_", + "Clamp": r"_vclamp_" + _INFIX + r"ukernel_", + "Sigmoid": r"_vsigmoid_" + _INFIX + r"ukernel_", + "Tanh": r"_vtanh_" + _INFIX + r"ukernel_", + "Negate": r"_vneg_" + _INFIX + r"ukernel_", + "Abs": r"_vabs_" + _INFIX + r"ukernel_", + "Square": r"_vsqr_" + _INFIX + r"ukernel_", + "Square Root": r"_vsqrt_" + _INFIX + r"ukernel_", + "Reciprocal Square Root": r"_vrsqrt_" + _INFIX + r"ukernel_", + "Convert": r"_vcvt_" + _INFIX + r"ukernel_", + "Copy": r"_(?:copy|memcpy)_" + _INFIX + r"ukernel_", + "Constant Pad": r"_xx_pad_" + _INFIX + r"ukernel_", + "Softmax": r"_(?:raddstoreexpminusmax|rmax)_" + _INFIX + r"ukernel_", + "Max Pooling": r"_maxpool_" + _INFIX + r"ukernel_", +} + + +def op_kernels(op_name, kernels): + m = _OP_NAME_RE.match(op_name) + if not m: + return [] + base, inside, tail = m.group(1).strip(), m.group(2), m.group(3).strip() + key = tail if tail in _KIND_PATTERN else (base if base in _KIND_PATTERN else None) + if key is None: + return [] + dtype_tokens = [ + s.strip().lower() for s in inside.split(",") if s.strip() in _DTYPE_TOKENS + ] + cat_re = re.compile(_KIND_PATTERN[key]) + return [ + sym + for sym in kernels + if cat_re.search(sym) and all(d in sym for d in dtype_tokens) + ] + + +def aggregate(etdump_path: Path): + insp = Inspector(etdump_path=str(etdump_path)) + per_op = defaultdict(lambda: {"count": 0, "raw": []}) + framework = defaultdict(lambda: {"count": 0, "raw": []}) + for block in insp.event_blocks: + for ev in block.events: + m = _SEQ_RE.match(ev.name or "") + base = m.group(1) if m else (ev.name or "") + bucket = framework if base in FRAMEWORK_EVENTS else per_op + bucket[base]["count"] += 1 + 
bucket[base]["raw"].extend(ev.perf_data.raw if ev.perf_data else []) + return per_op, framework + + +def render(per_op, framework, etdump_path, kernels): + def rows_of(d): + rows = [] + for name, v in d.items(): + raw = v["raw"] + s = sum(raw) + rows.append( + { + "op": name, + "count": v["count"], + "sum_ms": s, + "avg_ms": (s / len(raw)) if raw else 0.0, + "max_ms": max(raw) if raw else 0.0, + "kernels": op_kernels(name, kernels) if kernels else [], + } + ) + rows.sort(key=lambda r: r["sum_ms"], reverse=True) + return rows + + op_rows = rows_of(per_op) + fw_rows = rows_of(framework) + ops_total = sum(r["sum_ms"] for r in op_rows) + fw_total = sum(r["sum_ms"] for r in fw_rows) + + def fmt_table(label, rows, total): + print(f"\n[etdump_summary] {label} total={total:.3f} ms") + print( + f"{'%':>5} {'sum_ms':>10} {'count':>6} {'avg_ms':>10} {'max_ms':>10} op" + ) + for r in rows: + pct = (r["sum_ms"] / total * 100.0) if total else 0.0 + print( + f"{pct:5.1f} {r['sum_ms']:10.3f} {r['count']:6d} " + f"{r['avg_ms']:10.3f} {r['max_ms']:10.3f} {r['op']}" + ) + + print(f"[etdump_summary] {etdump_path}") + fmt_table(f"XNNPACK ops ({len(op_rows)} unique)", op_rows, ops_total) + fmt_table(f"Framework wrappers ({len(fw_rows)})", fw_rows, fw_total) + if kernels: + print(f"\n[etdump_summary] Registered XNNPACK microkernels ({len(kernels)}):") + for sym in kernels: + print(f" {sym}") + + return op_rows, fw_rows, ops_total + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("etdump", type=Path) + parser.add_argument("--run-log", type=Path, default=None) + parser.add_argument("--json", type=Path, default=None) + args = parser.parse_args() + + if not args.etdump.exists(): + print(f"[etdump_summary] missing {args.etdump}", file=sys.stderr) + sys.exit(1) + + kernels = [] + if args.run_log is not None: + if not args.run_log.exists(): + print(f"[etdump_summary] missing run log {args.run_log}", file=sys.stderr) + sys.exit(1) + kernels = 
parse_run_log(args.run_log) + + per_op, framework = aggregate(args.etdump) + op_rows, fw_rows, ops_total = render(per_op, framework, args.etdump, kernels) + + if args.json is not None: + args.json.parent.mkdir(parents=True, exist_ok=True) + args.json.write_text( + json.dumps( + { + "etdump": str(args.etdump), + "run_log": str(args.run_log) if args.run_log else None, + "ops_total_ms": ops_total, + "registered_kernels": kernels, + "ops": op_rows, + "framework": fw_rows, + }, + indent=2, + ) + ) + print(f"[etdump_summary] wrote {args.json}") + + +if __name__ == "__main__": + main() diff --git a/examples/riscv/run.sh b/examples/riscv/run.sh index 644944ab8a4..2c207816bfc 100755 --- a/examples/riscv/run.sh +++ b/examples/riscv/run.sh @@ -23,7 +23,8 @@ qemu_timeout="600" model="add" xnnpack=false quantize=false -verbose=false +debug_xnnpack=false +verbose_xnnpack=false usage() { cat < Which model to export and run (default: ${model}) --xnnpack Enable the XNNPACK backend (AOT partitioner + runtime) --quantize Produce an 8-bit quantized model - --verbose Enable XNNPACK partitioner DEBUG logging and dump the lowered graph + --verbose-xnnpack Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch at runtime + --debug-xnnpack Enable XNNPACK partitioner DEBUG logging and dump the lowered graph --build_only Only export and cross-compile; do not invoke QEMU --build_dir= CMake build directory (default: ${build_dir}) --output_dir= Directory for the exported .bpte (default: ${output_dir}) @@ -47,7 +49,8 @@ for arg in "$@"; do --model=*) model="${arg#*=}" ;; --xnnpack) xnnpack=true ;; --quantize) quantize=true ;; - --verbose) verbose=true ;; + --debug-xnnpack) debug_xnnpack=true ;; + --verbose-xnnpack) verbose_xnnpack=true ;; --build_only) build_only=true ;; --build_dir=*) build_dir="${arg#*=}" ;; --output_dir=*) output_dir="${arg#*=}" ;; @@ -69,8 +72,8 @@ fi if ${quantize}; then aot_extra_args+=(--quantize) fi -if ${verbose}; then - aot_extra_args+=(--verbose) +if 
${debug_xnnpack}; then + aot_extra_args+=(--debug-xnnpack) fi python "${script_dir}/aot_riscv.py" --model "${model}" "${aot_extra_args[@]}" --output "${bpte_path}" @@ -79,6 +82,9 @@ cmake_extra_args=() if ${xnnpack}; then cmake_extra_args+=(-DEXECUTORCH_BUILD_XNNPACK=ON) fi +if ${verbose_xnnpack}; then + cmake_extra_args+=(-DEXECUTORCH_XNNPACK_LOG_LEVEL=4 -DEXECUTORCH_BUILD_RISCV_ETDUMP=ON) +fi cmake -S "${et_root_dir}" -B "${build_dir}" \ --preset riscv64-linux \ "${cmake_extra_args[@]}" \ @@ -111,13 +117,24 @@ hash "${qemu}" 2>/dev/null || { # linker (ld-linux-riscv64-lp64d.so.1) referenced in the ELF resolves. export QEMU_LD_PREFIX="${QEMU_LD_PREFIX:-/usr/riscv64-linux-gnu}" -log_file=$(mktemp) -trap 'rm -f "${log_file}"' EXIT +if [[ -n "${QEMU_CPU+x}" ]]; then + echo "[run.sh] QEMU_CPU=${QEMU_CPU}" +fi runner_extra_args=() if ${quantize}; then runner_extra_args+=(--bundleio_rtol=0.1 --bundleio_atol=0.25) fi +etdump_path="" +if ${verbose_xnnpack}; then + etdump_path="${output_dir}/${model}_riscv.etdump" + rm -f "${etdump_path}" + runner_extra_args+=(--etdump_path="${etdump_path}") +fi + +# etdump_summary.py reads the XNN_LOG_LEVEL=4 registrations. 
+log_file="${output_dir}/${model}_riscv.run.log"
+rm -f "${log_file}"
 
 set +e
 timeout --signal=KILL "${qemu_timeout}" "${qemu}" "${runner}" \
@@ -129,6 +146,14 @@ set -e
 
 echo "[run.sh] qemu exit status: ${qemu_status}"
 
+if [[ -n "${etdump_path}" && -f "${etdump_path}" ]]; then
+  # Best-effort: the summary is diagnostic only and must never change the
+  # pass/fail outcome of the run, but a failure should still be visible.
+  python "${script_dir}/etdump_summary.py" "${etdump_path}" \
+    --run-log "${log_file}" \
+    --json "${etdump_path}.json" \
+    || echo "[run.sh] WARNING: etdump_summary.py failed; ${etdump_path}.json may be missing" >&2
+fi
+
 if grep -q "Test_result: PASS" "${log_file}"; then
   echo "[run.sh] Bundled I/O check PASSED"
   exit 0
diff --git a/tools/cmake/preset/riscv64_linux.cmake b/tools/cmake/preset/riscv64_linux.cmake
index c094534b594..87894b63088 100644
--- a/tools/cmake/preset/riscv64_linux.cmake
+++ b/tools/cmake/preset/riscv64_linux.cmake
@@ -10,6 +10,21 @@ set_overridable_option(EXECUTORCH_BUILD_DEVTOOLS ON)
 set_overridable_option(EXECUTORCH_ENABLE_BUNDLE_IO ON)
 set_overridable_option(EXECUTORCH_ENABLE_LOGGING ON)
 
+# Opt-in ETDump support for the RISC-V build. examples/riscv/run.sh turns this
+# on together with EXECUTORCH_XNNPACK_LOG_LEVEL when --verbose-xnnpack is given.
+define_overridable_option(
+  EXECUTORCH_BUILD_RISCV_ETDUMP "Build etdump support for RISC-V" BOOL OFF
+)
+
+if("${EXECUTORCH_BUILD_RISCV_ETDUMP}")
+  set(EXECUTORCH_BUILD_DEVTOOLS ON)
+  set(EXECUTORCH_ENABLE_EVENT_TRACER ON)
+  set(FLATCC_ALLOW_WERROR OFF)
+else()
+  # Default the tracer to OFF without shadowing an explicit user -D setting.
+  set_overridable_option(EXECUTORCH_ENABLE_EVENT_TRACER OFF)
+endif()
+
 if(EXECUTORCH_BUILD_XNNPACK)
   if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_C_COMPILER_VERSION VERSION_LESS 14)
     message(FATAL_ERROR "XNNPACK requires GCC 14+ on riscv64")