diff --git a/.github/workflows/_runner-chimera.yml b/.github/workflows/_runner-chimera.yml index 14e80631d1..c642bfe6d2 100644 --- a/.github/workflows/_runner-chimera.yml +++ b/.github/workflows/_runner-chimera.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-cortexm.yml b/.github/workflows/_runner-cortexm.yml index 3fbdf0ee16..c6be8af465 100644 --- a/.github/workflows/_runner-cortexm.yml +++ b/.github/workflows/_runner-cortexm.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-gap9-tiled.yml b/.github/workflows/_runner-gap9-tiled.yml index a5c8b3ac98..6934014447 100644 --- a/.github/workflows/_runner-gap9-tiled.yml +++ b/.github/workflows/_runner-gap9-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-gap9.yml b/.github/workflows/_runner-gap9.yml index e1d6e452a6..cc790d3d33 100644 --- a/.github/workflows/_runner-gap9.yml +++ b/.github/workflows/_runner-gap9.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-generic.yml b/.github/workflows/_runner-generic.yml index 6681cbac96..b44b47f73d 100644 --- a/.github/workflows/_runner-generic.yml +++ b/.github/workflows/_runner-generic.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-mempool.yml b/.github/workflows/_runner-mempool.yml index deb4809330..b2f0ae4f7a 100644 --- a/.github/workflows/_runner-mempool.yml +++ b/.github/workflows/_runner-mempool.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa-neureka-tiled.yml b/.github/workflows/_runner-siracusa-neureka-tiled.yml index b1f5f2fcb3..664d5f01be 100644 --- a/.github/workflows/_runner-siracusa-neureka-tiled.yml +++ b/.github/workflows/_runner-siracusa-neureka-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml index ea9c8989af..cc09f234e0 100644 --- a/.github/workflows/_runner-siracusa-tiled.yml +++ b/.github/workflows/_runner-siracusa-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa.yml b/.github/workflows/_runner-siracusa.yml index ea8fe5d405..1c51333f7a 100644 --- a/.github/workflows/_runner-siracusa.yml +++ b/.github/workflows/_runner-siracusa.yml @@ -25,6 +25,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-snitch-tiled-sequential.yml b/.github/workflows/_runner-snitch-tiled-sequential.yml index fbd5195b08..bcdd58a166 100644 --- a/.github/workflows/_runner-snitch-tiled-sequential.yml +++ b/.github/workflows/_runner-snitch-tiled-sequential.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-snitch.yml b/.github/workflows/_runner-snitch.yml index bc599e4fe7..48130ea26a 100644 --- a/.github/workflows/_runner-snitch.yml +++ b/.github/workflows/_runner-snitch.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-softhier.yml b/.github/workflows/_runner-softhier.yml index b067664f40..2624cbe15d 100644 --- a/.github/workflows/_runner-softhier.yml +++ b/.github/workflows/_runner-softhier.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/ci-deeploy.yml b/.github/workflows/ci-deeploy.yml index fc468306b1..84f2779e4c 100644 --- a/.github/workflows/ci-deeploy.yml +++ b/.github/workflows/ci-deeploy.yml @@ -35,6 +35,8 @@ jobs: container: image: ${{ needs.select-env.outputs.image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: @@ -49,6 +51,8 @@ jobs: container: image: ${{ needs.select-env.outputs.image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/infra-generate-ccache-gap9.yml b/.github/workflows/infra-generate-ccache-gap9.yml index f9010adb1d..5b456e1d64 100644 --- a/.github/workflows/infra-generate-ccache-gap9.yml +++ b/.github/workflows/infra-generate-ccache-gap9.yml @@ -23,6 +23,8 @@ jobs: container: image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/infra-generate-ccache.yml b/.github/workflows/infra-generate-ccache.yml index bd6e2c7787..e2d54eaa83 100644 --- a/.github/workflows/infra-generate-ccache.yml +++ b/.github/workflows/infra-generate-ccache.yml @@ -22,6 +22,8 @@ jobs: container: image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy:devel' }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/CHANGELOG.md b/CHANGELOG.md index 38c5b3ce35..42281c6f0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ### List of Pull Requests +- Add Microbenchmarking Infrastructure and CI Using GVSoC CSR [#162](https://github.com/pulp-platform/Deeploy/pull/162) - Fix CI Cache Generation [#176](https://github.com/pulp-platform/Deeploy/pull/176) - Fix Broken CI [#175](https://github.com/pulp-platform/Deeploy/pull/175) - Improve Docstring and Debugging [#160](https://github.com/pulp-platform/Deeploy/pull/160) @@ -23,6 +24,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Shell Format pre-commit hook - Add integer MaxPool1D for Generic platform and RQSConv1D support for PULPOpen, with corresponding kernel tests. - Added GAP9 Platform Support: Deployer, Bindings, Templates, Tiler, DMA (L3Dma/MchanDma), target library, CI workflows +- Per-layer microbenchmarking on PULPOpen via `--profileMicrobenchmark`: new `PULPMicrobenchmark` code-transformation pass + `perf_utils.h` helpers report cycles, instructions, stalls and cache misses per layer in `RunNetwork` ### Changed - Use by default `devel` container for GAP9 CI diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 797bd44c47..de5a66aae9 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -53,6 +53,7 @@ class CodeGenVerbosity: tilingProfiling: Optional[bool] = False # Specifies if we should profile the tiling code untiledProfiling: Optional[bool] = None # Specifies if we should profile the untilied code + microbenchmarkProfiling: Optional[bool] = False # Wrap each layer with PULP perf-counter microbenchmark _NoVerbosity = CodeGenVerbosity(None) diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 5d7b02ae62..2c78978e23 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -24,6 +24,7 @@ from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling +from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark import PULPMicrobenchmark from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPProfileUntiled import PULPProfileUntiled from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack @@ -115,6 +116,7 @@ MemoryManagementGeneration("L2"), MemoryManagementGeneration("L3.*"), MemoryManagementGeneration(), + PULPMicrobenchmark(), ]) ClusterTransformer = CodeTransformation([ @@ -133,6 +135,7 @@ MemoryManagementGeneration("L2"), MemoryManagementGeneration("L3.*"), MemoryManagementGeneration(), + PULPMicrobenchmark(), ]) SimpleTransformer = CodeTransformation([ diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py new file mode 100644 index 0000000000..bb35f32d47 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Tuple + +from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \ + NodeTemplate, _NoVerbosity + + +class PULPMicrobenchmark(CodeTransformationPass): + + _preTemplate = NodeTemplate(""" + perf_stats_t ${op}_perf_start, ${op}_perf_end, ${op}_perf_total; + if (pi_core_id() == 0) { + perf_bench_init(); + perf_bench_start(); + perf_bench_read(&${op}_perf_start); + } + """) + + _postTemplate = NodeTemplate(""" + if (pi_core_id() == 0) { + perf_bench_stop(); + perf_bench_read(&${op}_perf_end); + perf_bench_diff(&${op}_perf_total, &${op}_perf_end, &${op}_perf_start); + perf_bench_print("${op}", &${op}_perf_total); + } + """) + + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + + if not verbose.microbenchmarkProfiling: + return ctxt, executionBlock + + executionBlock.addLeft(self._preTemplate, {"op": name}) + executionBlock.addRight(self._postTemplate, {"op": name}) + return ctxt, executionBlock diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 7456dd9e1b..f13e6451fb 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -248,7 +248,8 @@ class PULPStructBuffer(StructBuffer): # SCHEREMO: stdint is included before pulp_nn_kernels.h because it is supposed to be included in there, but isn't... _includeList = [ - "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h" + "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h", + "perf_utils.h" ] diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py index f029be7361..0b25bc6bbe 100644 --- a/DeeployTest/generateNetwork.py +++ b/DeeployTest/generateNetwork.py @@ -141,6 +141,7 @@ def generateNetwork(args): verbosityCfg = _NoVerbosity if isinstance(platform, PULPPlatform): verbosityCfg.untiledProfiling = args.profileUntiled + verbosityCfg.microbenchmarkProfiling = args.profileMicrobenchmark # Parse graph and infer output levels and signedness _ = deployer.prepare(verbosityCfg) @@ -172,6 +173,11 @@ def generateNetwork(args): dest = 'profileUntiled', default = False, help = 'Profile Untiled for L2\n') + parser.add_argument('--profileMicrobenchmark', + action = 'store_true', + dest = 'profileMicrobenchmark', + default = False, + help = 'Wrap each layer with PULP perf-counter microbenchmark\n') parser.add_argument('--input-type-map', nargs = '*', default = [], diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 69e04343ff..9678bc4e4f 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -202,7 +202,10 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg - max: Initalize all variables at their maximal value. - min: Initalize all variables at their minimal value. """) - parser.add_argument('--profileTiling', action = "store_true") + parser.add_argument('--profileTiling', action = "store_true", help = 'Enable tiling profiling') + parser.add_argument('--profileMicrobenchmark', + action = "store_true", + help = 'Wrap each layer with PULP perf-counter microbenchmark instrumentation') parser.add_argument('--plotMemAlloc', action = 'store_true', help = 'Turn on plotting of the memory allocation and save it in the deeployState folder\n') @@ -224,6 +227,9 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg if args.profileTiling: verbosityCfg.tilingProfiling = True + if args.profileMicrobenchmark: + verbosityCfg.microbenchmarkProfiling = True + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') graph = gs.import_onnx(onnx_graph) diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index a5a8d70ef3..fbbd95703e 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -94,6 +94,12 @@ def __init__(self, action = 'store_true', default = False, help = 'Enable untiled profiling (Siracusa only)\n') + self.add_argument('--profileMicrobenchmark', + '--profile-microbenchmark', + dest = 'profileMicrobenchmark', + action = 'store_true', + default = False, + help = 'Wrap each layer with PULP perf-counter microbenchmark\n') self.add_argument('--toolchain', metavar = '', dest = 'toolchain', @@ -235,6 +241,9 @@ def create_config_from_args(args: argparse.Namespace, if not tiling and getattr(args, 'profileUntiled', False): gen_args_list.append("--profileUntiled") + if getattr(args, 'profileMicrobenchmark', False): + gen_args_list.append("--profileMicrobenchmark") + config = DeeployTestConfig( test_name = test_name, test_dir = test_dir_abs, diff --git a/DeeployTest/testUtils/pytestRunner.py b/DeeployTest/testUtils/pytestRunner.py index 472d8cfed9..29119bba6f 100644 --- a/DeeployTest/testUtils/pytestRunner.py +++ b/DeeployTest/testUtils/pytestRunner.py @@ -45,6 +45,7 @@ def create_test_config( mem_alloc_strategy: str = "MiniMalloc", search_strategy: str = "random-max", profile_tiling: bool = False, + profile_microbenchmark: bool = False, plot_mem_alloc: bool = False, randomized_mem_scheduler: bool = False, profile_untiled: bool = False, @@ -86,6 +87,8 @@ def create_test_config( gen_args_list.append(f"--searchStrategy={search_strategy}") if profile_tiling: gen_args_list.append("--profileTiling") + if profile_microbenchmark: + gen_args_list.append("--profileMicrobenchmark") if plot_mem_alloc: gen_args_list.append("--plotMemAlloc") if randomized_mem_scheduler: diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index 9578c2f26c..18a4f870ac 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -211,6 +211,9 @@ def __init__(self, tiling_arguments: bool, description = None): action = "store_true", help = 'Enable randomized memory scheduler\n') self.add_argument('--profileTiling', action = 'store_true', help = 'Enable tiling profiling\n') + self.add_argument('--profileMicrobenchmark', + action = 'store_true', + help = 'Wrap each layer with PULP perf-counter microbenchmark\n') self.add_argument('--memAllocStrategy', metavar = 'memAllocStrategy', dest = 'memAllocStrategy', @@ -271,6 +274,8 @@ def generate_cmd_args(self) -> str: command += " --randomizedMemoryScheduler" if self.args.profileTiling: command += f" --profileTiling" + if self.args.profileMicrobenchmark: + command += f" --profileMicrobenchmark" if self.args.memAllocStrategy: command += f" --memAllocStrategy={self.args.memAllocStrategy}" if self.args.plotMemAlloc: diff --git a/TargetLibraries/PULPOpen/inc/perf_utils.h b/TargetLibraries/PULPOpen/inc/perf_utils.h new file mode 100644 index 0000000000..c710402ed2 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/perf_utils.h @@ -0,0 +1,149 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + * + * Performance Counter Utilities for PULP Benchmarking + */ + +#ifndef __PERF_UTILS_H__ +#define __PERF_UTILS_H__ + +#include "pmsis.h" + +// Performance event IDs (compatible with PMSIS) +#define PI_PERF_CYCLES CSR_PCER_CYCLES +#define PI_PERF_INSTR CSR_PCER_INSTR +#define PI_PERF_LD_STALL CSR_PCER_LD_STALL +#define PI_PERF_JMP_STALL CSR_PCER_JMP_STALL +#define PI_PERF_IMISS CSR_PCER_IMISS +#define PI_PERF_LD CSR_PCER_LD +#define PI_PERF_ST CSR_PCER_ST +#define PI_PERF_JUMP CSR_PCER_JUMP +#define PI_PERF_BRANCH CSR_PCER_BRANCH +#define PI_PERF_TAKEN_BRANCH CSR_PCER_TAKEN_BRANCH +#define PI_PERF_RVC CSR_PCER_RVC +#define PI_PERF_LD_EXT CSR_PCER_LD_EXT +#define PI_PERF_ST_EXT CSR_PCER_ST_EXT +#define PI_PERF_LD_EXT_CYC CSR_PCER_LD_EXT_CYC +#define PI_PERF_ST_EXT_CYC CSR_PCER_ST_EXT_CYC +#define PI_PERF_TCDM_CONT CSR_PCER_TCDM_CONT + +// Benchmark statistics structure +typedef struct { + unsigned int cycles; + unsigned int instr; + unsigned int ld; + unsigned int st; + unsigned int ld_stall; + unsigned int jmp_stall; + unsigned int imiss; + unsigned int branch; + unsigned int taken_branch; + unsigned int rvc; + unsigned int ld_ext; + unsigned int st_ext; + unsigned int ld_ext_cyc; + unsigned int st_ext_cyc; + unsigned int tcdm_cont; +} perf_stats_t; + +// Initialize performance counters for comprehensive benchmarking +static inline void perf_bench_init() { + // Enable all performance counters + pi_perf_conf( + (1 << PI_PERF_CYCLES) | (1 << PI_PERF_INSTR) | (1 << PI_PERF_LD_STALL) | + (1 << PI_PERF_JMP_STALL) | (1 << PI_PERF_IMISS) | (1 << PI_PERF_LD) | + (1 << PI_PERF_ST) | (1 << PI_PERF_JUMP) | (1 << PI_PERF_BRANCH) | + (1 << PI_PERF_TAKEN_BRANCH) | (1 << PI_PERF_RVC) | (1 << PI_PERF_LD_EXT) | + (1 << PI_PERF_ST_EXT) | (1 << PI_PERF_LD_EXT_CYC) | + (1 << PI_PERF_ST_EXT_CYC) | (1 << PI_PERF_TCDM_CONT)); +} + +// Start performance monitoring +static inline void perf_bench_start() { + pi_perf_reset(); + pi_perf_start(); +} + +// Stop performance monitoring +static inline void perf_bench_stop() { pi_perf_stop(); } + +// Read all performance counters into structure +static inline void perf_bench_read(perf_stats_t *stats) { + stats->cycles = pi_perf_read(PI_PERF_CYCLES); + stats->instr = pi_perf_read(PI_PERF_INSTR); + stats->ld = pi_perf_read(PI_PERF_LD); + stats->st = pi_perf_read(PI_PERF_ST); + stats->ld_stall = pi_perf_read(PI_PERF_LD_STALL); + stats->jmp_stall = pi_perf_read(PI_PERF_JMP_STALL); + stats->imiss = pi_perf_read(PI_PERF_IMISS); + stats->branch = pi_perf_read(PI_PERF_BRANCH); + stats->taken_branch = pi_perf_read(PI_PERF_TAKEN_BRANCH); + stats->rvc = pi_perf_read(PI_PERF_RVC); + stats->ld_ext = pi_perf_read(PI_PERF_LD_EXT); + stats->st_ext = pi_perf_read(PI_PERF_ST_EXT); + stats->ld_ext_cyc = pi_perf_read(PI_PERF_LD_EXT_CYC); + stats->st_ext_cyc = pi_perf_read(PI_PERF_ST_EXT_CYC); + stats->tcdm_cont = pi_perf_read(PI_PERF_TCDM_CONT); +} + +// Print performance statistics (core 0 only to avoid clutter) +static inline void perf_bench_print(const char *label, perf_stats_t *stats) { + if (pi_core_id() == 0) { + printf("\n=== Performance Statistics: %s ===\n", label); + printf("Cycles: %10u\n", stats->cycles); + printf("Instructions: %10u\n", stats->instr); + printf("IPC: %10.3f\n", + stats->cycles > 0 ? (float)stats->instr / stats->cycles : 0.0f); + printf("\n--- Instruction Mix ---\n"); + printf("Loads: %10u (%.2f%%)\n", stats->ld, + stats->instr > 0 ? 100.0f * stats->ld / stats->instr : 0.0f); + printf("Stores: %10u (%.2f%%)\n", stats->st, + stats->instr > 0 ? 100.0f * stats->st / stats->instr : 0.0f); + printf("Branches: %10u (%.2f%%)\n", stats->branch, + stats->instr > 0 ? 100.0f * stats->branch / stats->instr : 0.0f); + printf("Taken Branches: %10u (%.2f%%)\n", stats->taken_branch, + stats->branch > 0 ? 100.0f * stats->taken_branch / stats->branch + : 0.0f); + printf("Compressed (RVC): %10u (%.2f%%)\n", stats->rvc, + stats->instr > 0 ? 100.0f * stats->rvc / stats->instr : 0.0f); + printf("\n--- Stalls & Hazards ---\n"); + printf("Load Stalls: %10u\n", stats->ld_stall); + printf("Jump Stalls: %10u\n", stats->jmp_stall); + printf("I-cache Misses: %10u\n", stats->imiss); + printf("TCDM Contentions: %10u\n", stats->tcdm_cont); + printf("\n--- Memory Hierarchy ---\n"); + printf("External Loads: %10u (%.2f%%)\n", stats->ld_ext, + stats->ld > 0 ? 100.0f * stats->ld_ext / stats->ld : 0.0f); + printf("External Stores: %10u (%.2f%%)\n", stats->st_ext, + stats->st > 0 ? 100.0f * stats->st_ext / stats->st : 0.0f); + printf("Ext Load Cycles: %10u (avg: %.2f)\n", stats->ld_ext_cyc, + stats->ld_ext > 0 ? (float)stats->ld_ext_cyc / stats->ld_ext : 0.0f); + printf("Ext Store Cycles: %10u (avg: %.2f)\n", stats->st_ext_cyc, + stats->st_ext > 0 ? (float)stats->st_ext_cyc / stats->st_ext : 0.0f); + printf("========================================\n\n"); + } +} + +// Compute difference between two stats (for analyzing specific code sections) +static inline void perf_bench_diff(perf_stats_t *result, perf_stats_t *end, + perf_stats_t *start) { + result->cycles = end->cycles - start->cycles; + result->instr = end->instr - start->instr; + result->ld = end->ld - start->ld; + result->st = end->st - start->st; + result->ld_stall = end->ld_stall - start->ld_stall; + result->jmp_stall = end->jmp_stall - start->jmp_stall; + result->imiss = end->imiss - start->imiss; + result->branch = end->branch - start->branch; + result->taken_branch = end->taken_branch - start->taken_branch; + result->rvc = end->rvc - start->rvc; + result->ld_ext = end->ld_ext - start->ld_ext; + result->st_ext = end->st_ext - start->st_ext; + result->ld_ext_cyc = end->ld_ext_cyc - start->ld_ext_cyc; + result->st_ext_cyc = end->st_ext_cyc - start->st_ext_cyc; + result->tcdm_cont = end->tcdm_cont - start->tcdm_cont; +} + +#endif // __PERF_UTILS_H__ diff --git a/docs/tutorials/microbenchmark.rst b/docs/tutorials/microbenchmark.rst new file mode 100644 index 0000000000..c005090020 --- /dev/null +++ b/docs/tutorials/microbenchmark.rst @@ -0,0 +1,24 @@ +.. SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +.. +.. SPDX-License-Identifier: Apache-2.0 + +Microbenchmark +============== + +Pass ``--profileMicrobenchmark`` to any PULPOpen runner (``testMVP.py``, ``generateNetwork.py``, ``deeployRunner_*.py``) to wrap each layer in ``RunNetwork`` with PULP performance counters. Off by default; zero overhead when unused. + +The flag flows through :py:attr:`Deeploy.DeeployTypes.CodeGenVerbosity.microbenchmarkProfiling` into :py:class:`Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark.PULPMicrobenchmark`, which is registered last in the PULPOpen ``ForkTransformer`` and ``ClusterTransformer`` chains so it covers the full per-layer body (tiling, DMA, memory management). The C-side helpers live in ``TargetLibraries/PULPOpen/inc/perf_utils.h``. + +Each layer prints one block on ``core 0``: + +.. code-block:: text + + === Performance Statistics: Add_0 === + Cycles: 1442 + Instructions: 149 + IPC: 0.103 + Loads / Stores / Branches / Taken Branches / RVC + Load Stalls / Jump Stalls / I-cache Misses / TCDM Contentions + External Loads / Stores and their cycle counts + +External-memory and TCDM-contention counters are zero when the wrapped region has no L2/L3 traffic or no bank conflicts (e.g. small untiled kernels that fit in L1). Some events may not be modelled by GVSoC — verify on a tiled test before assuming a counter is broken. diff --git a/docs/tutorials/overview.rst b/docs/tutorials/overview.rst index 0b3d97c761..c0a9660104 100644 --- a/docs/tutorials/overview.rst +++ b/docs/tutorials/overview.rst @@ -14,5 +14,6 @@ Each tutorial covers a specific topic and includes code examples to illustrate t introduction debugging + microbenchmark