From a437c2e6b2b867ad1dc5c0bcc2b51902870f2246 Mon Sep 17 00:00:00 2001 From: Run Wang <52746141+SamanthaWangdl@users.noreply.github.com> Date: Thu, 12 Feb 2026 15:27:58 +0000 Subject: [PATCH 01/11] Deeploy Microbenchmark with GVSoC CSR and Demo on GEMM --- TargetLibraries/PULPOpen/src/Gemm.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/TargetLibraries/PULPOpen/src/Gemm.c b/TargetLibraries/PULPOpen/src/Gemm.c index a46f8ac6ae..02fd991674 100644 --- a/TargetLibraries/PULPOpen/src/Gemm.c +++ b/TargetLibraries/PULPOpen/src/Gemm.c @@ -6,6 +6,7 @@ #include "DeeployPULPMath.h" #include "pmsis.h" +// #include "perf_utils.h" void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, @@ -17,6 +18,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); + //RW: Performance monitoring is currently disabled + // perf_stats_t perf_start, perf_end, perf_total; + + // // Initialize and start performance counters (only core 0) + // if (core_id == 0) { + // perf_bench_init(); + // perf_bench_start(); + // perf_bench_read(&perf_start); + // } + uint32_t M_chunk = (M >> log2Core) + ((M & (NUM_CORES - 1)) != 0); uint32_t M_start = MIN(core_id * M_chunk, M); uint32_t M_end = MIN(M_start + M_chunk, M); @@ -351,4 +362,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, } } } + + // RW: Stop performance counters and print results (only core 0) + // if (core_id == 0) { + // perf_bench_stop(); + // perf_bench_read(&perf_end); + // perf_bench_diff(&perf_total, &perf_end, &perf_start); + + // char label[100]; + // snprintf(label, sizeof(label), "GEMM M=%u N=%u O=%u transA=%u transB=%u", + // M, N, O, transA, transB); + // perf_bench_print(label, &perf_total); + // } } \ No newline at end of file From 0b61499cd7bdddc3c29f3bccf05897ada08f73e6 Mon Sep 17 00:00:00 2001 From: Run Wang 
<52746141+SamanthaWangdl@users.noreply.github.com> Date: Sun, 15 Feb 2026 22:06:11 +0000 Subject: [PATCH 02/11] Add microbenchmark to codepass --- Deeploy/Targets/PULPOpen/Bindings.py | 4 +- .../PULPClusterTiling.py | 39 ++++- Deeploy/Targets/PULPOpen/Platform.py | 2 +- .../DoubleBufferingTilingCodeGeneration.py | 39 ++++- .../SingleBufferingTilingCodeGeneration.py | 40 ++++- .../TilingPrototypes.py | 99 +++++++++++ TargetLibraries/PULPOpen/inc/perf_utils.h | 158 ++++++++++++++++++ TargetLibraries/PULPOpen/src/Gemm.c | 23 --- 8 files changed, 370 insertions(+), 34 deletions(-) create mode 100644 TargetLibraries/PULPOpen/inc/perf_utils.h diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 5d7b02ae62..8a5f92ea2c 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -103,7 +103,7 @@ PULPSynchCoresPass(), ForkClosure(writeback = False, generateStruct = True), TilingVariableReplacementUpdate("L1"), - PULPClusterTiling("L2", "L1", MchanDma()), + PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True), # Enable perf counters ArgumentStructGeneration(), MemoryManagementGeneration("L1"), TilingVariableReplacement("L2"), @@ -121,7 +121,7 @@ TilingVariableReplacement("L1"), TilingCallClosure(writeback = False, generateStruct = True), TilingVariableReplacementUpdate("L1"), - PULPClusterTiling("L2", "L1", MchanDma()), + PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True), # Enable perf counters ArgumentStructGeneration(), MemoryManagementGeneration("L1"), TilingVariableReplacement("L2"), diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py index 3c0bba3107..59aec47a5d 100644 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py @@ -7,9 +7,9 @@ from Deeploy.DeeployTypes 
import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity from Deeploy.TilingExtension.AsyncDma import AsyncDma from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \ - DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn + DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn, ProfilingDoubleBufferingTilingMixIn from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \ - ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration + PerfCounterSingleBufferingTilingMixIn, ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration class PULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration): @@ -28,13 +28,38 @@ class ProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration pass +class PerfCounterPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, PerfCounterSingleBufferingTilingMixIn): + """Single buffering with performance counter profiling""" + pass + + +class PerfCounterPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn): + """Double buffering with performance counter profiling""" + pass + + +class CombinedProfilingPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn, PerfCounterSingleBufferingTilingMixIn): + """Single buffering with both cycle profiling and performance counter profiling""" + pass + + +class CombinedProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn, PerfCounterDoubleBufferingTilingMixIn): + """Double buffering with both cycle profiling and performance counter profiling""" + pass + + class PULPClusterTiling(CodeTransformationPass): - def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma): + def __init__(self, externalMemory: str, 
localMemory: str, dma: AsyncDma, usePerfCounters: bool = False): + self.usePerfCounters = usePerfCounters self.SB = PULPClusterTilingGenerationSB(externalMemory, localMemory, dma) self.profilingSB = ProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma) + self.perfCounterSB = PerfCounterPULPClusterTilingGenerationSB(externalMemory, localMemory, dma) + self.combinedProfilingSB = CombinedProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma) self.DB = PULPClusterTilingGenerationDB(externalMemory, localMemory, dma) self.profilingDB = ProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma) + self.perfCounterDB = PerfCounterPULPClusterTilingGenerationDB(externalMemory, localMemory, dma) + self.combinedProfilingDB = CombinedProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma) def apply(self, ctxt: NetworkContext, @@ -42,10 +67,16 @@ def apply(self, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - if verbose.tilingProfiling: + if self.usePerfCounters and verbose.tilingProfiling: + # Use combined profiling: cycle measurements + performance counter stats + ctxt, executionBlock = self.combinedProfilingSB.apply(ctxt, executionBlock, name) + ctxt, executionBlock = self.combinedProfilingDB.apply(ctxt, executionBlock, name) + elif verbose.tilingProfiling: + # Use cycle profiling only (basic cycle measurements) ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name) ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name) else: + # No profiling ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name) ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name) diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 7456dd9e1b..11b9747526 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -248,7 +248,7 @@ class 
PULPStructBuffer(StructBuffer): # SCHEREMO: stdint is included before pulp_nn_kernels.h because it is supposed to be included in there, but isn't... _includeList = [ - "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h" + "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h", "perf_utils.h" ] diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py index ad9c6ad012..ce9ec86f27 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py @@ -11,8 +11,8 @@ from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, Future from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \ - PrototypeTilingMixIn, TilingMetaInfo +from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \ + ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, stridesFromShape @@ -364,3 +364,38 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements, _egressDMAStatements, closeLoopStatements) return executionBlock + +class PerfCounterDoubleBufferingTilingMixIn(PrototypeTilingMixIn, 
PerfCounterProfilingMixIn): + """ + Double buffering tiling with performance counter profiling. + Provides detailed instruction-level statistics for each tile. + """ + + @classmethod + def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + setupStatements: List[CodeSnippet], + teardownStatements: List[CodeSnippet]) -> ExecutionBlock: + + executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, + teardownStatements) + + # Inject performance counter initialization in setup (only once, not per-tile) + executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo) + + # Inject performance counter stop and print in teardown (only once, not per-tile) + executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo) + + return executionBlock + + @classmethod + def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet], + egressDMAStatements: List[CodeSnippet], + closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock: + + # Don't wrap kernel - perf counters measure the whole tiling loop, not individual tiles + # executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo) + + executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements, + egressDMAStatements, closeLoopStatements) + return executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py index ea1e938b58..e4bb803611 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py @@ -10,8 +10,8 @@ from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future from 
Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \ - PrototypeTilingMixIn, TilingMetaInfo +from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \ + ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme @@ -191,3 +191,39 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements, _egressDMAStatements, closeLoopStatements) return executionBlock + + +class PerfCounterSingleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn): + """ + Single buffering tiling with performance counter profiling. + Provides detailed instruction-level statistics for each tile. 
+ """ + + @classmethod + def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + setupStatements: List[CodeSnippet], + teardownStatements: List[CodeSnippet]) -> ExecutionBlock: + + executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, + teardownStatements) + + # Inject performance counter initialization in setup (only once, not per-tile) + executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo) + + # Inject performance counter stop and print in teardown (only once, not per-tile) + executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo) + + return executionBlock + + @classmethod + def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet], + egressDMAStatements: List[CodeSnippet], + closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock: + + # Don't wrap kernel - perf counters measure the whole tiling loop, not individual tiles + # executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo) + + executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements, + egressDMAStatements, closeLoopStatements) + return executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py index 09a4ef56eb..70aabd9805 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py @@ -64,6 +64,105 @@ def generateAllTilingCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingM return executionBlock +class PerfCounterProfilingMixIn(ABC): + """ + MixIn for injecting performance counter profiling code. + Provides detailed instruction-level statistics using CSR performance counters. 
+ """ + + _perfCounterInit = NodeTemplate(""" + perf_stats_t ${nodeName}_perf_start, ${nodeName}_perf_end, ${nodeName}_perf_total; + if (pi_core_id() == 0) { + perf_bench_init(); + perf_bench_start(); + perf_bench_read(&${nodeName}_perf_start); + } + """) + + _perfCounterStop = NodeTemplate(""" + if (pi_core_id() == 0) { + perf_bench_stop(); + perf_bench_read(&${nodeName}_perf_end); + perf_bench_diff(&${nodeName}_perf_total, &${nodeName}_perf_end, &${nodeName}_perf_start); + perf_bench_print("${nodeName}", &${nodeName}_perf_total); + } + """) + + _perfCounterKernelStart = NodeTemplate(""" + if (pi_core_id() == 0) { + perf_bench_start(); + perf_bench_read(&${nodeName}_perf_kernel_start); + } + """) + + _perfCounterKernelEnd = NodeTemplate(""" + if (pi_core_id() == 0) { + perf_bench_stop(); + perf_bench_read(&${nodeName}_perf_kernel_end); + perf_bench_diff(&${nodeName}_perf_kernel_total, &${nodeName}_perf_kernel_end, &${nodeName}_perf_kernel_start); + perf_bench_print("${nodeName} Kernel", &${nodeName}_perf_kernel_total); + } + """) + + _perfCounterKernelDecl = NodeTemplate(""" + perf_stats_t ${nodeName}_perf_kernel_start, ${nodeName}_perf_kernel_end, ${nodeName}_perf_kernel_total; + """) + + @classmethod + def injectPerfCounterInit(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock: + """ + Inject performance counter initialization at the beginning of the node execution. + This should be called in the setup phase. + """ + nodeName = metaInfo.nodeName + + executionBlock.addLeft(cls._perfCounterInit, { + "nodeName": nodeName, + }) + + return executionBlock + + @classmethod + def injectPerfCounterStop(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock: + """ + Inject performance counter stop and print at the end of the node execution. + This should be called in the teardown phase. 
+ """ + nodeName = metaInfo.nodeName + + executionBlock.addRight(cls._perfCounterStop, { + "nodeName": nodeName, + }) + + return executionBlock + + @classmethod + def injectPerfCounterKernelWrap(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock: + """ + Wrap the kernel execution with performance counter measurements. + This provides detailed statistics for just the kernel computation (excluding DMA). + """ + nodeName = metaInfo.nodeName + + if metaInfo.kernelLevelTiling: + # Add declaration at the beginning + executionBlock.addLeft(cls._perfCounterKernelDecl, { + "nodeName": nodeName, + }) + + # Add start measurement before kernel + executionBlock.addLeft(cls._perfCounterKernelStart, { + "nodeName": nodeName, + }) + + # Add stop and print after kernel + executionBlock.addRight(cls._perfCounterKernelEnd, { + "nodeName": nodeName, + }) + + return executionBlock + + class ProfilingPrototypeMixIn(ABC): _measureCycles = NodeTemplate(""" ${measurements}[${tileIdxVar}] = getCycles(); diff --git a/TargetLibraries/PULPOpen/inc/perf_utils.h b/TargetLibraries/PULPOpen/inc/perf_utils.h new file mode 100644 index 0000000000..2d9fbc39c6 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/perf_utils.h @@ -0,0 +1,158 @@ +/* + * Performance Counter Utilities for PULP Benchmarking + */ + +#ifndef __PERF_UTILS_H__ +#define __PERF_UTILS_H__ + +#include "pmsis.h" + +// Performance event IDs (compatible with PMSIS) +#define PI_PERF_CYCLES CSR_PCER_CYCLES +#define PI_PERF_INSTR CSR_PCER_INSTR +#define PI_PERF_LD_STALL CSR_PCER_LD_STALL +#define PI_PERF_JMP_STALL CSR_PCER_JMP_STALL +#define PI_PERF_IMISS CSR_PCER_IMISS +#define PI_PERF_LD CSR_PCER_LD +#define PI_PERF_ST CSR_PCER_ST +#define PI_PERF_JUMP CSR_PCER_JUMP +#define PI_PERF_BRANCH CSR_PCER_BRANCH +#define PI_PERF_TAKEN_BRANCH CSR_PCER_TAKEN_BRANCH +#define PI_PERF_RVC CSR_PCER_RVC +#define PI_PERF_LD_EXT CSR_PCER_LD_EXT +#define PI_PERF_ST_EXT CSR_PCER_ST_EXT +#define PI_PERF_LD_EXT_CYC 
CSR_PCER_LD_EXT_CYC +#define PI_PERF_ST_EXT_CYC CSR_PCER_ST_EXT_CYC +#define PI_PERF_TCDM_CONT CSR_PCER_TCDM_CONT + +// Benchmark statistics structure +typedef struct { + unsigned int cycles; + unsigned int instr; + unsigned int ld; + unsigned int st; + unsigned int ld_stall; + unsigned int jmp_stall; + unsigned int imiss; + unsigned int branch; + unsigned int taken_branch; + unsigned int rvc; + unsigned int ld_ext; + unsigned int st_ext; + unsigned int ld_ext_cyc; + unsigned int st_ext_cyc; + unsigned int tcdm_cont; +} perf_stats_t; + +// Initialize performance counters for comprehensive benchmarking +static inline void perf_bench_init() { + // Enable all performance counters + pi_perf_conf( + (1 << PI_PERF_CYCLES) | + (1 << PI_PERF_INSTR) | + (1 << PI_PERF_LD_STALL) | + (1 << PI_PERF_JMP_STALL) | + (1 << PI_PERF_IMISS) | + (1 << PI_PERF_LD) | + (1 << PI_PERF_ST) | + (1 << PI_PERF_JUMP) | + (1 << PI_PERF_BRANCH) | + (1 << PI_PERF_TAKEN_BRANCH) | + (1 << PI_PERF_RVC) | + (1 << PI_PERF_LD_EXT) | + (1 << PI_PERF_ST_EXT) | + (1 << PI_PERF_LD_EXT_CYC) | + (1 << PI_PERF_ST_EXT_CYC) | + (1 << PI_PERF_TCDM_CONT) + ); +} + +// Start performance monitoring +static inline void perf_bench_start() { + pi_perf_reset(); + pi_perf_start(); +} + +// Stop performance monitoring +static inline void perf_bench_stop() { + pi_perf_stop(); +} + +// Read all performance counters into structure +static inline void perf_bench_read(perf_stats_t *stats) { + stats->cycles = pi_perf_read(PI_PERF_CYCLES); + stats->instr = pi_perf_read(PI_PERF_INSTR); + stats->ld = pi_perf_read(PI_PERF_LD); + stats->st = pi_perf_read(PI_PERF_ST); + stats->ld_stall = pi_perf_read(PI_PERF_LD_STALL); + stats->jmp_stall = pi_perf_read(PI_PERF_JMP_STALL); + stats->imiss = pi_perf_read(PI_PERF_IMISS); + stats->branch = pi_perf_read(PI_PERF_BRANCH); + stats->taken_branch = pi_perf_read(PI_PERF_TAKEN_BRANCH); + stats->rvc = pi_perf_read(PI_PERF_RVC); + stats->ld_ext = pi_perf_read(PI_PERF_LD_EXT); + stats->st_ext = 
pi_perf_read(PI_PERF_ST_EXT); + stats->ld_ext_cyc = pi_perf_read(PI_PERF_LD_EXT_CYC); + stats->st_ext_cyc = pi_perf_read(PI_PERF_ST_EXT_CYC); + stats->tcdm_cont = pi_perf_read(PI_PERF_TCDM_CONT); +} + +// Print performance statistics (core 0 only to avoid clutter) +static inline void perf_bench_print(const char *label, perf_stats_t *stats) { + if (pi_core_id() == 0) { + printf("\n=== Performance Statistics: %s ===\n", label); + printf("Cycles: %10u\n", stats->cycles); + printf("Instructions: %10u\n", stats->instr); + printf("IPC: %10.3f\n", + stats->cycles > 0 ? (float)stats->instr / stats->cycles : 0.0f); + printf("\n--- Instruction Mix ---\n"); + printf("Loads: %10u (%.2f%%)\n", stats->ld, + stats->instr > 0 ? 100.0f * stats->ld / stats->instr : 0.0f); + printf("Stores: %10u (%.2f%%)\n", stats->st, + stats->instr > 0 ? 100.0f * stats->st / stats->instr : 0.0f); + printf("Branches: %10u (%.2f%%)\n", stats->branch, + stats->instr > 0 ? 100.0f * stats->branch / stats->instr : 0.0f); + printf("Taken Branches: %10u (%.2f%%)\n", stats->taken_branch, + stats->branch > 0 ? 100.0f * stats->taken_branch / stats->branch : 0.0f); + printf("Compressed (RVC): %10u (%.2f%%)\n", stats->rvc, + stats->instr > 0 ? 100.0f * stats->rvc / stats->instr : 0.0f); + printf("\n--- Stalls & Hazards ---\n"); + printf("Load Stalls: %10u\n", stats->ld_stall); + printf("Jump Stalls: %10u\n", stats->jmp_stall); + printf("I-cache Misses: %10u\n", stats->imiss); + printf("TCDM Contentions: %10u\n", stats->tcdm_cont); + printf("\n--- Memory Hierarchy ---\n"); + printf("External Loads: %10u (%.2f%%)\n", stats->ld_ext, + stats->ld > 0 ? 100.0f * stats->ld_ext / stats->ld : 0.0f); + printf("External Stores: %10u (%.2f%%)\n", stats->st_ext, + stats->st > 0 ? 100.0f * stats->st_ext / stats->st : 0.0f); + printf("Ext Load Cycles: %10u (avg: %.2f)\n", stats->ld_ext_cyc, + stats->ld_ext > 0 ? 
(float)stats->ld_ext_cyc / stats->ld_ext : 0.0f); + printf("Ext Store Cycles: %10u (avg: %.2f)\n", stats->st_ext_cyc, + stats->st_ext > 0 ? (float)stats->st_ext_cyc / stats->st_ext : 0.0f); + printf("========================================\n\n"); + } +} + +// Compute difference between two stats (for analyzing specific code sections) +static inline void perf_bench_diff(perf_stats_t *result, + perf_stats_t *end, + perf_stats_t *start) { + result->cycles = end->cycles - start->cycles; + result->instr = end->instr - start->instr; + result->ld = end->ld - start->ld; + result->st = end->st - start->st; + result->ld_stall = end->ld_stall - start->ld_stall; + result->jmp_stall = end->jmp_stall - start->jmp_stall; + result->imiss = end->imiss - start->imiss; + result->branch = end->branch - start->branch; + result->taken_branch = end->taken_branch - start->taken_branch; + result->rvc = end->rvc - start->rvc; + result->ld_ext = end->ld_ext - start->ld_ext; + result->st_ext = end->st_ext - start->st_ext; + result->ld_ext_cyc = end->ld_ext_cyc - start->ld_ext_cyc; + result->st_ext_cyc = end->st_ext_cyc - start->st_ext_cyc; + result->tcdm_cont = end->tcdm_cont - start->tcdm_cont; +} + +#endif // __PERF_UTILS_H__ diff --git a/TargetLibraries/PULPOpen/src/Gemm.c b/TargetLibraries/PULPOpen/src/Gemm.c index 02fd991674..a46f8ac6ae 100644 --- a/TargetLibraries/PULPOpen/src/Gemm.c +++ b/TargetLibraries/PULPOpen/src/Gemm.c @@ -6,7 +6,6 @@ #include "DeeployPULPMath.h" #include "pmsis.h" -// #include "perf_utils.h" void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, @@ -18,16 +17,6 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, int8_t core_id = pi_core_id(); int8_t log2Core = LOG2(NUM_CORES); - //RW: Performance monitoring is currently disabled - // perf_stats_t perf_start, perf_end, perf_total; - - // // Initialize and start performance counters (only core 0) - // if (core_id == 0) { - // 
perf_bench_init(); - // perf_bench_start(); - // perf_bench_read(&perf_start); - // } - uint32_t M_chunk = (M >> log2Core) + ((M & (NUM_CORES - 1)) != 0); uint32_t M_start = MIN(core_id * M_chunk, M); uint32_t M_end = MIN(M_start + M_chunk, M); @@ -362,16 +351,4 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, } } } - - // RW: Stop performance counters and print results (only core 0) - // if (core_id == 0) { - // perf_bench_stop(); - // perf_bench_read(&perf_end); - // perf_bench_diff(&perf_total, &perf_end, &perf_start); - - // char label[100]; - // snprintf(label, sizeof(label), "GEMM M=%u N=%u O=%u transA=%u transB=%u", - // M, N, O, transA, transB); - // perf_bench_print(label, &perf_total); - // } } \ No newline at end of file From e7dd555f881b38542135f3e477850ef230ecaa7c Mon Sep 17 00:00:00 2001 From: Run Wang Date: Mon, 13 Apr 2026 09:29:43 +0000 Subject: [PATCH 03/11] Update pro microbenchmark codetransformation --- Deeploy/DeeployTypes.py | 1 + Deeploy/Targets/PULPOpen/Bindings.py | 7 +- .../PULPClusterTiling.py | 39 +------- .../PULPMicrobenchmark.py | 42 ++++++++ .../DoubleBufferingTilingCodeGeneration.py | 39 +------- .../SingleBufferingTilingCodeGeneration.py | 38 +------ .../TilingPrototypes.py | 99 ------------------- DeeployTest/testMVP.py | 6 ++ DeeployTest/testUtils/deeployRunner.py | 5 + DeeployTest/testUtils/testRunner.py | 5 + 10 files changed, 72 insertions(+), 209 deletions(-) create mode 100644 Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 797bd44c47..de5a66aae9 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -53,6 +53,7 @@ class CodeGenVerbosity: tilingProfiling: Optional[bool] = False # Specifies if we should profile the tiling code untiledProfiling: Optional[bool] = None # Specifies if we should profile the untilied code + microbenchmarkProfiling: Optional[bool] = False # Wrap each layer with PULP 
perf-counter microbenchmark _NoVerbosity = CodeGenVerbosity(None) diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 8a5f92ea2c..2c78978e23 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -24,6 +24,7 @@ from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling +from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark import PULPMicrobenchmark from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPProfileUntiled import PULPProfileUntiled from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack @@ -103,7 +104,7 @@ PULPSynchCoresPass(), ForkClosure(writeback = False, generateStruct = True), TilingVariableReplacementUpdate("L1"), - PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True), # Enable perf counters + PULPClusterTiling("L2", "L1", MchanDma()), ArgumentStructGeneration(), MemoryManagementGeneration("L1"), TilingVariableReplacement("L2"), @@ -115,13 +116,14 @@ MemoryManagementGeneration("L2"), MemoryManagementGeneration("L3.*"), MemoryManagementGeneration(), + PULPMicrobenchmark(), ]) ClusterTransformer = CodeTransformation([ TilingVariableReplacement("L1"), TilingCallClosure(writeback = False, generateStruct = True), TilingVariableReplacementUpdate("L1"), - PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True), # Enable perf counters + PULPClusterTiling("L2", "L1", MchanDma()), ArgumentStructGeneration(), MemoryManagementGeneration("L1"), TilingVariableReplacement("L2"), @@ -133,6 +135,7 @@ MemoryManagementGeneration("L2"), MemoryManagementGeneration("L3.*"), MemoryManagementGeneration(), + PULPMicrobenchmark(), ]) SimpleTransformer = 
CodeTransformation([ diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py index 59aec47a5d..3c0bba3107 100644 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py @@ -7,9 +7,9 @@ from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity from Deeploy.TilingExtension.AsyncDma import AsyncDma from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \ - DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn, ProfilingDoubleBufferingTilingMixIn + DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \ - PerfCounterSingleBufferingTilingMixIn, ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration + ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration class PULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration): @@ -28,38 +28,13 @@ class ProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration pass -class PerfCounterPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, PerfCounterSingleBufferingTilingMixIn): - """Single buffering with performance counter profiling""" - pass - - -class PerfCounterPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn): - """Double buffering with performance counter profiling""" - pass - - -class CombinedProfilingPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn, PerfCounterSingleBufferingTilingMixIn): - """Single buffering with both cycle profiling and performance counter profiling""" - pass - - -class 
CombinedProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn, PerfCounterDoubleBufferingTilingMixIn): - """Double buffering with both cycle profiling and performance counter profiling""" - pass - - class PULPClusterTiling(CodeTransformationPass): - def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, usePerfCounters: bool = False): - self.usePerfCounters = usePerfCounters + def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma): self.SB = PULPClusterTilingGenerationSB(externalMemory, localMemory, dma) self.profilingSB = ProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma) - self.perfCounterSB = PerfCounterPULPClusterTilingGenerationSB(externalMemory, localMemory, dma) - self.combinedProfilingSB = CombinedProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma) self.DB = PULPClusterTilingGenerationDB(externalMemory, localMemory, dma) self.profilingDB = ProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma) - self.perfCounterDB = PerfCounterPULPClusterTilingGenerationDB(externalMemory, localMemory, dma) - self.combinedProfilingDB = CombinedProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma) def apply(self, ctxt: NetworkContext, @@ -67,16 +42,10 @@ def apply(self, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - if self.usePerfCounters and verbose.tilingProfiling: - # Use combined profiling: cycle measurements + performance counter stats - ctxt, executionBlock = self.combinedProfilingSB.apply(ctxt, executionBlock, name) - ctxt, executionBlock = self.combinedProfilingDB.apply(ctxt, executionBlock, name) - elif verbose.tilingProfiling: - # Use cycle profiling only (basic cycle measurements) + if verbose.tilingProfiling: ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name) ctxt, executionBlock = self.profilingDB.apply(ctxt, 
executionBlock, name) else: - # No profiling ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name) ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name) diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py new file mode 100644 index 0000000000..bb35f32d47 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Tuple + +from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \ + NodeTemplate, _NoVerbosity + + +class PULPMicrobenchmark(CodeTransformationPass): + + _preTemplate = NodeTemplate(""" + perf_stats_t ${op}_perf_start, ${op}_perf_end, ${op}_perf_total; + if (pi_core_id() == 0) { + perf_bench_init(); + perf_bench_start(); + perf_bench_read(&${op}_perf_start); + } + """) + + _postTemplate = NodeTemplate(""" + if (pi_core_id() == 0) { + perf_bench_stop(); + perf_bench_read(&${op}_perf_end); + perf_bench_diff(&${op}_perf_total, &${op}_perf_end, &${op}_perf_start); + perf_bench_print("${op}", &${op}_perf_total); + } + """) + + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + + if not verbose.microbenchmarkProfiling: + return ctxt, executionBlock + + executionBlock.addLeft(self._preTemplate, {"op": name}) + executionBlock.addRight(self._postTemplate, {"op": name}) + return ctxt, executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py index ce9ec86f27..ad9c6ad012 100644 --- 
a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py @@ -11,8 +11,8 @@ from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, Future from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \ - ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo +from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \ + PrototypeTilingMixIn, TilingMetaInfo from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, stridesFromShape @@ -364,38 +364,3 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements, _egressDMAStatements, closeLoopStatements) return executionBlock - -class PerfCounterDoubleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn): - """ - Double buffering tiling with performance counter profiling. - Provides detailed instruction-level statistics for each tile. 
- """ - - @classmethod - def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - setupStatements: List[CodeSnippet], - teardownStatements: List[CodeSnippet]) -> ExecutionBlock: - - executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, - teardownStatements) - - # Inject performance counter initialization in setup (only once, not per-tile) - executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo) - - # Inject performance counter stop and print in teardown (only once, not per-tile) - executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo) - - return executionBlock - - @classmethod - def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet], - egressDMAStatements: List[CodeSnippet], - closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock: - - # Don't wrap kernel - perf counters measure the whole tiling loop, not individual tiles - # executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo) - - executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements, - egressDMAStatements, closeLoopStatements) - return executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py index e4bb803611..d234776b57 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py @@ -10,8 +10,8 @@ from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration from 
Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \ - ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo +from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \ + PrototypeTilingMixIn, TilingMetaInfo from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme @@ -193,37 +193,3 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn return executionBlock -class PerfCounterSingleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn): - """ - Single buffering tiling with performance counter profiling. - Provides detailed instruction-level statistics for each tile. - """ - - @classmethod - def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - setupStatements: List[CodeSnippet], - teardownStatements: List[CodeSnippet]) -> ExecutionBlock: - - executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, - teardownStatements) - - # Inject performance counter initialization in setup (only once, not per-tile) - executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo) - - # Inject performance counter stop and print in teardown (only once, not per-tile) - executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo) - - return executionBlock - - @classmethod - def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet], - egressDMAStatements: List[CodeSnippet], - closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock: - - # Don't wrap kernel - perf counters measure the whole tiling 
loop, not individual tiles - # executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo) - - executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements, - egressDMAStatements, closeLoopStatements) - return executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py index 70aabd9805..09a4ef56eb 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py @@ -64,105 +64,6 @@ def generateAllTilingCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingM return executionBlock -class PerfCounterProfilingMixIn(ABC): - """ - MixIn for injecting performance counter profiling code. - Provides detailed instruction-level statistics using CSR performance counters. - """ - - _perfCounterInit = NodeTemplate(""" - perf_stats_t ${nodeName}_perf_start, ${nodeName}_perf_end, ${nodeName}_perf_total; - if (pi_core_id() == 0) { - perf_bench_init(); - perf_bench_start(); - perf_bench_read(&${nodeName}_perf_start); - } - """) - - _perfCounterStop = NodeTemplate(""" - if (pi_core_id() == 0) { - perf_bench_stop(); - perf_bench_read(&${nodeName}_perf_end); - perf_bench_diff(&${nodeName}_perf_total, &${nodeName}_perf_end, &${nodeName}_perf_start); - perf_bench_print("${nodeName}", &${nodeName}_perf_total); - } - """) - - _perfCounterKernelStart = NodeTemplate(""" - if (pi_core_id() == 0) { - perf_bench_start(); - perf_bench_read(&${nodeName}_perf_kernel_start); - } - """) - - _perfCounterKernelEnd = NodeTemplate(""" - if (pi_core_id() == 0) { - perf_bench_stop(); - perf_bench_read(&${nodeName}_perf_kernel_end); - perf_bench_diff(&${nodeName}_perf_kernel_total, &${nodeName}_perf_kernel_end, &${nodeName}_perf_kernel_start); - perf_bench_print("${nodeName} Kernel", &${nodeName}_perf_kernel_total); - } - """) - - 
_perfCounterKernelDecl = NodeTemplate(""" - perf_stats_t ${nodeName}_perf_kernel_start, ${nodeName}_perf_kernel_end, ${nodeName}_perf_kernel_total; - """) - - @classmethod - def injectPerfCounterInit(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock: - """ - Inject performance counter initialization at the beginning of the node execution. - This should be called in the setup phase. - """ - nodeName = metaInfo.nodeName - - executionBlock.addLeft(cls._perfCounterInit, { - "nodeName": nodeName, - }) - - return executionBlock - - @classmethod - def injectPerfCounterStop(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock: - """ - Inject performance counter stop and print at the end of the node execution. - This should be called in the teardown phase. - """ - nodeName = metaInfo.nodeName - - executionBlock.addRight(cls._perfCounterStop, { - "nodeName": nodeName, - }) - - return executionBlock - - @classmethod - def injectPerfCounterKernelWrap(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock: - """ - Wrap the kernel execution with performance counter measurements. - This provides detailed statistics for just the kernel computation (excluding DMA). 
- """ - nodeName = metaInfo.nodeName - - if metaInfo.kernelLevelTiling: - # Add declaration at the beginning - executionBlock.addLeft(cls._perfCounterKernelDecl, { - "nodeName": nodeName, - }) - - # Add start measurement before kernel - executionBlock.addLeft(cls._perfCounterKernelStart, { - "nodeName": nodeName, - }) - - # Add stop and print after kernel - executionBlock.addRight(cls._perfCounterKernelEnd, { - "nodeName": nodeName, - }) - - return executionBlock - - class ProfilingPrototypeMixIn(ABC): _measureCycles = NodeTemplate(""" ${measurements}[${tileIdxVar}] = getCycles(); diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 69e04343ff..15fb6a2ac5 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -203,6 +203,9 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg - min: Initalize all variables at their minimal value. """) parser.add_argument('--profileTiling', action = "store_true") + parser.add_argument('--profileMicrobenchmark', + action = "store_true", + help = 'Wrap each layer with PULP perf-counter microbenchmark instrumentation') parser.add_argument('--plotMemAlloc', action = 'store_true', help = 'Turn on plotting of the memory allocation and save it in the deeployState folder\n') @@ -224,6 +227,9 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg if args.profileTiling: verbosityCfg.tilingProfiling = True + if args.profileMicrobenchmark: + verbosityCfg.microbenchmarkProfiling = True + onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') graph = gs.import_onnx(onnx_graph) diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index a5a8d70ef3..3c03e5b4c3 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -143,6 +143,9 @@ def __init__(self, action = "store_true", help = 'Enable randomized memory scheduler\n') self.add_argument('--profileTiling', action = 'store_true', 
help = 'Enable tiling profiling\n') + self.add_argument('--profileMicrobenchmark', + action = 'store_true', + help = 'Wrap each layer with PULP perf-counter microbenchmark\n') self.add_argument('--memAllocStrategy', metavar = '', dest = 'memAllocStrategy', @@ -225,6 +228,8 @@ def create_config_from_args(args: argparse.Namespace, gen_args_list.append("--randomizedMemoryScheduler") if hasattr(args, 'profileTiling') and args.profileTiling: gen_args_list.append("--profileTiling") + if hasattr(args, 'profileMicrobenchmark') and args.profileMicrobenchmark: + gen_args_list.append("--profileMicrobenchmark") if hasattr(args, 'memAllocStrategy') and args.memAllocStrategy: gen_args_list.append(f"--memAllocStrategy={args.memAllocStrategy}") if hasattr(args, 'searchStrategy') and args.searchStrategy: diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index 9578c2f26c..18a4f870ac 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -211,6 +211,9 @@ def __init__(self, tiling_arguments: bool, description = None): action = "store_true", help = 'Enable randomized memory scheduler\n') self.add_argument('--profileTiling', action = 'store_true', help = 'Enable tiling profiling\n') + self.add_argument('--profileMicrobenchmark', + action = 'store_true', + help = 'Wrap each layer with PULP perf-counter microbenchmark\n') self.add_argument('--memAllocStrategy', metavar = 'memAllocStrategy', dest = 'memAllocStrategy', @@ -271,6 +274,8 @@ def generate_cmd_args(self) -> str: command += " --randomizedMemoryScheduler" if self.args.profileTiling: command += f" --profileTiling" + if self.args.profileMicrobenchmark: + command += f" --profileMicrobenchmark" if self.args.memAllocStrategy: command += f" --memAllocStrategy={self.args.memAllocStrategy}" if self.args.plotMemAlloc: From ec5df1bf66359561a3b4c255852e90b5170b474c Mon Sep 17 00:00:00 2001 From: Run Wang Date: Mon, 13 Apr 2026 10:57:38 +0000 Subject: [PATCH 04/11] 
Add helper function for profileMicrobenchmark --- DeeployTest/generateNetwork.py | 6 ++++++ DeeployTest/testMVP.py | 2 +- DeeployTest/testUtils/deeployRunner.py | 14 +++++++++----- DeeployTest/testUtils/pytestRunner.py | 3 +++ 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py index f029be7361..0b25bc6bbe 100644 --- a/DeeployTest/generateNetwork.py +++ b/DeeployTest/generateNetwork.py @@ -141,6 +141,7 @@ def generateNetwork(args): verbosityCfg = _NoVerbosity if isinstance(platform, PULPPlatform): verbosityCfg.untiledProfiling = args.profileUntiled + verbosityCfg.microbenchmarkProfiling = args.profileMicrobenchmark # Parse graph and infer output levels and signedness _ = deployer.prepare(verbosityCfg) @@ -172,6 +173,11 @@ def generateNetwork(args): dest = 'profileUntiled', default = False, help = 'Profile Untiled for L2\n') + parser.add_argument('--profileMicrobenchmark', + action = 'store_true', + dest = 'profileMicrobenchmark', + default = False, + help = 'Wrap each layer with PULP perf-counter microbenchmark\n') parser.add_argument('--input-type-map', nargs = '*', default = [], diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 15fb6a2ac5..9678bc4e4f 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -202,7 +202,7 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg - max: Initalize all variables at their maximal value. - min: Initalize all variables at their minimal value. 
""") - parser.add_argument('--profileTiling', action = "store_true") + parser.add_argument('--profileTiling', action = "store_true", help = 'Enable tiling profiling') parser.add_argument('--profileMicrobenchmark', action = "store_true", help = 'Wrap each layer with PULP perf-counter microbenchmark instrumentation') diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index 3c03e5b4c3..fbbd95703e 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -94,6 +94,12 @@ def __init__(self, action = 'store_true', default = False, help = 'Enable untiled profiling (Siracusa only)\n') + self.add_argument('--profileMicrobenchmark', + '--profile-microbenchmark', + dest = 'profileMicrobenchmark', + action = 'store_true', + default = False, + help = 'Wrap each layer with PULP perf-counter microbenchmark\n') self.add_argument('--toolchain', metavar = '', dest = 'toolchain', @@ -143,9 +149,6 @@ def __init__(self, action = "store_true", help = 'Enable randomized memory scheduler\n') self.add_argument('--profileTiling', action = 'store_true', help = 'Enable tiling profiling\n') - self.add_argument('--profileMicrobenchmark', - action = 'store_true', - help = 'Wrap each layer with PULP perf-counter microbenchmark\n') self.add_argument('--memAllocStrategy', metavar = '', dest = 'memAllocStrategy', @@ -228,8 +231,6 @@ def create_config_from_args(args: argparse.Namespace, gen_args_list.append("--randomizedMemoryScheduler") if hasattr(args, 'profileTiling') and args.profileTiling: gen_args_list.append("--profileTiling") - if hasattr(args, 'profileMicrobenchmark') and args.profileMicrobenchmark: - gen_args_list.append("--profileMicrobenchmark") if hasattr(args, 'memAllocStrategy') and args.memAllocStrategy: gen_args_list.append(f"--memAllocStrategy={args.memAllocStrategy}") if hasattr(args, 'searchStrategy') and args.searchStrategy: @@ -240,6 +241,9 @@ def create_config_from_args(args: argparse.Namespace, 
if not tiling and getattr(args, 'profileUntiled', False): gen_args_list.append("--profileUntiled") + if getattr(args, 'profileMicrobenchmark', False): + gen_args_list.append("--profileMicrobenchmark") + config = DeeployTestConfig( test_name = test_name, test_dir = test_dir_abs, diff --git a/DeeployTest/testUtils/pytestRunner.py b/DeeployTest/testUtils/pytestRunner.py index 472d8cfed9..29119bba6f 100644 --- a/DeeployTest/testUtils/pytestRunner.py +++ b/DeeployTest/testUtils/pytestRunner.py @@ -45,6 +45,7 @@ def create_test_config( mem_alloc_strategy: str = "MiniMalloc", search_strategy: str = "random-max", profile_tiling: bool = False, + profile_microbenchmark: bool = False, plot_mem_alloc: bool = False, randomized_mem_scheduler: bool = False, profile_untiled: bool = False, @@ -86,6 +87,8 @@ def create_test_config( gen_args_list.append(f"--searchStrategy={search_strategy}") if profile_tiling: gen_args_list.append("--profileTiling") + if profile_microbenchmark: + gen_args_list.append("--profileMicrobenchmark") if plot_mem_alloc: gen_args_list.append("--plotMemAlloc") if randomized_mem_scheduler: From 3d1b6afbadd104cd997d1aa6ef39ad6db07c3db8 Mon Sep 17 00:00:00 2001 From: Run Wang Date: Mon, 13 Apr 2026 10:59:30 +0000 Subject: [PATCH 05/11] perf-util add pre-commit --- TargetLibraries/PULPOpen/inc/perf_utils.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/TargetLibraries/PULPOpen/inc/perf_utils.h b/TargetLibraries/PULPOpen/inc/perf_utils.h index 2d9fbc39c6..dc0a78c5e2 100644 --- a/TargetLibraries/PULPOpen/inc/perf_utils.h +++ b/TargetLibraries/PULPOpen/inc/perf_utils.h @@ -1,4 +1,8 @@ /* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + * * Performance Counter Utilities for PULP Benchmarking */ From a77284376faf9cbbcdafde2643822c0d81498748 Mon Sep 17 00:00:00 2001 From: Run Wang Date: Mon, 13 Apr 2026 11:06:22 +0000 Subject: [PATCH 06/11] Rebase singlebuffertilingcodegeneration --- 
.../SingleBufferingTilingCodeGeneration.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py index d234776b57..ea1e938b58 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py @@ -191,5 +191,3 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements, _egressDMAStatements, closeLoopStatements) return executionBlock - - From 4d03bf165847c1fb810bf79b621ad126af9cc1f5 Mon Sep 17 00:00:00 2001 From: Victor Jung Date: Wed, 8 Apr 2026 13:56:44 +0200 Subject: [PATCH 07/11] Make workspace safe to prevent "dubious ownership" sporadic issues --- .github/workflows/_runner-chimera.yml | 2 ++ .github/workflows/_runner-cortexm.yml | 2 ++ .github/workflows/_runner-gap9-tiled.yml | 2 ++ .github/workflows/_runner-gap9.yml | 2 ++ .github/workflows/_runner-generic.yml | 2 ++ .github/workflows/_runner-mempool.yml | 2 ++ .github/workflows/_runner-siracusa-neureka-tiled.yml | 2 ++ .github/workflows/_runner-siracusa-tiled.yml | 2 ++ .github/workflows/_runner-siracusa.yml | 2 ++ .github/workflows/_runner-snitch-tiled-sequential.yml | 2 ++ .github/workflows/_runner-snitch.yml | 2 ++ .github/workflows/_runner-softhier.yml | 2 ++ .github/workflows/ci-deeploy.yml | 4 ++++ .github/workflows/infra-generate-ccache-gap9.yml | 2 ++ .github/workflows/infra-generate-ccache.yml | 2 ++ 15 files changed, 32 insertions(+) diff --git a/.github/workflows/_runner-chimera.yml b/.github/workflows/_runner-chimera.yml index 14e80631d1..c642bfe6d2 100644 --- a/.github/workflows/_runner-chimera.yml +++ b/.github/workflows/_runner-chimera.yml @@ -24,6 +24,8 @@ jobs: 
container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-cortexm.yml b/.github/workflows/_runner-cortexm.yml index 3fbdf0ee16..c6be8af465 100644 --- a/.github/workflows/_runner-cortexm.yml +++ b/.github/workflows/_runner-cortexm.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-gap9-tiled.yml b/.github/workflows/_runner-gap9-tiled.yml index a5c8b3ac98..6934014447 100644 --- a/.github/workflows/_runner-gap9-tiled.yml +++ b/.github/workflows/_runner-gap9-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-gap9.yml b/.github/workflows/_runner-gap9.yml index e1d6e452a6..cc790d3d33 100644 --- a/.github/workflows/_runner-gap9.yml +++ b/.github/workflows/_runner-gap9.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-generic.yml b/.github/workflows/_runner-generic.yml index 6681cbac96..b44b47f73d 100644 --- a/.github/workflows/_runner-generic.yml +++ b/.github/workflows/_runner-generic.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-mempool.yml 
b/.github/workflows/_runner-mempool.yml index deb4809330..b2f0ae4f7a 100644 --- a/.github/workflows/_runner-mempool.yml +++ b/.github/workflows/_runner-mempool.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa-neureka-tiled.yml b/.github/workflows/_runner-siracusa-neureka-tiled.yml index b1f5f2fcb3..664d5f01be 100644 --- a/.github/workflows/_runner-siracusa-neureka-tiled.yml +++ b/.github/workflows/_runner-siracusa-neureka-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml index ea9c8989af..cc09f234e0 100644 --- a/.github/workflows/_runner-siracusa-tiled.yml +++ b/.github/workflows/_runner-siracusa-tiled.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-siracusa.yml b/.github/workflows/_runner-siracusa.yml index ea8fe5d405..1c51333f7a 100644 --- a/.github/workflows/_runner-siracusa.yml +++ b/.github/workflows/_runner-siracusa.yml @@ -25,6 +25,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-snitch-tiled-sequential.yml b/.github/workflows/_runner-snitch-tiled-sequential.yml index fbd5195b08..bcdd58a166 100644 --- a/.github/workflows/_runner-snitch-tiled-sequential.yml +++ 
b/.github/workflows/_runner-snitch-tiled-sequential.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-snitch.yml b/.github/workflows/_runner-snitch.yml index bc599e4fe7..48130ea26a 100644 --- a/.github/workflows/_runner-snitch.yml +++ b/.github/workflows/_runner-snitch.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/_runner-softhier.yml b/.github/workflows/_runner-softhier.yml index b067664f40..2624cbe15d 100644 --- a/.github/workflows/_runner-softhier.yml +++ b/.github/workflows/_runner-softhier.yml @@ -24,6 +24,8 @@ jobs: container: image: ${{ inputs.docker-image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/ci-deeploy.yml b/.github/workflows/ci-deeploy.yml index fc468306b1..84f2779e4c 100644 --- a/.github/workflows/ci-deeploy.yml +++ b/.github/workflows/ci-deeploy.yml @@ -35,6 +35,8 @@ jobs: container: image: ${{ needs.select-env.outputs.image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: @@ -49,6 +51,8 @@ jobs: container: image: ${{ needs.select-env.outputs.image }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/infra-generate-ccache-gap9.yml b/.github/workflows/infra-generate-ccache-gap9.yml index f9010adb1d..5b456e1d64 100644 --- a/.github/workflows/infra-generate-ccache-gap9.yml +++ 
b/.github/workflows/infra-generate-ccache-gap9.yml @@ -23,6 +23,8 @@ jobs: container: image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: diff --git a/.github/workflows/infra-generate-ccache.yml b/.github/workflows/infra-generate-ccache.yml index bd6e2c7787..e2d54eaa83 100644 --- a/.github/workflows/infra-generate-ccache.yml +++ b/.github/workflows/infra-generate-ccache.yml @@ -22,6 +22,8 @@ jobs: container: image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy:devel' }} steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' - name: Checkout Repo uses: actions/checkout@v4 with: From c43fd5e76278adf172f697bda3150e3acf824075 Mon Sep 17 00:00:00 2001 From: Run Wang Date: Mon, 13 Apr 2026 11:10:24 +0000 Subject: [PATCH 08/11] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38c5b3ce35..42281c6f0a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ### List of Pull Requests +- Add Microbenchmarking Infrastructure and CI Using GVSoC CSR [#162](https://github.com/pulp-platform/Deeploy/pull/162) - Fix CI Cache Generation [#176](https://github.com/pulp-platform/Deeploy/pull/176) - Fix Broken CI [#175](https://github.com/pulp-platform/Deeploy/pull/175) - Improve Docstring and Debugging [#160](https://github.com/pulp-platform/Deeploy/pull/160) @@ -23,6 +24,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Shell Format pre-commit hook - Add integer MaxPool1D for Generic platform and RQSConv1D support for PULPOpen, with corresponding kernel tests. 
- Added GAP9 Platform Support: Deployer, Bindings, Templates, Tiler, DMA (L3Dma/MchanDma), target library, CI workflows +- Per-layer microbenchmarking on PULPOpen via `--profileMicrobenchmark`: new `PULPMicrobenchmark` code-transformation pass + `perf_utils.h` helpers report cycles, instructions, stalls and cache misses per layer in `RunNetwork` ### Changed - Use by default `devel` container for GAP9 CI From 87d8115a7c246b89de4e6837724f1c870597d0b3 Mon Sep 17 00:00:00 2001 From: Run Wang Date: Mon, 13 Apr 2026 11:19:26 +0000 Subject: [PATCH 09/11] Fix linting --- Deeploy/Targets/PULPOpen/Platform.py | 3 +- TargetLibraries/PULPOpen/inc/perf_utils.h | 229 ++++++++++------------ 2 files changed, 110 insertions(+), 122 deletions(-) diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 11b9747526..f13e6451fb 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -248,7 +248,8 @@ class PULPStructBuffer(StructBuffer): # SCHEREMO: stdint is included before pulp_nn_kernels.h because it is supposed to be included in there, but isn't... 
_includeList = [ - "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h", "perf_utils.h" + "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h", + "perf_utils.h" ] diff --git a/TargetLibraries/PULPOpen/inc/perf_utils.h b/TargetLibraries/PULPOpen/inc/perf_utils.h index dc0a78c5e2..c710402ed2 100644 --- a/TargetLibraries/PULPOpen/inc/perf_utils.h +++ b/TargetLibraries/PULPOpen/inc/perf_utils.h @@ -12,151 +12,138 @@ #include "pmsis.h" // Performance event IDs (compatible with PMSIS) -#define PI_PERF_CYCLES CSR_PCER_CYCLES -#define PI_PERF_INSTR CSR_PCER_INSTR -#define PI_PERF_LD_STALL CSR_PCER_LD_STALL -#define PI_PERF_JMP_STALL CSR_PCER_JMP_STALL -#define PI_PERF_IMISS CSR_PCER_IMISS -#define PI_PERF_LD CSR_PCER_LD -#define PI_PERF_ST CSR_PCER_ST -#define PI_PERF_JUMP CSR_PCER_JUMP -#define PI_PERF_BRANCH CSR_PCER_BRANCH -#define PI_PERF_TAKEN_BRANCH CSR_PCER_TAKEN_BRANCH -#define PI_PERF_RVC CSR_PCER_RVC -#define PI_PERF_LD_EXT CSR_PCER_LD_EXT -#define PI_PERF_ST_EXT CSR_PCER_ST_EXT -#define PI_PERF_LD_EXT_CYC CSR_PCER_LD_EXT_CYC -#define PI_PERF_ST_EXT_CYC CSR_PCER_ST_EXT_CYC -#define PI_PERF_TCDM_CONT CSR_PCER_TCDM_CONT +#define PI_PERF_CYCLES CSR_PCER_CYCLES +#define PI_PERF_INSTR CSR_PCER_INSTR +#define PI_PERF_LD_STALL CSR_PCER_LD_STALL +#define PI_PERF_JMP_STALL CSR_PCER_JMP_STALL +#define PI_PERF_IMISS CSR_PCER_IMISS +#define PI_PERF_LD CSR_PCER_LD +#define PI_PERF_ST CSR_PCER_ST +#define PI_PERF_JUMP CSR_PCER_JUMP +#define PI_PERF_BRANCH CSR_PCER_BRANCH +#define PI_PERF_TAKEN_BRANCH CSR_PCER_TAKEN_BRANCH +#define PI_PERF_RVC CSR_PCER_RVC +#define PI_PERF_LD_EXT CSR_PCER_LD_EXT +#define PI_PERF_ST_EXT CSR_PCER_ST_EXT +#define PI_PERF_LD_EXT_CYC CSR_PCER_LD_EXT_CYC +#define PI_PERF_ST_EXT_CYC CSR_PCER_ST_EXT_CYC +#define PI_PERF_TCDM_CONT CSR_PCER_TCDM_CONT // Benchmark statistics structure typedef struct { - unsigned int cycles; - unsigned int 
instr; - unsigned int ld; - unsigned int st; - unsigned int ld_stall; - unsigned int jmp_stall; - unsigned int imiss; - unsigned int branch; - unsigned int taken_branch; - unsigned int rvc; - unsigned int ld_ext; - unsigned int st_ext; - unsigned int ld_ext_cyc; - unsigned int st_ext_cyc; - unsigned int tcdm_cont; + unsigned int cycles; + unsigned int instr; + unsigned int ld; + unsigned int st; + unsigned int ld_stall; + unsigned int jmp_stall; + unsigned int imiss; + unsigned int branch; + unsigned int taken_branch; + unsigned int rvc; + unsigned int ld_ext; + unsigned int st_ext; + unsigned int ld_ext_cyc; + unsigned int st_ext_cyc; + unsigned int tcdm_cont; } perf_stats_t; // Initialize performance counters for comprehensive benchmarking static inline void perf_bench_init() { - // Enable all performance counters - pi_perf_conf( - (1 << PI_PERF_CYCLES) | - (1 << PI_PERF_INSTR) | - (1 << PI_PERF_LD_STALL) | - (1 << PI_PERF_JMP_STALL) | - (1 << PI_PERF_IMISS) | - (1 << PI_PERF_LD) | - (1 << PI_PERF_ST) | - (1 << PI_PERF_JUMP) | - (1 << PI_PERF_BRANCH) | - (1 << PI_PERF_TAKEN_BRANCH) | - (1 << PI_PERF_RVC) | - (1 << PI_PERF_LD_EXT) | - (1 << PI_PERF_ST_EXT) | - (1 << PI_PERF_LD_EXT_CYC) | - (1 << PI_PERF_ST_EXT_CYC) | - (1 << PI_PERF_TCDM_CONT) - ); + // Enable all performance counters + pi_perf_conf( + (1 << PI_PERF_CYCLES) | (1 << PI_PERF_INSTR) | (1 << PI_PERF_LD_STALL) | + (1 << PI_PERF_JMP_STALL) | (1 << PI_PERF_IMISS) | (1 << PI_PERF_LD) | + (1 << PI_PERF_ST) | (1 << PI_PERF_JUMP) | (1 << PI_PERF_BRANCH) | + (1 << PI_PERF_TAKEN_BRANCH) | (1 << PI_PERF_RVC) | (1 << PI_PERF_LD_EXT) | + (1 << PI_PERF_ST_EXT) | (1 << PI_PERF_LD_EXT_CYC) | + (1 << PI_PERF_ST_EXT_CYC) | (1 << PI_PERF_TCDM_CONT)); } // Start performance monitoring static inline void perf_bench_start() { - pi_perf_reset(); - pi_perf_start(); + pi_perf_reset(); + pi_perf_start(); } // Stop performance monitoring -static inline void perf_bench_stop() { - pi_perf_stop(); -} +static inline void 
perf_bench_stop() { pi_perf_stop(); } // Read all performance counters into structure static inline void perf_bench_read(perf_stats_t *stats) { - stats->cycles = pi_perf_read(PI_PERF_CYCLES); - stats->instr = pi_perf_read(PI_PERF_INSTR); - stats->ld = pi_perf_read(PI_PERF_LD); - stats->st = pi_perf_read(PI_PERF_ST); - stats->ld_stall = pi_perf_read(PI_PERF_LD_STALL); - stats->jmp_stall = pi_perf_read(PI_PERF_JMP_STALL); - stats->imiss = pi_perf_read(PI_PERF_IMISS); - stats->branch = pi_perf_read(PI_PERF_BRANCH); - stats->taken_branch = pi_perf_read(PI_PERF_TAKEN_BRANCH); - stats->rvc = pi_perf_read(PI_PERF_RVC); - stats->ld_ext = pi_perf_read(PI_PERF_LD_EXT); - stats->st_ext = pi_perf_read(PI_PERF_ST_EXT); - stats->ld_ext_cyc = pi_perf_read(PI_PERF_LD_EXT_CYC); - stats->st_ext_cyc = pi_perf_read(PI_PERF_ST_EXT_CYC); - stats->tcdm_cont = pi_perf_read(PI_PERF_TCDM_CONT); + stats->cycles = pi_perf_read(PI_PERF_CYCLES); + stats->instr = pi_perf_read(PI_PERF_INSTR); + stats->ld = pi_perf_read(PI_PERF_LD); + stats->st = pi_perf_read(PI_PERF_ST); + stats->ld_stall = pi_perf_read(PI_PERF_LD_STALL); + stats->jmp_stall = pi_perf_read(PI_PERF_JMP_STALL); + stats->imiss = pi_perf_read(PI_PERF_IMISS); + stats->branch = pi_perf_read(PI_PERF_BRANCH); + stats->taken_branch = pi_perf_read(PI_PERF_TAKEN_BRANCH); + stats->rvc = pi_perf_read(PI_PERF_RVC); + stats->ld_ext = pi_perf_read(PI_PERF_LD_EXT); + stats->st_ext = pi_perf_read(PI_PERF_ST_EXT); + stats->ld_ext_cyc = pi_perf_read(PI_PERF_LD_EXT_CYC); + stats->st_ext_cyc = pi_perf_read(PI_PERF_ST_EXT_CYC); + stats->tcdm_cont = pi_perf_read(PI_PERF_TCDM_CONT); } // Print performance statistics (core 0 only to avoid clutter) static inline void perf_bench_print(const char *label, perf_stats_t *stats) { - if (pi_core_id() == 0) { - printf("\n=== Performance Statistics: %s ===\n", label); - printf("Cycles: %10u\n", stats->cycles); - printf("Instructions: %10u\n", stats->instr); - printf("IPC: %10.3f\n", - stats->cycles > 0 ? 
(float)stats->instr / stats->cycles : 0.0f); - printf("\n--- Instruction Mix ---\n"); - printf("Loads: %10u (%.2f%%)\n", stats->ld, - stats->instr > 0 ? 100.0f * stats->ld / stats->instr : 0.0f); - printf("Stores: %10u (%.2f%%)\n", stats->st, - stats->instr > 0 ? 100.0f * stats->st / stats->instr : 0.0f); - printf("Branches: %10u (%.2f%%)\n", stats->branch, - stats->instr > 0 ? 100.0f * stats->branch / stats->instr : 0.0f); - printf("Taken Branches: %10u (%.2f%%)\n", stats->taken_branch, - stats->branch > 0 ? 100.0f * stats->taken_branch / stats->branch : 0.0f); - printf("Compressed (RVC): %10u (%.2f%%)\n", stats->rvc, - stats->instr > 0 ? 100.0f * stats->rvc / stats->instr : 0.0f); - printf("\n--- Stalls & Hazards ---\n"); - printf("Load Stalls: %10u\n", stats->ld_stall); - printf("Jump Stalls: %10u\n", stats->jmp_stall); - printf("I-cache Misses: %10u\n", stats->imiss); - printf("TCDM Contentions: %10u\n", stats->tcdm_cont); - printf("\n--- Memory Hierarchy ---\n"); - printf("External Loads: %10u (%.2f%%)\n", stats->ld_ext, - stats->ld > 0 ? 100.0f * stats->ld_ext / stats->ld : 0.0f); - printf("External Stores: %10u (%.2f%%)\n", stats->st_ext, - stats->st > 0 ? 100.0f * stats->st_ext / stats->st : 0.0f); - printf("Ext Load Cycles: %10u (avg: %.2f)\n", stats->ld_ext_cyc, - stats->ld_ext > 0 ? (float)stats->ld_ext_cyc / stats->ld_ext : 0.0f); - printf("Ext Store Cycles: %10u (avg: %.2f)\n", stats->st_ext_cyc, - stats->st_ext > 0 ? (float)stats->st_ext_cyc / stats->st_ext : 0.0f); - printf("========================================\n\n"); - } + if (pi_core_id() == 0) { + printf("\n=== Performance Statistics: %s ===\n", label); + printf("Cycles: %10u\n", stats->cycles); + printf("Instructions: %10u\n", stats->instr); + printf("IPC: %10.3f\n", + stats->cycles > 0 ? (float)stats->instr / stats->cycles : 0.0f); + printf("\n--- Instruction Mix ---\n"); + printf("Loads: %10u (%.2f%%)\n", stats->ld, + stats->instr > 0 ? 
100.0f * stats->ld / stats->instr : 0.0f); + printf("Stores: %10u (%.2f%%)\n", stats->st, + stats->instr > 0 ? 100.0f * stats->st / stats->instr : 0.0f); + printf("Branches: %10u (%.2f%%)\n", stats->branch, + stats->instr > 0 ? 100.0f * stats->branch / stats->instr : 0.0f); + printf("Taken Branches: %10u (%.2f%%)\n", stats->taken_branch, + stats->branch > 0 ? 100.0f * stats->taken_branch / stats->branch + : 0.0f); + printf("Compressed (RVC): %10u (%.2f%%)\n", stats->rvc, + stats->instr > 0 ? 100.0f * stats->rvc / stats->instr : 0.0f); + printf("\n--- Stalls & Hazards ---\n"); + printf("Load Stalls: %10u\n", stats->ld_stall); + printf("Jump Stalls: %10u\n", stats->jmp_stall); + printf("I-cache Misses: %10u\n", stats->imiss); + printf("TCDM Contentions: %10u\n", stats->tcdm_cont); + printf("\n--- Memory Hierarchy ---\n"); + printf("External Loads: %10u (%.2f%%)\n", stats->ld_ext, + stats->ld > 0 ? 100.0f * stats->ld_ext / stats->ld : 0.0f); + printf("External Stores: %10u (%.2f%%)\n", stats->st_ext, + stats->st > 0 ? 100.0f * stats->st_ext / stats->st : 0.0f); + printf("Ext Load Cycles: %10u (avg: %.2f)\n", stats->ld_ext_cyc, + stats->ld_ext > 0 ? (float)stats->ld_ext_cyc / stats->ld_ext : 0.0f); + printf("Ext Store Cycles: %10u (avg: %.2f)\n", stats->st_ext_cyc, + stats->st_ext > 0 ? 
(float)stats->st_ext_cyc / stats->st_ext : 0.0f); + printf("========================================\n\n"); + } } // Compute difference between two stats (for analyzing specific code sections) -static inline void perf_bench_diff(perf_stats_t *result, - perf_stats_t *end, - perf_stats_t *start) { - result->cycles = end->cycles - start->cycles; - result->instr = end->instr - start->instr; - result->ld = end->ld - start->ld; - result->st = end->st - start->st; - result->ld_stall = end->ld_stall - start->ld_stall; - result->jmp_stall = end->jmp_stall - start->jmp_stall; - result->imiss = end->imiss - start->imiss; - result->branch = end->branch - start->branch; - result->taken_branch = end->taken_branch - start->taken_branch; - result->rvc = end->rvc - start->rvc; - result->ld_ext = end->ld_ext - start->ld_ext; - result->st_ext = end->st_ext - start->st_ext; - result->ld_ext_cyc = end->ld_ext_cyc - start->ld_ext_cyc; - result->st_ext_cyc = end->st_ext_cyc - start->st_ext_cyc; - result->tcdm_cont = end->tcdm_cont - start->tcdm_cont; +static inline void perf_bench_diff(perf_stats_t *result, perf_stats_t *end, + perf_stats_t *start) { + result->cycles = end->cycles - start->cycles; + result->instr = end->instr - start->instr; + result->ld = end->ld - start->ld; + result->st = end->st - start->st; + result->ld_stall = end->ld_stall - start->ld_stall; + result->jmp_stall = end->jmp_stall - start->jmp_stall; + result->imiss = end->imiss - start->imiss; + result->branch = end->branch - start->branch; + result->taken_branch = end->taken_branch - start->taken_branch; + result->rvc = end->rvc - start->rvc; + result->ld_ext = end->ld_ext - start->ld_ext; + result->st_ext = end->st_ext - start->st_ext; + result->ld_ext_cyc = end->ld_ext_cyc - start->ld_ext_cyc; + result->st_ext_cyc = end->st_ext_cyc - start->st_ext_cyc; + result->tcdm_cont = end->tcdm_cont - start->tcdm_cont; } #endif // __PERF_UTILS_H__ From 4fe41a8ba2a56e42f841889797c550db06ab3cce Mon Sep 17 00:00:00 2001 From: 
Run Wang Date: Mon, 13 Apr 2026 12:05:05 +0000 Subject: [PATCH 10/11] Add microbenchmark tutorial to docs --- docs/tutorials/microbenchmark.rst | 84 +++++++++++++++++++++++++++++++ docs/tutorials/overview.rst | 1 + 2 files changed, 85 insertions(+) create mode 100644 docs/tutorials/microbenchmark.rst diff --git a/docs/tutorials/microbenchmark.rst b/docs/tutorials/microbenchmark.rst new file mode 100644 index 0000000000..9f8d181b88 --- /dev/null +++ b/docs/tutorials/microbenchmark.rst @@ -0,0 +1,84 @@ +.. SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +.. +.. SPDX-License-Identifier: Apache-2.0 + +Per-Layer Microbenchmarking on PULPOpen +======================================= + +Deeploy can wrap each layer in the generated ``RunNetwork`` with PULP performance-counter instrumentation, producing per-layer reports of cycles, instructions, stalls, instruction-cache misses, branch behaviour, and external/TCDM memory traffic. This is intended for profiling individual layers of a deployed network on real hardware or in GVSoC, without modifying any kernel source. + +The instrumentation is **off by default** and adds zero overhead unless explicitly enabled. + +Enabling +-------- + +Pass ``--profileMicrobenchmark`` to any of the runner entry points: + +.. code-block:: bash + + python testMVP.py ... --profileMicrobenchmark + python generateNetwork.py ... --profileMicrobenchmark + python deeployRunner_siracusa.py -t Tests/Kernels/FP32/Add/Regular --profileMicrobenchmark + +The flag flows through :py:attr:`Deeploy.DeeployTypes.CodeGenVerbosity.microbenchmarkProfiling` +into the :py:class:`Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark.PULPMicrobenchmark` +code-transformation pass, which is registered at the outermost position of the PULPOpen +``ForkTransformer`` and ``ClusterTransformer`` chains. Because it runs last, the wrapped region +covers the full per-layer body, including all tiling, DMA, and memory-management code. 
+ +Output Format +------------- + +Each layer emits one block of statistics on ``core 0``: + +.. code-block:: text + + === Performance Statistics: Add_0 === + Cycles: 1442 + Instructions: 149 + IPC: 0.103 + + --- Instruction Mix --- + Loads: 24 (16.11%) + Stores: 27 (18.12%) + Branches: 5 (3.36%) + Taken Branches: 2 (40.00%) + Compressed (RVC): 0 (0.00%) + + --- Stalls & Hazards --- + Load Stalls: 0 + Jump Stalls: 0 + I-cache Misses: 724 + TCDM Contentions: 0 + + --- Memory Hierarchy --- + External Loads: 0 (0.00%) + External Stores: 0 (0.00%) + Ext Load Cycles: 0 (avg: 0.00) + Ext Store Cycles: 0 (avg: 0.00) + ======================================== + +Underlying Helpers +------------------ + +The C-side helpers live in ``TargetLibraries/PULPOpen/inc/perf_utils.h`` and are included by +default in PULPOpen builds via ``Platform.py``. The pass injects: + +- ``perf_bench_init()`` / ``perf_bench_start()`` / ``perf_bench_read(&start)`` before the layer body +- ``perf_bench_stop()`` / ``perf_bench_read(&end)`` / ``perf_bench_diff(&total, &end, &start)`` / + ``perf_bench_print("", &total)`` after it + +All counters listed in ``perf_stats_t`` are configured at once in ``pi_perf_conf``, so a single +wrap captures the full event set. + +Notes & Caveats +--------------- + +- **External memory counters** (``LD_EXT``, ``ST_EXT``, ``LD_EXT_CYC``, ``ST_EXT_CYC``) only show + non-zero values when the wrapped region performs L2/L3 traffic. Untiled tests that fit in L1/TCDM + will report zero. +- **TCDM contention** depends on the access pattern — regular, bank-friendly kernels (e.g. element-wise + Add) can legitimately report zero contention even with all 8 cores active. +- Some events may not be modelled by GVSoC; verify on a tiled test (e.g. Siracusa-tiled GEMM) before + concluding a counter is broken. +- Output is printed by ``core 0`` only to keep logs readable. 
diff --git a/docs/tutorials/overview.rst b/docs/tutorials/overview.rst index 0b3d97c761..c0a9660104 100644 --- a/docs/tutorials/overview.rst +++ b/docs/tutorials/overview.rst @@ -14,5 +14,6 @@ Each tutorial covers a specific topic and includes code examples to illustrate t introduction debugging + microbenchmark From d85ac5fc6325a1b516a20f8cd0997a7c5192af11 Mon Sep 17 00:00:00 2001 From: Run Wang Date: Mon, 13 Apr 2026 12:13:26 +0000 Subject: [PATCH 11/11] Trim microbenchmark tutorial --- docs/tutorials/microbenchmark.rst | 78 ++++--------------------------- 1 file changed, 9 insertions(+), 69 deletions(-) diff --git a/docs/tutorials/microbenchmark.rst b/docs/tutorials/microbenchmark.rst index 9f8d181b88..c005090020 100644 --- a/docs/tutorials/microbenchmark.rst +++ b/docs/tutorials/microbenchmark.rst @@ -2,34 +2,14 @@ .. .. SPDX-License-Identifier: Apache-2.0 -Per-Layer Microbenchmarking on PULPOpen -======================================= +Microbenchmark +============== -Deeploy can wrap each layer in the generated ``RunNetwork`` with PULP performance-counter instrumentation, producing per-layer reports of cycles, instructions, stalls, instruction-cache misses, branch behaviour, and external/TCDM memory traffic. This is intended for profiling individual layers of a deployed network on real hardware or in GVSoC, without modifying any kernel source. +Pass ``--profileMicrobenchmark`` to any PULPOpen runner (``testMVP.py``, ``generateNetwork.py``, ``deeployRunner_*.py``) to wrap each layer in ``RunNetwork`` with PULP performance counters. Off by default; zero overhead when unused. -The instrumentation is **off by default** and adds zero overhead unless explicitly enabled. 
+The flag flows through :py:attr:`Deeploy.DeeployTypes.CodeGenVerbosity.microbenchmarkProfiling` into :py:class:`Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark.PULPMicrobenchmark`, which is registered last in the PULPOpen ``ForkTransformer`` and ``ClusterTransformer`` chains so it covers the full per-layer body (tiling, DMA, memory management). The C-side helpers live in ``TargetLibraries/PULPOpen/inc/perf_utils.h``. -Enabling --------- - -Pass ``--profileMicrobenchmark`` to any of the runner entry points: - -.. code-block:: bash - - python testMVP.py ... --profileMicrobenchmark - python generateNetwork.py ... --profileMicrobenchmark - python deeployRunner_siracusa.py -t Tests/Kernels/FP32/Add/Regular --profileMicrobenchmark - -The flag flows through :py:attr:`Deeploy.DeeployTypes.CodeGenVerbosity.microbenchmarkProfiling` -into the :py:class:`Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark.PULPMicrobenchmark` -code-transformation pass, which is registered at the outermost position of the PULPOpen -``ForkTransformer`` and ``ClusterTransformer`` chains. Because it runs last, the wrapped region -covers the full per-layer body, including all tiling, DMA, and memory-management code. - -Output Format -------------- - -Each layer emits one block of statistics on ``core 0``: +Each layer prints one block on ``core 0``: .. 
code-block:: text @@ -37,48 +17,8 @@ Each layer emits one block of statistics on ``core 0``: Cycles: 1442 Instructions: 149 IPC: 0.103 + Loads / Stores / Branches / Taken Branches / RVC + Load Stalls / Jump Stalls / I-cache Misses / TCDM Contentions + External Loads / Stores and their cycle counts - --- Instruction Mix --- - Loads: 24 (16.11%) - Stores: 27 (18.12%) - Branches: 5 (3.36%) - Taken Branches: 2 (40.00%) - Compressed (RVC): 0 (0.00%) - - --- Stalls & Hazards --- - Load Stalls: 0 - Jump Stalls: 0 - I-cache Misses: 724 - TCDM Contentions: 0 - - --- Memory Hierarchy --- - External Loads: 0 (0.00%) - External Stores: 0 (0.00%) - Ext Load Cycles: 0 (avg: 0.00) - Ext Store Cycles: 0 (avg: 0.00) - ======================================== - -Underlying Helpers ------------------- - -The C-side helpers live in ``TargetLibraries/PULPOpen/inc/perf_utils.h`` and are included by -default in PULPOpen builds via ``Platform.py``. The pass injects: - -- ``perf_bench_init()`` / ``perf_bench_start()`` / ``perf_bench_read(&start)`` before the layer body -- ``perf_bench_stop()`` / ``perf_bench_read(&end)`` / ``perf_bench_diff(&total, &end, &start)`` / - ``perf_bench_print("", &total)`` after it - -All counters listed in ``perf_stats_t`` are configured at once in ``pi_perf_conf``, so a single -wrap captures the full event set. - -Notes & Caveats ---------------- - -- **External memory counters** (``LD_EXT``, ``ST_EXT``, ``LD_EXT_CYC``, ``ST_EXT_CYC``) only show - non-zero values when the wrapped region performs L2/L3 traffic. Untiled tests that fit in L1/TCDM - will report zero. -- **TCDM contention** depends on the access pattern — regular, bank-friendly kernels (e.g. element-wise - Add) can legitimately report zero contention even with all 8 cores active. -- Some events may not be modelled by GVSoC; verify on a tiled test (e.g. Siracusa-tiled GEMM) before - concluding a counter is broken. -- Output is printed by ``core 0`` only to keep logs readable. 
+External-memory counters (``LD_EXT``, ``ST_EXT``, ``LD_EXT_CYC``, ``ST_EXT_CYC``) read zero when the wrapped region generates no L2/L3 traffic, and the TCDM-contention counter reads zero when there are no bank conflicts (e.g. small untiled kernels that fit in L1). Some events may not be modelled by GVSoC — verify on a tiled test before assuming a counter is broken.