From a437c2e6b2b867ad1dc5c0bcc2b51902870f2246 Mon Sep 17 00:00:00 2001
From: Run Wang <52746141+SamanthaWangdl@users.noreply.github.com>
Date: Thu, 12 Feb 2026 15:27:58 +0000
Subject: [PATCH 01/11] Deeploy Microbenchmark with GVSoC CSR and Demo on GEMM

---
 TargetLibraries/PULPOpen/src/Gemm.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/TargetLibraries/PULPOpen/src/Gemm.c b/TargetLibraries/PULPOpen/src/Gemm.c
index a46f8ac6ae..02fd991674 100644
--- a/TargetLibraries/PULPOpen/src/Gemm.c
+++ b/TargetLibraries/PULPOpen/src/Gemm.c
@@ -6,6 +6,7 @@
 
 #include "DeeployPULPMath.h"
 #include "pmsis.h"
+// #include "perf_utils.h"
 
 void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
                                    const float32_t *__restrict__ pSrcB,
@@ -17,6 +18,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
   int8_t core_id = pi_core_id();
   int8_t log2Core = LOG2(NUM_CORES);
 
+  //RW: Performance monitoring is currently disabled 
+  // perf_stats_t perf_start, perf_end, perf_total;
+
+  // // Initialize and start performance counters (only core 0)
+  // if (core_id == 0) {
+  //   perf_bench_init();
+  //   perf_bench_start();
+  //   perf_bench_read(&perf_start);
+  // }
+
   uint32_t M_chunk = (M >> log2Core) + ((M & (NUM_CORES - 1)) != 0);
   uint32_t M_start = MIN(core_id * M_chunk, M);
   uint32_t M_end = MIN(M_start + M_chunk, M);
@@ -351,4 +362,16 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
       }
     }
   }
+
+  // RW: Stop performance counters and print results (only core 0)
+  // if (core_id == 0) {
+  //   perf_bench_stop();
+  //   perf_bench_read(&perf_end);
+  //   perf_bench_diff(&perf_total, &perf_end, &perf_start);
+
+  //   char label[100];
+  //   snprintf(label, sizeof(label), "GEMM M=%u N=%u O=%u transA=%u transB=%u",
+  //            M, N, O, transA, transB);
+  //   perf_bench_print(label, &perf_total);
+  // }
 }
\ No newline at end of file

From 0b61499cd7bdddc3c29f3bccf05897ada08f73e6 Mon Sep 17 00:00:00 2001
From: Run Wang <52746141+SamanthaWangdl@users.noreply.github.com>
Date: Sun, 15 Feb 2026 22:06:11 +0000
Subject: [PATCH 02/11] Add microbenchmark to codepass

---
 Deeploy/Targets/PULPOpen/Bindings.py          |   4 +-
 .../PULPClusterTiling.py                      |  39 ++++-
 Deeploy/Targets/PULPOpen/Platform.py          |   2 +-
 .../DoubleBufferingTilingCodeGeneration.py    |  39 ++++-
 .../SingleBufferingTilingCodeGeneration.py    |  40 ++++-
 .../TilingPrototypes.py                       |  99 +++++++++++
 TargetLibraries/PULPOpen/inc/perf_utils.h     | 158 ++++++++++++++++++
 TargetLibraries/PULPOpen/src/Gemm.c           |  23 ---
 8 files changed, 370 insertions(+), 34 deletions(-)
 create mode 100644 TargetLibraries/PULPOpen/inc/perf_utils.h

diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py
index 5d7b02ae62..8a5f92ea2c 100644
--- a/Deeploy/Targets/PULPOpen/Bindings.py
+++ b/Deeploy/Targets/PULPOpen/Bindings.py
@@ -103,7 +103,7 @@
     PULPSynchCoresPass(),
     ForkClosure(writeback = False, generateStruct = True),
     TilingVariableReplacementUpdate("L1"),
-    PULPClusterTiling("L2", "L1", MchanDma()),
+    PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True),  # Enable perf counters
     ArgumentStructGeneration(),
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
@@ -121,7 +121,7 @@
     TilingVariableReplacement("L1"),
     TilingCallClosure(writeback = False, generateStruct = True),
     TilingVariableReplacementUpdate("L1"),
-    PULPClusterTiling("L2", "L1", MchanDma()),
+    PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True),  # Enable perf counters
     ArgumentStructGeneration(),
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
index 3c0bba3107..59aec47a5d 100644
--- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
@@ -7,9 +7,9 @@
 from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity
 from Deeploy.TilingExtension.AsyncDma import AsyncDma
 from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \
-    DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn
+    DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn, ProfilingDoubleBufferingTilingMixIn
 from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \
-    ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration
+    PerfCounterSingleBufferingTilingMixIn, ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration
 
 
 class PULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration):
@@ -28,13 +28,38 @@ class ProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration
     pass
 
 
+class PerfCounterPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, PerfCounterSingleBufferingTilingMixIn):
+    """Single buffering with performance counter profiling"""
+    pass
+
+
+class PerfCounterPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn):
+    """Double buffering with performance counter profiling"""
+    pass
+
+
+class CombinedProfilingPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn, PerfCounterSingleBufferingTilingMixIn):
+    """Single buffering with both cycle profiling and performance counter profiling"""
+    pass
+
+
+class CombinedProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn, PerfCounterDoubleBufferingTilingMixIn):
+    """Double buffering with both cycle profiling and performance counter profiling"""
+    pass
+
+
 class PULPClusterTiling(CodeTransformationPass):
 
-    def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma):
+    def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, usePerfCounters: bool = False):
+        self.usePerfCounters = usePerfCounters
         self.SB = PULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
         self.profilingSB = ProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
+        self.perfCounterSB = PerfCounterPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
+        self.combinedProfilingSB = CombinedProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
         self.DB = PULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
         self.profilingDB = ProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
+        self.perfCounterDB = PerfCounterPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
+        self.combinedProfilingDB = CombinedProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
 
     def apply(self,
               ctxt: NetworkContext,
@@ -42,10 +67,16 @@ def apply(self,
               name: str,
               verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
 
-        if verbose.tilingProfiling:
+        if self.usePerfCounters and verbose.tilingProfiling:
+            # Use combined profiling: cycle measurements + performance counter stats
+            ctxt, executionBlock = self.combinedProfilingSB.apply(ctxt, executionBlock, name)
+            ctxt, executionBlock = self.combinedProfilingDB.apply(ctxt, executionBlock, name)
+        elif verbose.tilingProfiling:
+            # Use cycle profiling only (basic cycle measurements)
             ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name)
             ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name)
         else:
+            # No profiling
             ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name)
             ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name)
 
diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py
index 7456dd9e1b..11b9747526 100644
--- a/Deeploy/Targets/PULPOpen/Platform.py
+++ b/Deeploy/Targets/PULPOpen/Platform.py
@@ -248,7 +248,7 @@ class PULPStructBuffer(StructBuffer):
 
 # SCHEREMO: stdint is included before pulp_nn_kernels.h because it is supposed to be included in there, but isn't...
 _includeList = [
-    "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h"
+    "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h", "perf_utils.h"
 ]
 
 
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py
index ad9c6ad012..ce9ec86f27 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py
@@ -11,8 +11,8 @@
 from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, Future
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
-from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
-    PrototypeTilingMixIn, TilingMetaInfo
+from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
+    ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
 from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
 from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, stridesFromShape
 
@@ -364,3 +364,38 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
         executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements,
                                                   _egressDMAStatements, closeLoopStatements)
         return executionBlock
+
+class PerfCounterDoubleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
+    """
+    Double buffering tiling with performance counter profiling.
+    Provides detailed instruction-level statistics for each tile.
+    """
+
+    @classmethod
+    def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
+                                     setupStatements: List[CodeSnippet],
+                                     teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
+
+        executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
+                                                              teardownStatements)
+
+        # Inject performance counter initialization in setup (only once, not per-tile)
+        executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo)
+
+        # Inject performance counter stop and print in teardown (only once, not per-tile)
+        executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo)
+
+        return executionBlock
+
+    @classmethod
+    def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
+                         openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
+                         egressDMAStatements: List[CodeSnippet],
+                         closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
+
+        # Don't wrap kernel - perf counters measure the whole tiling loop, not individual tiles
+        # executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo)
+
+        executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
+                                                  egressDMAStatements, closeLoopStatements)
+        return executionBlock
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
index ea1e938b58..e4bb803611 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
@@ -10,8 +10,8 @@
 from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
-from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
-    PrototypeTilingMixIn, TilingMetaInfo
+from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
+    ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
 from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint
 from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme
 
@@ -191,3 +191,39 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
         executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements,
                                                   _egressDMAStatements, closeLoopStatements)
         return executionBlock
+
+
+class PerfCounterSingleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
+    """
+    Single buffering tiling with performance counter profiling.
+    Provides detailed instruction-level statistics for each tile.
+    """
+
+    @classmethod
+    def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
+                                     setupStatements: List[CodeSnippet],
+                                     teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
+
+        executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
+                                                              teardownStatements)
+
+        # Inject performance counter initialization in setup (only once, not per-tile)
+        executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo)
+
+        # Inject performance counter stop and print in teardown (only once, not per-tile)
+        executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo)
+
+        return executionBlock
+
+    @classmethod
+    def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
+                         openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
+                         egressDMAStatements: List[CodeSnippet],
+                         closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
+
+        # Don't wrap kernel - perf counters measure the whole tiling loop, not individual tiles
+        # executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo)
+
+        executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
+                                                  egressDMAStatements, closeLoopStatements)
+        return executionBlock
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
index 09a4ef56eb..70aabd9805 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
@@ -64,6 +64,105 @@ def generateAllTilingCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingM
         return executionBlock
 
 
+class PerfCounterProfilingMixIn(ABC):
+    """
+    MixIn for injecting performance counter profiling code.
+    Provides detailed instruction-level statistics using CSR performance counters.
+    """
+
+    _perfCounterInit = NodeTemplate("""
+    perf_stats_t ${nodeName}_perf_start, ${nodeName}_perf_end, ${nodeName}_perf_total;
+    if (pi_core_id() == 0) {
+        perf_bench_init();
+        perf_bench_start();
+        perf_bench_read(&${nodeName}_perf_start);
+    }
+    """)
+
+    _perfCounterStop = NodeTemplate("""
+    if (pi_core_id() == 0) {
+        perf_bench_stop();
+        perf_bench_read(&${nodeName}_perf_end);
+        perf_bench_diff(&${nodeName}_perf_total, &${nodeName}_perf_end, &${nodeName}_perf_start);
+        perf_bench_print("${nodeName}", &${nodeName}_perf_total);
+    }
+    """)
+
+    _perfCounterKernelStart = NodeTemplate("""
+    if (pi_core_id() == 0) {
+        perf_bench_start();
+        perf_bench_read(&${nodeName}_perf_kernel_start);
+    }
+    """)
+
+    _perfCounterKernelEnd = NodeTemplate("""
+    if (pi_core_id() == 0) {
+        perf_bench_stop();
+        perf_bench_read(&${nodeName}_perf_kernel_end);
+        perf_bench_diff(&${nodeName}_perf_kernel_total, &${nodeName}_perf_kernel_end, &${nodeName}_perf_kernel_start);
+        perf_bench_print("${nodeName} Kernel", &${nodeName}_perf_kernel_total);
+    }
+    """)
+
+    _perfCounterKernelDecl = NodeTemplate("""
+    perf_stats_t ${nodeName}_perf_kernel_start, ${nodeName}_perf_kernel_end, ${nodeName}_perf_kernel_total;
+    """)
+
+    @classmethod
+    def injectPerfCounterInit(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
+        """
+        Inject performance counter initialization at the beginning of the node execution.
+        This should be called in the setup phase.
+        """
+        nodeName = metaInfo.nodeName
+
+        executionBlock.addLeft(cls._perfCounterInit, {
+            "nodeName": nodeName,
+        })
+
+        return executionBlock
+
+    @classmethod
+    def injectPerfCounterStop(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
+        """
+        Inject performance counter stop and print at the end of the node execution.
+        This should be called in the teardown phase.
+        """
+        nodeName = metaInfo.nodeName
+
+        executionBlock.addRight(cls._perfCounterStop, {
+            "nodeName": nodeName,
+        })
+
+        return executionBlock
+
+    @classmethod
+    def injectPerfCounterKernelWrap(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
+        """
+        Wrap the kernel execution with performance counter measurements.
+        This provides detailed statistics for just the kernel computation (excluding DMA).
+        """
+        nodeName = metaInfo.nodeName
+
+        if metaInfo.kernelLevelTiling:
+            # Add declaration at the beginning
+            executionBlock.addLeft(cls._perfCounterKernelDecl, {
+                "nodeName": nodeName,
+            })
+
+            # Add start measurement before kernel
+            executionBlock.addLeft(cls._perfCounterKernelStart, {
+                "nodeName": nodeName,
+            })
+
+            # Add stop and print after kernel
+            executionBlock.addRight(cls._perfCounterKernelEnd, {
+                "nodeName": nodeName,
+            })
+
+        return executionBlock
+
+
 class ProfilingPrototypeMixIn(ABC):
     _measureCycles = NodeTemplate("""
     ${measurements}[${tileIdxVar}] = getCycles();
diff --git a/TargetLibraries/PULPOpen/inc/perf_utils.h b/TargetLibraries/PULPOpen/inc/perf_utils.h
new file mode 100644
index 0000000000..2d9fbc39c6
--- /dev/null
+++ b/TargetLibraries/PULPOpen/inc/perf_utils.h
@@ -0,0 +1,158 @@
+/*
+ * Performance Counter Utilities for PULP Benchmarking
+ */
+
+#ifndef __PERF_UTILS_H__
+#define __PERF_UTILS_H__
+
+#include "pmsis.h"
+
+// Performance event IDs (compatible with PMSIS)
+#define PI_PERF_CYCLES          CSR_PCER_CYCLES
+#define PI_PERF_INSTR           CSR_PCER_INSTR
+#define PI_PERF_LD_STALL        CSR_PCER_LD_STALL
+#define PI_PERF_JMP_STALL       CSR_PCER_JMP_STALL
+#define PI_PERF_IMISS           CSR_PCER_IMISS
+#define PI_PERF_LD              CSR_PCER_LD
+#define PI_PERF_ST              CSR_PCER_ST
+#define PI_PERF_JUMP            CSR_PCER_JUMP
+#define PI_PERF_BRANCH          CSR_PCER_BRANCH
+#define PI_PERF_TAKEN_BRANCH    CSR_PCER_TAKEN_BRANCH
+#define PI_PERF_RVC             CSR_PCER_RVC
+#define PI_PERF_LD_EXT          CSR_PCER_LD_EXT
+#define PI_PERF_ST_EXT          CSR_PCER_ST_EXT
+#define PI_PERF_LD_EXT_CYC      CSR_PCER_LD_EXT_CYC
+#define PI_PERF_ST_EXT_CYC      CSR_PCER_ST_EXT_CYC
+#define PI_PERF_TCDM_CONT       CSR_PCER_TCDM_CONT
+
+// Benchmark statistics structure
+typedef struct {
+    unsigned int cycles;
+    unsigned int instr;
+    unsigned int ld;
+    unsigned int st;
+    unsigned int ld_stall;
+    unsigned int jmp_stall;
+    unsigned int imiss;
+    unsigned int branch;
+    unsigned int taken_branch;
+    unsigned int rvc;
+    unsigned int ld_ext;
+    unsigned int st_ext;
+    unsigned int ld_ext_cyc;
+    unsigned int st_ext_cyc;
+    unsigned int tcdm_cont;
+} perf_stats_t;
+
+// Initialize performance counters for comprehensive benchmarking
+static inline void perf_bench_init() {
+    // Enable all performance counters
+    pi_perf_conf(
+        (1 << PI_PERF_CYCLES) |
+        (1 << PI_PERF_INSTR) |
+        (1 << PI_PERF_LD_STALL) |
+        (1 << PI_PERF_JMP_STALL) |
+        (1 << PI_PERF_IMISS) |
+        (1 << PI_PERF_LD) |
+        (1 << PI_PERF_ST) |
+        (1 << PI_PERF_JUMP) |
+        (1 << PI_PERF_BRANCH) |
+        (1 << PI_PERF_TAKEN_BRANCH) |
+        (1 << PI_PERF_RVC) |
+        (1 << PI_PERF_LD_EXT) |
+        (1 << PI_PERF_ST_EXT) |
+        (1 << PI_PERF_LD_EXT_CYC) |
+        (1 << PI_PERF_ST_EXT_CYC) |
+        (1 << PI_PERF_TCDM_CONT)
+    );
+}
+
+// Start performance monitoring
+static inline void perf_bench_start() {
+    pi_perf_reset();
+    pi_perf_start();
+}
+
+// Stop performance monitoring
+static inline void perf_bench_stop() {
+    pi_perf_stop();
+}
+
+// Read all performance counters into structure
+static inline void perf_bench_read(perf_stats_t *stats) {
+    stats->cycles = pi_perf_read(PI_PERF_CYCLES);
+    stats->instr = pi_perf_read(PI_PERF_INSTR);
+    stats->ld = pi_perf_read(PI_PERF_LD);
+    stats->st = pi_perf_read(PI_PERF_ST);
+    stats->ld_stall = pi_perf_read(PI_PERF_LD_STALL);
+    stats->jmp_stall = pi_perf_read(PI_PERF_JMP_STALL);
+    stats->imiss = pi_perf_read(PI_PERF_IMISS);
+    stats->branch = pi_perf_read(PI_PERF_BRANCH);
+    stats->taken_branch = pi_perf_read(PI_PERF_TAKEN_BRANCH);
+    stats->rvc = pi_perf_read(PI_PERF_RVC);
+    stats->ld_ext = pi_perf_read(PI_PERF_LD_EXT);
+    stats->st_ext = pi_perf_read(PI_PERF_ST_EXT);
+    stats->ld_ext_cyc = pi_perf_read(PI_PERF_LD_EXT_CYC);
+    stats->st_ext_cyc = pi_perf_read(PI_PERF_ST_EXT_CYC);
+    stats->tcdm_cont = pi_perf_read(PI_PERF_TCDM_CONT);
+}
+
+// Print performance statistics (core 0 only to avoid clutter)
+static inline void perf_bench_print(const char *label, perf_stats_t *stats) {
+    if (pi_core_id() == 0) {
+        printf("\n=== Performance Statistics: %s ===\n", label);
+        printf("Cycles:              %10u\n", stats->cycles);
+        printf("Instructions:        %10u\n", stats->instr);
+        printf("IPC:                 %10.3f\n",
+               stats->cycles > 0 ? (float)stats->instr / stats->cycles : 0.0f);
+        printf("\n--- Instruction Mix ---\n");
+        printf("Loads:               %10u (%.2f%%)\n", stats->ld,
+               stats->instr > 0 ? 100.0f * stats->ld / stats->instr : 0.0f);
+        printf("Stores:              %10u (%.2f%%)\n", stats->st,
+               stats->instr > 0 ? 100.0f * stats->st / stats->instr : 0.0f);
+        printf("Branches:            %10u (%.2f%%)\n", stats->branch,
+               stats->instr > 0 ? 100.0f * stats->branch / stats->instr : 0.0f);
+        printf("Taken Branches:      %10u (%.2f%%)\n", stats->taken_branch,
+               stats->branch > 0 ? 100.0f * stats->taken_branch / stats->branch : 0.0f);
+        printf("Compressed (RVC):    %10u (%.2f%%)\n", stats->rvc,
+               stats->instr > 0 ? 100.0f * stats->rvc / stats->instr : 0.0f);
+        printf("\n--- Stalls & Hazards ---\n");
+        printf("Load Stalls:         %10u\n", stats->ld_stall);
+        printf("Jump Stalls:         %10u\n", stats->jmp_stall);
+        printf("I-cache Misses:      %10u\n", stats->imiss);
+        printf("TCDM Contentions:    %10u\n", stats->tcdm_cont);
+        printf("\n--- Memory Hierarchy ---\n");
+        printf("External Loads:      %10u (%.2f%%)\n", stats->ld_ext,
+               stats->ld > 0 ? 100.0f * stats->ld_ext / stats->ld : 0.0f);
+        printf("External Stores:     %10u (%.2f%%)\n", stats->st_ext,
+               stats->st > 0 ? 100.0f * stats->st_ext / stats->st : 0.0f);
+        printf("Ext Load Cycles:     %10u (avg: %.2f)\n", stats->ld_ext_cyc,
+               stats->ld_ext > 0 ? (float)stats->ld_ext_cyc / stats->ld_ext : 0.0f);
+        printf("Ext Store Cycles:    %10u (avg: %.2f)\n", stats->st_ext_cyc,
+               stats->st_ext > 0 ? (float)stats->st_ext_cyc / stats->st_ext : 0.0f);
+        printf("========================================\n\n");
+    }
+}
+
+// Compute difference between two stats (for analyzing specific code sections)
+static inline void perf_bench_diff(perf_stats_t *result,
+                                    perf_stats_t *end,
+                                    perf_stats_t *start) {
+    result->cycles = end->cycles - start->cycles;
+    result->instr = end->instr - start->instr;
+    result->ld = end->ld - start->ld;
+    result->st = end->st - start->st;
+    result->ld_stall = end->ld_stall - start->ld_stall;
+    result->jmp_stall = end->jmp_stall - start->jmp_stall;
+    result->imiss = end->imiss - start->imiss;
+    result->branch = end->branch - start->branch;
+    result->taken_branch = end->taken_branch - start->taken_branch;
+    result->rvc = end->rvc - start->rvc;
+    result->ld_ext = end->ld_ext - start->ld_ext;
+    result->st_ext = end->st_ext - start->st_ext;
+    result->ld_ext_cyc = end->ld_ext_cyc - start->ld_ext_cyc;
+    result->st_ext_cyc = end->st_ext_cyc - start->st_ext_cyc;
+    result->tcdm_cont = end->tcdm_cont - start->tcdm_cont;
+}
+
+#endif // __PERF_UTILS_H__
diff --git a/TargetLibraries/PULPOpen/src/Gemm.c b/TargetLibraries/PULPOpen/src/Gemm.c
index 02fd991674..a46f8ac6ae 100644
--- a/TargetLibraries/PULPOpen/src/Gemm.c
+++ b/TargetLibraries/PULPOpen/src/Gemm.c
@@ -6,7 +6,6 @@
 
 #include "DeeployPULPMath.h"
 #include "pmsis.h"
-// #include "perf_utils.h"
 
 void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
                                    const float32_t *__restrict__ pSrcB,
@@ -18,16 +17,6 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
   int8_t core_id = pi_core_id();
   int8_t log2Core = LOG2(NUM_CORES);
 
-  //RW: Performance monitoring is currently disabled 
-  // perf_stats_t perf_start, perf_end, perf_total;
-
-  // // Initialize and start performance counters (only core 0)
-  // if (core_id == 0) {
-  //   perf_bench_init();
-  //   perf_bench_start();
-  //   perf_bench_read(&perf_start);
-  // }
-
   uint32_t M_chunk = (M >> log2Core) + ((M & (NUM_CORES - 1)) != 0);
   uint32_t M_start = MIN(core_id * M_chunk, M);
   uint32_t M_end = MIN(M_start + M_chunk, M);
@@ -362,16 +351,4 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
       }
     }
   }
-
-  // RW: Stop performance counters and print results (only core 0)
-  // if (core_id == 0) {
-  //   perf_bench_stop();
-  //   perf_bench_read(&perf_end);
-  //   perf_bench_diff(&perf_total, &perf_end, &perf_start);
-
-  //   char label[100];
-  //   snprintf(label, sizeof(label), "GEMM M=%u N=%u O=%u transA=%u transB=%u",
-  //            M, N, O, transA, transB);
-  //   perf_bench_print(label, &perf_total);
-  // }
 }
\ No newline at end of file

From e7dd555f881b38542135f3e477850ef230ecaa7c Mon Sep 17 00:00:00 2001
From: Run Wang <samanthawangdl@gmail.com>
Date: Mon, 13 Apr 2026 09:29:43 +0000
Subject: [PATCH 03/11] Update pro microbenchmark codetransformation

---
 Deeploy/DeeployTypes.py                       |  1 +
 Deeploy/Targets/PULPOpen/Bindings.py          |  7 +-
 .../PULPClusterTiling.py                      | 39 +-------
 .../PULPMicrobenchmark.py                     | 42 ++++++++
 .../DoubleBufferingTilingCodeGeneration.py    | 39 +-------
 .../SingleBufferingTilingCodeGeneration.py    | 38 +------
 .../TilingPrototypes.py                       | 99 -------------------
 DeeployTest/testMVP.py                        |  6 ++
 DeeployTest/testUtils/deeployRunner.py        |  5 +
 DeeployTest/testUtils/testRunner.py           |  5 +
 10 files changed, 72 insertions(+), 209 deletions(-)
 create mode 100644 Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py

diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py
index 797bd44c47..de5a66aae9 100644
--- a/Deeploy/DeeployTypes.py
+++ b/Deeploy/DeeployTypes.py
@@ -53,6 +53,7 @@ class CodeGenVerbosity:
 
     tilingProfiling: Optional[bool] = False  # Specifies if we should profile the tiling code
     untiledProfiling: Optional[bool] = None  #  Specifies if we should profile the untilied code
+    microbenchmarkProfiling: Optional[bool] = False  # Wrap each layer with PULP perf-counter microbenchmark
 
 
 _NoVerbosity = CodeGenVerbosity(None)
diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py
index 8a5f92ea2c..2c78978e23 100644
--- a/Deeploy/Targets/PULPOpen/Bindings.py
+++ b/Deeploy/Targets/PULPOpen/Bindings.py
@@ -24,6 +24,7 @@
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling
+from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark import PULPMicrobenchmark
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPProfileUntiled import PULPProfileUntiled
 from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture
 from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack
@@ -103,7 +104,7 @@
     PULPSynchCoresPass(),
     ForkClosure(writeback = False, generateStruct = True),
     TilingVariableReplacementUpdate("L1"),
-    PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True),  # Enable perf counters
+    PULPClusterTiling("L2", "L1", MchanDma()),
     ArgumentStructGeneration(),
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
@@ -115,13 +116,14 @@
     MemoryManagementGeneration("L2"),
     MemoryManagementGeneration("L3.*"),
     MemoryManagementGeneration(),
+    PULPMicrobenchmark(),
 ])
 
 ClusterTransformer = CodeTransformation([
     TilingVariableReplacement("L1"),
     TilingCallClosure(writeback = False, generateStruct = True),
     TilingVariableReplacementUpdate("L1"),
-    PULPClusterTiling("L2", "L1", MchanDma(), usePerfCounters=True),  # Enable perf counters
+    PULPClusterTiling("L2", "L1", MchanDma()),
     ArgumentStructGeneration(),
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
@@ -133,6 +135,7 @@
     MemoryManagementGeneration("L2"),
     MemoryManagementGeneration("L3.*"),
     MemoryManagementGeneration(),
+    PULPMicrobenchmark(),
 ])
 
 SimpleTransformer = CodeTransformation([
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
index 59aec47a5d..3c0bba3107 100644
--- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py
@@ -7,9 +7,9 @@
 from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity
 from Deeploy.TilingExtension.AsyncDma import AsyncDma
 from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \
-    DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn, ProfilingDoubleBufferingTilingMixIn
+    DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn
 from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \
-    PerfCounterSingleBufferingTilingMixIn, ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration
+    ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration
 
 
 class PULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration):
@@ -28,38 +28,13 @@ class ProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration
     pass
 
 
-class PerfCounterPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, PerfCounterSingleBufferingTilingMixIn):
-    """Single buffering with performance counter profiling"""
-    pass
-
-
-class PerfCounterPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, PerfCounterDoubleBufferingTilingMixIn):
-    """Double buffering with performance counter profiling"""
-    pass
-
-
-class CombinedProfilingPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn, PerfCounterSingleBufferingTilingMixIn):
-    """Single buffering with both cycle profiling and performance counter profiling"""
-    pass
-
-
-class CombinedProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn, PerfCounterDoubleBufferingTilingMixIn):
-    """Double buffering with both cycle profiling and performance counter profiling"""
-    pass
-
-
 class PULPClusterTiling(CodeTransformationPass):
 
-    def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, usePerfCounters: bool = False):
-        self.usePerfCounters = usePerfCounters
+    def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma):
         self.SB = PULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
         self.profilingSB = ProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
-        self.perfCounterSB = PerfCounterPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
-        self.combinedProfilingSB = CombinedProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma)
         self.DB = PULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
         self.profilingDB = ProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
-        self.perfCounterDB = PerfCounterPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
-        self.combinedProfilingDB = CombinedProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma)
 
     def apply(self,
               ctxt: NetworkContext,
@@ -67,16 +42,10 @@ def apply(self,
               name: str,
               verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
 
-        if self.usePerfCounters and verbose.tilingProfiling:
-            # Use combined profiling: cycle measurements + performance counter stats
-            ctxt, executionBlock = self.combinedProfilingSB.apply(ctxt, executionBlock, name)
-            ctxt, executionBlock = self.combinedProfilingDB.apply(ctxt, executionBlock, name)
-        elif verbose.tilingProfiling:
-            # Use cycle profiling only (basic cycle measurements)
+        if verbose.tilingProfiling:
             ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name)
             ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name)
         else:
-            # No profiling
             ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name)
             ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name)
 
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py
new file mode 100644
index 0000000000..bb35f32d47
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Tuple
+
+from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \
+    NodeTemplate, _NoVerbosity
+
+
+class PULPMicrobenchmark(CodeTransformationPass):
+    """Wrap an execution block with perf_bench_* start/stop code when microbenchmarkProfiling is enabled."""
+    _preTemplate = NodeTemplate("""
+    perf_stats_t ${op}_perf_start, ${op}_perf_end, ${op}_perf_total;
+    if (pi_core_id() == 0) {
+        perf_bench_init();
+        perf_bench_start();
+        perf_bench_read(&${op}_perf_start);
+    }
+    """)  # Added before the block: declares the ${op}_perf_* structs; core 0 calls perf_bench_init/start/read
+
+    _postTemplate = NodeTemplate("""
+    if (pi_core_id() == 0) {
+        perf_bench_stop();
+        perf_bench_read(&${op}_perf_end);
+        perf_bench_diff(&${op}_perf_total, &${op}_perf_end, &${op}_perf_start);
+        perf_bench_print("${op}", &${op}_perf_total);
+    }
+    """)  # Added after the block: core 0 calls perf_bench_stop/read/diff, then prints with "${op}" as label
+
+    def apply(self,
+              ctxt: NetworkContext,
+              executionBlock: ExecutionBlock,
+              name: str,
+              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
+        """Add the pre/post templates to executionBlock with ``op`` bound to ``name``; return both inputs."""
+        if not verbose.microbenchmarkProfiling:  # Flag unset: hand back ctxt and executionBlock untouched
+            return ctxt, executionBlock
+
+        executionBlock.addLeft(self._preTemplate, {"op": name})  # ${op} in the template is rendered as `name`
+        executionBlock.addRight(self._postTemplate, {"op": name})  # Same `name`, so both snippets share variables
+        return ctxt, executionBlock
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py
index ce9ec86f27..ad9c6ad012 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py
@@ -11,8 +11,8 @@
 from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, Future
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
-from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
-    ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
+from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
+    PrototypeTilingMixIn, TilingMetaInfo
 from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
 from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, stridesFromShape
 
@@ -364,38 +364,3 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
         executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements,
                                                   _egressDMAStatements, closeLoopStatements)
         return executionBlock
-
-class PerfCounterDoubleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
-    """
-    Double buffering tiling with performance counter profiling.
-    Provides detailed instruction-level statistics for each tile.
-    """
-
-    @classmethod
-    def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                                     setupStatements: List[CodeSnippet],
-                                     teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
-
-        executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
-                                                              teardownStatements)
-
-        # Inject performance counter initialization in setup (only once, not per-tile)
-        executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo)
-
-        # Inject performance counter stop and print in teardown (only once, not per-tile)
-        executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo)
-
-        return executionBlock
-
-    @classmethod
-    def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                         openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
-                         egressDMAStatements: List[CodeSnippet],
-                         closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
-
-        # Don't wrap kernel - perf counters measure the whole tiling loop, not individual tiles
-        # executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo)
-
-        executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
-                                                  egressDMAStatements, closeLoopStatements)
-        return executionBlock
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
index e4bb803611..d234776b57 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
@@ -10,8 +10,8 @@
 from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays
-from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PerfCounterProfilingMixIn, \
-    ProfilingPrototypeMixIn, PrototypeTilingMixIn, TilingMetaInfo
+from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \
+    PrototypeTilingMixIn, TilingMetaInfo
 from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint
 from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme
 
@@ -193,37 +193,3 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
         return executionBlock
 
 
-class PerfCounterSingleBufferingTilingMixIn(PrototypeTilingMixIn, PerfCounterProfilingMixIn):
-    """
-    Single buffering tiling with performance counter profiling.
-    Provides detailed instruction-level statistics for each tile.
-    """
-
-    @classmethod
-    def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                                     setupStatements: List[CodeSnippet],
-                                     teardownStatements: List[CodeSnippet]) -> ExecutionBlock:
-
-        executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements,
-                                                              teardownStatements)
-
-        # Inject performance counter initialization in setup (only once, not per-tile)
-        executionBlock = cls.injectPerfCounterInit(executionBlock, metaInfo)
-
-        # Inject performance counter stop and print in teardown (only once, not per-tile)
-        executionBlock = cls.injectPerfCounterStop(executionBlock, metaInfo)
-
-        return executionBlock
-
-    @classmethod
-    def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo,
-                         openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet],
-                         egressDMAStatements: List[CodeSnippet],
-                         closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock:
-
-        # Don't wrap kernel - perf counters measure the whole tiling loop, not individual tiles
-        # executionBlock = cls.injectPerfCounterKernelWrap(executionBlock, metaInfo)
-
-        executionBlock = super().generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements,
-                                                  egressDMAStatements, closeLoopStatements)
-        return executionBlock
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
index 70aabd9805..09a4ef56eb 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py
@@ -64,105 +64,6 @@ def generateAllTilingCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingM
         return executionBlock
 
 
-class PerfCounterProfilingMixIn(ABC):
-    """
-    MixIn for injecting performance counter profiling code.
-    Provides detailed instruction-level statistics using CSR performance counters.
-    """
-
-    _perfCounterInit = NodeTemplate("""
-    perf_stats_t ${nodeName}_perf_start, ${nodeName}_perf_end, ${nodeName}_perf_total;
-    if (pi_core_id() == 0) {
-        perf_bench_init();
-        perf_bench_start();
-        perf_bench_read(&${nodeName}_perf_start);
-    }
-    """)
-
-    _perfCounterStop = NodeTemplate("""
-    if (pi_core_id() == 0) {
-        perf_bench_stop();
-        perf_bench_read(&${nodeName}_perf_end);
-        perf_bench_diff(&${nodeName}_perf_total, &${nodeName}_perf_end, &${nodeName}_perf_start);
-        perf_bench_print("${nodeName}", &${nodeName}_perf_total);
-    }
-    """)
-
-    _perfCounterKernelStart = NodeTemplate("""
-    if (pi_core_id() == 0) {
-        perf_bench_start();
-        perf_bench_read(&${nodeName}_perf_kernel_start);
-    }
-    """)
-
-    _perfCounterKernelEnd = NodeTemplate("""
-    if (pi_core_id() == 0) {
-        perf_bench_stop();
-        perf_bench_read(&${nodeName}_perf_kernel_end);
-        perf_bench_diff(&${nodeName}_perf_kernel_total, &${nodeName}_perf_kernel_end, &${nodeName}_perf_kernel_start);
-        perf_bench_print("${nodeName} Kernel", &${nodeName}_perf_kernel_total);
-    }
-    """)
-
-    _perfCounterKernelDecl = NodeTemplate("""
-    perf_stats_t ${nodeName}_perf_kernel_start, ${nodeName}_perf_kernel_end, ${nodeName}_perf_kernel_total;
-    """)
-
-    @classmethod
-    def injectPerfCounterInit(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
-        """
-        Inject performance counter initialization at the beginning of the node execution.
-        This should be called in the setup phase.
-        """
-        nodeName = metaInfo.nodeName
-
-        executionBlock.addLeft(cls._perfCounterInit, {
-            "nodeName": nodeName,
-        })
-
-        return executionBlock
-
-    @classmethod
-    def injectPerfCounterStop(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
-        """
-        Inject performance counter stop and print at the end of the node execution.
-        This should be called in the teardown phase.
-        """
-        nodeName = metaInfo.nodeName
-
-        executionBlock.addRight(cls._perfCounterStop, {
-            "nodeName": nodeName,
-        })
-
-        return executionBlock
-
-    @classmethod
-    def injectPerfCounterKernelWrap(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock:
-        """
-        Wrap the kernel execution with performance counter measurements.
-        This provides detailed statistics for just the kernel computation (excluding DMA).
-        """
-        nodeName = metaInfo.nodeName
-
-        if metaInfo.kernelLevelTiling:
-            # Add declaration at the beginning
-            executionBlock.addLeft(cls._perfCounterKernelDecl, {
-                "nodeName": nodeName,
-            })
-
-            # Add start measurement before kernel
-            executionBlock.addLeft(cls._perfCounterKernelStart, {
-                "nodeName": nodeName,
-            })
-
-            # Add stop and print after kernel
-            executionBlock.addRight(cls._perfCounterKernelEnd, {
-                "nodeName": nodeName,
-            })
-
-        return executionBlock
-
-
 class ProfilingPrototypeMixIn(ABC):
     _measureCycles = NodeTemplate("""
     ${measurements}[${tileIdxVar}] = getCycles();
diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py
index 69e04343ff..15fb6a2ac5 100644
--- a/DeeployTest/testMVP.py
+++ b/DeeployTest/testMVP.py
@@ -203,6 +203,9 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
                             - min: Initalize all variables at their minimal value.
                         """)
     parser.add_argument('--profileTiling', action = "store_true")
+    parser.add_argument('--profileMicrobenchmark',
+                        action = "store_true",
+                        help = 'Wrap each layer with PULP perf-counter microbenchmark instrumentation')
     parser.add_argument('--plotMemAlloc',
                         action = 'store_true',
                         help = 'Turn on plotting of the memory allocation and save it in the deeployState folder\n')
@@ -224,6 +227,9 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
     if args.profileTiling:
         verbosityCfg.tilingProfiling = True
 
+    if args.profileMicrobenchmark:
+        verbosityCfg.microbenchmarkProfiling = True
+
     onnx_graph = onnx.load_model(f'{args.dir}/network.onnx')
     graph = gs.import_onnx(onnx_graph)
 
diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py
index a5a8d70ef3..3c03e5b4c3 100644
--- a/DeeployTest/testUtils/deeployRunner.py
+++ b/DeeployTest/testUtils/deeployRunner.py
@@ -143,6 +143,9 @@ def __init__(self,
                               action = "store_true",
                               help = 'Enable randomized memory scheduler\n')
             self.add_argument('--profileTiling', action = 'store_true', help = 'Enable tiling profiling\n')
+            self.add_argument('--profileMicrobenchmark',
+                              action = 'store_true',
+                              help = 'Wrap each layer with PULP perf-counter microbenchmark\n')
             self.add_argument('--memAllocStrategy',
                               metavar = '<strategy>',
                               dest = 'memAllocStrategy',
@@ -225,6 +228,8 @@ def create_config_from_args(args: argparse.Namespace,
             gen_args_list.append("--randomizedMemoryScheduler")
         if hasattr(args, 'profileTiling') and args.profileTiling:
             gen_args_list.append("--profileTiling")
+        if hasattr(args, 'profileMicrobenchmark') and args.profileMicrobenchmark:
+            gen_args_list.append("--profileMicrobenchmark")
         if hasattr(args, 'memAllocStrategy') and args.memAllocStrategy:
             gen_args_list.append(f"--memAllocStrategy={args.memAllocStrategy}")
         if hasattr(args, 'searchStrategy') and args.searchStrategy:
diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py
index 9578c2f26c..18a4f870ac 100644
--- a/DeeployTest/testUtils/testRunner.py
+++ b/DeeployTest/testUtils/testRunner.py
@@ -211,6 +211,9 @@ def __init__(self, tiling_arguments: bool, description = None):
                               action = "store_true",
                               help = 'Enable randomized memory scheduler\n')
             self.add_argument('--profileTiling', action = 'store_true', help = 'Enable tiling profiling\n')
+            self.add_argument('--profileMicrobenchmark',
+                              action = 'store_true',
+                              help = 'Wrap each layer with PULP perf-counter microbenchmark\n')
             self.add_argument('--memAllocStrategy',
                               metavar = 'memAllocStrategy',
                               dest = 'memAllocStrategy',
@@ -271,6 +274,8 @@ def generate_cmd_args(self) -> str:
                 command += " --randomizedMemoryScheduler"
             if self.args.profileTiling:
                 command += f" --profileTiling"
+            if self.args.profileMicrobenchmark:
+                command += f" --profileMicrobenchmark"
             if self.args.memAllocStrategy:
                 command += f" --memAllocStrategy={self.args.memAllocStrategy}"
             if self.args.plotMemAlloc:

From ec5df1bf66359561a3b4c255852e90b5170b474c Mon Sep 17 00:00:00 2001
From: Run Wang <samanthawangdl@gmail.com>
Date: Mon, 13 Apr 2026 10:57:38 +0000
Subject: [PATCH 04/11] Add helper function for profileMicrobenchmark

---
 DeeployTest/generateNetwork.py         |  6 ++++++
 DeeployTest/testMVP.py                 |  2 +-
 DeeployTest/testUtils/deeployRunner.py | 14 +++++++++-----
 DeeployTest/testUtils/pytestRunner.py  |  3 +++
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py
index f029be7361..0b25bc6bbe 100644
--- a/DeeployTest/generateNetwork.py
+++ b/DeeployTest/generateNetwork.py
@@ -141,6 +141,7 @@ def generateNetwork(args):
     verbosityCfg = _NoVerbosity
     if isinstance(platform, PULPPlatform):
         verbosityCfg.untiledProfiling = args.profileUntiled
+        verbosityCfg.microbenchmarkProfiling = args.profileMicrobenchmark
 
     # Parse graph and infer output levels and signedness
     _ = deployer.prepare(verbosityCfg)
@@ -172,6 +173,11 @@ def generateNetwork(args):
                         dest = 'profileUntiled',
                         default = False,
                         help = 'Profile Untiled for L2\n')
+    parser.add_argument('--profileMicrobenchmark',
+                        action = 'store_true',
+                        dest = 'profileMicrobenchmark',
+                        default = False,
+                        help = 'Wrap each layer with PULP perf-counter microbenchmark\n')
     parser.add_argument('--input-type-map',
                         nargs = '*',
                         default = [],
diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py
index 15fb6a2ac5..9678bc4e4f 100644
--- a/DeeployTest/testMVP.py
+++ b/DeeployTest/testMVP.py
@@ -202,7 +202,7 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
                             - max: Initalize all variables at their maximal value.
                             - min: Initalize all variables at their minimal value.
                         """)
-    parser.add_argument('--profileTiling', action = "store_true")
+    parser.add_argument('--profileTiling', action = "store_true", help = 'Enable tiling profiling')
     parser.add_argument('--profileMicrobenchmark',
                         action = "store_true",
                         help = 'Wrap each layer with PULP perf-counter microbenchmark instrumentation')
diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py
index 3c03e5b4c3..fbbd95703e 100644
--- a/DeeployTest/testUtils/deeployRunner.py
+++ b/DeeployTest/testUtils/deeployRunner.py
@@ -94,6 +94,12 @@ def __init__(self,
                           action = 'store_true',
                           default = False,
                           help = 'Enable untiled profiling (Siracusa only)\n')
+        self.add_argument('--profileMicrobenchmark',
+                          '--profile-microbenchmark',
+                          dest = 'profileMicrobenchmark',
+                          action = 'store_true',
+                          default = False,
+                          help = 'Wrap each layer with PULP perf-counter microbenchmark\n')
         self.add_argument('--toolchain',
                           metavar = '<LLVM|GCC>',
                           dest = 'toolchain',
@@ -143,9 +149,6 @@ def __init__(self,
                               action = "store_true",
                               help = 'Enable randomized memory scheduler\n')
             self.add_argument('--profileTiling', action = 'store_true', help = 'Enable tiling profiling\n')
-            self.add_argument('--profileMicrobenchmark',
-                              action = 'store_true',
-                              help = 'Wrap each layer with PULP perf-counter microbenchmark\n')
             self.add_argument('--memAllocStrategy',
                               metavar = '<strategy>',
                               dest = 'memAllocStrategy',
@@ -228,8 +231,6 @@ def create_config_from_args(args: argparse.Namespace,
             gen_args_list.append("--randomizedMemoryScheduler")
         if hasattr(args, 'profileTiling') and args.profileTiling:
             gen_args_list.append("--profileTiling")
-        if hasattr(args, 'profileMicrobenchmark') and args.profileMicrobenchmark:
-            gen_args_list.append("--profileMicrobenchmark")
         if hasattr(args, 'memAllocStrategy') and args.memAllocStrategy:
             gen_args_list.append(f"--memAllocStrategy={args.memAllocStrategy}")
         if hasattr(args, 'searchStrategy') and args.searchStrategy:
@@ -240,6 +241,9 @@ def create_config_from_args(args: argparse.Namespace,
     if not tiling and getattr(args, 'profileUntiled', False):
         gen_args_list.append("--profileUntiled")
 
+    if getattr(args, 'profileMicrobenchmark', False):
+        gen_args_list.append("--profileMicrobenchmark")
+
     config = DeeployTestConfig(
         test_name = test_name,
         test_dir = test_dir_abs,
diff --git a/DeeployTest/testUtils/pytestRunner.py b/DeeployTest/testUtils/pytestRunner.py
index 472d8cfed9..29119bba6f 100644
--- a/DeeployTest/testUtils/pytestRunner.py
+++ b/DeeployTest/testUtils/pytestRunner.py
@@ -45,6 +45,7 @@ def create_test_config(
     mem_alloc_strategy: str = "MiniMalloc",
     search_strategy: str = "random-max",
     profile_tiling: bool = False,
+    profile_microbenchmark: bool = False,
     plot_mem_alloc: bool = False,
     randomized_mem_scheduler: bool = False,
     profile_untiled: bool = False,
@@ -86,6 +87,8 @@ def create_test_config(
             gen_args_list.append(f"--searchStrategy={search_strategy}")
         if profile_tiling:
             gen_args_list.append("--profileTiling")
+        if profile_microbenchmark:
+            gen_args_list.append("--profileMicrobenchmark")
         if plot_mem_alloc:
             gen_args_list.append("--plotMemAlloc")
         if randomized_mem_scheduler:

From 3d1b6afbadd104cd997d1aa6ef39ad6db07c3db8 Mon Sep 17 00:00:00 2001
From: Run Wang <samanthawangdl@gmail.com>
Date: Mon, 13 Apr 2026 10:59:30 +0000
Subject: [PATCH 05/11] perf-util add pre-commit

---
 TargetLibraries/PULPOpen/inc/perf_utils.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/TargetLibraries/PULPOpen/inc/perf_utils.h b/TargetLibraries/PULPOpen/inc/perf_utils.h
index 2d9fbc39c6..dc0a78c5e2 100644
--- a/TargetLibraries/PULPOpen/inc/perf_utils.h
+++ b/TargetLibraries/PULPOpen/inc/perf_utils.h
@@ -1,4 +1,8 @@
 /*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
  * Performance Counter Utilities for PULP Benchmarking
  */
 

From a77284376faf9cbbcdafde2643822c0d81498748 Mon Sep 17 00:00:00 2001
From: Run Wang <samanthawangdl@gmail.com>
Date: Mon, 13 Apr 2026 11:06:22 +0000
Subject: [PATCH 06/11] Rebase singlebuffertilingcodegeneration

---
 .../SingleBufferingTilingCodeGeneration.py                      | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
index d234776b57..ea1e938b58 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py
@@ -191,5 +191,3 @@ def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaIn
         executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements,
                                                   _egressDMAStatements, closeLoopStatements)
         return executionBlock
-
-

From 4d03bf165847c1fb810bf79b621ad126af9cc1f5 Mon Sep 17 00:00:00 2001
From: Victor Jung <jungvi@iis.ee.ethz.ch>
Date: Wed, 8 Apr 2026 13:56:44 +0200
Subject: [PATCH 07/11] Make workspace safe to prevent "dubious ownership"
 sporadic issues

---
 .github/workflows/_runner-chimera.yml                 | 2 ++
 .github/workflows/_runner-cortexm.yml                 | 2 ++
 .github/workflows/_runner-gap9-tiled.yml              | 2 ++
 .github/workflows/_runner-gap9.yml                    | 2 ++
 .github/workflows/_runner-generic.yml                 | 2 ++
 .github/workflows/_runner-mempool.yml                 | 2 ++
 .github/workflows/_runner-siracusa-neureka-tiled.yml  | 2 ++
 .github/workflows/_runner-siracusa-tiled.yml          | 2 ++
 .github/workflows/_runner-siracusa.yml                | 2 ++
 .github/workflows/_runner-snitch-tiled-sequential.yml | 2 ++
 .github/workflows/_runner-snitch.yml                  | 2 ++
 .github/workflows/_runner-softhier.yml                | 2 ++
 .github/workflows/ci-deeploy.yml                      | 4 ++++
 .github/workflows/infra-generate-ccache-gap9.yml      | 2 ++
 .github/workflows/infra-generate-ccache.yml           | 2 ++
 15 files changed, 32 insertions(+)

diff --git a/.github/workflows/_runner-chimera.yml b/.github/workflows/_runner-chimera.yml
index 14e80631d1..c642bfe6d2 100644
--- a/.github/workflows/_runner-chimera.yml
+++ b/.github/workflows/_runner-chimera.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-cortexm.yml b/.github/workflows/_runner-cortexm.yml
index 3fbdf0ee16..c6be8af465 100644
--- a/.github/workflows/_runner-cortexm.yml
+++ b/.github/workflows/_runner-cortexm.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-gap9-tiled.yml b/.github/workflows/_runner-gap9-tiled.yml
index a5c8b3ac98..6934014447 100644
--- a/.github/workflows/_runner-gap9-tiled.yml
+++ b/.github/workflows/_runner-gap9-tiled.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-gap9.yml b/.github/workflows/_runner-gap9.yml
index e1d6e452a6..cc790d3d33 100644
--- a/.github/workflows/_runner-gap9.yml
+++ b/.github/workflows/_runner-gap9.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-generic.yml b/.github/workflows/_runner-generic.yml
index 6681cbac96..b44b47f73d 100644
--- a/.github/workflows/_runner-generic.yml
+++ b/.github/workflows/_runner-generic.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-mempool.yml b/.github/workflows/_runner-mempool.yml
index deb4809330..b2f0ae4f7a 100644
--- a/.github/workflows/_runner-mempool.yml
+++ b/.github/workflows/_runner-mempool.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-siracusa-neureka-tiled.yml b/.github/workflows/_runner-siracusa-neureka-tiled.yml
index b1f5f2fcb3..664d5f01be 100644
--- a/.github/workflows/_runner-siracusa-neureka-tiled.yml
+++ b/.github/workflows/_runner-siracusa-neureka-tiled.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml
index ea9c8989af..cc09f234e0 100644
--- a/.github/workflows/_runner-siracusa-tiled.yml
+++ b/.github/workflows/_runner-siracusa-tiled.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-siracusa.yml b/.github/workflows/_runner-siracusa.yml
index ea8fe5d405..1c51333f7a 100644
--- a/.github/workflows/_runner-siracusa.yml
+++ b/.github/workflows/_runner-siracusa.yml
@@ -25,6 +25,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-snitch-tiled-sequential.yml b/.github/workflows/_runner-snitch-tiled-sequential.yml
index fbd5195b08..bcdd58a166 100644
--- a/.github/workflows/_runner-snitch-tiled-sequential.yml
+++ b/.github/workflows/_runner-snitch-tiled-sequential.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-snitch.yml b/.github/workflows/_runner-snitch.yml
index bc599e4fe7..48130ea26a 100644
--- a/.github/workflows/_runner-snitch.yml
+++ b/.github/workflows/_runner-snitch.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-softhier.yml b/.github/workflows/_runner-softhier.yml
index b067664f40..2624cbe15d 100644
--- a/.github/workflows/_runner-softhier.yml
+++ b/.github/workflows/_runner-softhier.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/ci-deeploy.yml b/.github/workflows/ci-deeploy.yml
index fc468306b1..84f2779e4c 100644
--- a/.github/workflows/ci-deeploy.yml
+++ b/.github/workflows/ci-deeploy.yml
@@ -35,6 +35,8 @@ jobs:
     container:
       image: ${{ needs.select-env.outputs.image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
@@ -49,6 +51,8 @@ jobs:
     container:
       image: ${{ needs.select-env.outputs.image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/infra-generate-ccache-gap9.yml b/.github/workflows/infra-generate-ccache-gap9.yml
index f9010adb1d..5b456e1d64 100644
--- a/.github/workflows/infra-generate-ccache-gap9.yml
+++ b/.github/workflows/infra-generate-ccache-gap9.yml
@@ -23,6 +23,8 @@ jobs:
     container:
       image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/infra-generate-ccache.yml b/.github/workflows/infra-generate-ccache.yml
index bd6e2c7787..e2d54eaa83 100644
--- a/.github/workflows/infra-generate-ccache.yml
+++ b/.github/workflows/infra-generate-ccache.yml
@@ -22,6 +22,8 @@ jobs:
     container:
       image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy:devel' }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:

From c43fd5e76278adf172f697bda3150e3acf824075 Mon Sep 17 00:00:00 2001
From: Run Wang <samanthawangdl@gmail.com>
Date: Mon, 13 Apr 2026 11:10:24 +0000
Subject: [PATCH 08/11] Update changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 38c5b3ce35..42281c6f0a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 
 
 ### List of Pull Requests
+- Add Microbenchmarking Infrastructure and CI Using GVSoC CSR [#162](https://github.com/pulp-platform/Deeploy/pull/162)
 - Fix CI Cache Generation [#176](https://github.com/pulp-platform/Deeploy/pull/176)
 - Fix Broken CI [#175](https://github.com/pulp-platform/Deeploy/pull/175)
 - Improve Docstring and Debugging [#160](https://github.com/pulp-platform/Deeploy/pull/160)
@@ -23,6 +24,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - Shell Format pre-commit hook
 - Add integer MaxPool1D for Generic platform and RQSConv1D support for PULPOpen, with corresponding kernel tests.
 - Added GAP9 Platform Support: Deployer, Bindings, Templates, Tiler, DMA (L3Dma/MchanDma), target library, CI workflows
+- Per-layer microbenchmarking on PULPOpen via `--profileMicrobenchmark`: a new `PULPMicrobenchmark` code-transformation pass and `perf_utils.h` helpers that report cycles, instructions, stalls and instruction-cache misses for each layer in `RunNetwork`
 
 ### Changed
 - Use by default `devel` container for GAP9 CI

From 87d8115a7c246b89de4e6837724f1c870597d0b3 Mon Sep 17 00:00:00 2001
From: Run Wang <samanthawangdl@gmail.com>
Date: Mon, 13 Apr 2026 11:19:26 +0000
Subject: [PATCH 09/11] Fix linting

---
 Deeploy/Targets/PULPOpen/Platform.py      |   3 +-
 TargetLibraries/PULPOpen/inc/perf_utils.h | 229 ++++++++++------------
 2 files changed, 110 insertions(+), 122 deletions(-)

diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py
index 11b9747526..f13e6451fb 100644
--- a/Deeploy/Targets/PULPOpen/Platform.py
+++ b/Deeploy/Targets/PULPOpen/Platform.py
@@ -248,7 +248,8 @@ class PULPStructBuffer(StructBuffer):
 
 # SCHEREMO: stdint is included before pulp_nn_kernels.h because it is supposed to be included in there, but isn't...
 _includeList = [
-    "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h", "perf_utils.h"
+    "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h",
+    "perf_utils.h"
 ]
 
 
diff --git a/TargetLibraries/PULPOpen/inc/perf_utils.h b/TargetLibraries/PULPOpen/inc/perf_utils.h
index dc0a78c5e2..c710402ed2 100644
--- a/TargetLibraries/PULPOpen/inc/perf_utils.h
+++ b/TargetLibraries/PULPOpen/inc/perf_utils.h
@@ -12,151 +12,138 @@
 #include "pmsis.h"
 
 // Performance event IDs (compatible with PMSIS)
-#define PI_PERF_CYCLES          CSR_PCER_CYCLES
-#define PI_PERF_INSTR           CSR_PCER_INSTR
-#define PI_PERF_LD_STALL        CSR_PCER_LD_STALL
-#define PI_PERF_JMP_STALL       CSR_PCER_JMP_STALL
-#define PI_PERF_IMISS           CSR_PCER_IMISS
-#define PI_PERF_LD              CSR_PCER_LD
-#define PI_PERF_ST              CSR_PCER_ST
-#define PI_PERF_JUMP            CSR_PCER_JUMP
-#define PI_PERF_BRANCH          CSR_PCER_BRANCH
-#define PI_PERF_TAKEN_BRANCH    CSR_PCER_TAKEN_BRANCH
-#define PI_PERF_RVC             CSR_PCER_RVC
-#define PI_PERF_LD_EXT          CSR_PCER_LD_EXT
-#define PI_PERF_ST_EXT          CSR_PCER_ST_EXT
-#define PI_PERF_LD_EXT_CYC      CSR_PCER_LD_EXT_CYC
-#define PI_PERF_ST_EXT_CYC      CSR_PCER_ST_EXT_CYC
-#define PI_PERF_TCDM_CONT       CSR_PCER_TCDM_CONT
+#define PI_PERF_CYCLES CSR_PCER_CYCLES
+#define PI_PERF_INSTR CSR_PCER_INSTR
+#define PI_PERF_LD_STALL CSR_PCER_LD_STALL
+#define PI_PERF_JMP_STALL CSR_PCER_JMP_STALL
+#define PI_PERF_IMISS CSR_PCER_IMISS
+#define PI_PERF_LD CSR_PCER_LD
+#define PI_PERF_ST CSR_PCER_ST
+#define PI_PERF_JUMP CSR_PCER_JUMP
+#define PI_PERF_BRANCH CSR_PCER_BRANCH
+#define PI_PERF_TAKEN_BRANCH CSR_PCER_TAKEN_BRANCH
+#define PI_PERF_RVC CSR_PCER_RVC
+#define PI_PERF_LD_EXT CSR_PCER_LD_EXT
+#define PI_PERF_ST_EXT CSR_PCER_ST_EXT
+#define PI_PERF_LD_EXT_CYC CSR_PCER_LD_EXT_CYC
+#define PI_PERF_ST_EXT_CYC CSR_PCER_ST_EXT_CYC
+#define PI_PERF_TCDM_CONT CSR_PCER_TCDM_CONT
 
 // Benchmark statistics structure
 typedef struct {
-    unsigned int cycles;
-    unsigned int instr;
-    unsigned int ld;
-    unsigned int st;
-    unsigned int ld_stall;
-    unsigned int jmp_stall;
-    unsigned int imiss;
-    unsigned int branch;
-    unsigned int taken_branch;
-    unsigned int rvc;
-    unsigned int ld_ext;
-    unsigned int st_ext;
-    unsigned int ld_ext_cyc;
-    unsigned int st_ext_cyc;
-    unsigned int tcdm_cont;
+  unsigned int cycles;
+  unsigned int instr;
+  unsigned int ld;
+  unsigned int st;
+  unsigned int ld_stall;
+  unsigned int jmp_stall;
+  unsigned int imiss;
+  unsigned int branch;
+  unsigned int taken_branch;
+  unsigned int rvc;
+  unsigned int ld_ext;
+  unsigned int st_ext;
+  unsigned int ld_ext_cyc;
+  unsigned int st_ext_cyc;
+  unsigned int tcdm_cont;
 } perf_stats_t;
 
 // Initialize performance counters for comprehensive benchmarking
 static inline void perf_bench_init() {
-    // Enable all performance counters
-    pi_perf_conf(
-        (1 << PI_PERF_CYCLES) |
-        (1 << PI_PERF_INSTR) |
-        (1 << PI_PERF_LD_STALL) |
-        (1 << PI_PERF_JMP_STALL) |
-        (1 << PI_PERF_IMISS) |
-        (1 << PI_PERF_LD) |
-        (1 << PI_PERF_ST) |
-        (1 << PI_PERF_JUMP) |
-        (1 << PI_PERF_BRANCH) |
-        (1 << PI_PERF_TAKEN_BRANCH) |
-        (1 << PI_PERF_RVC) |
-        (1 << PI_PERF_LD_EXT) |
-        (1 << PI_PERF_ST_EXT) |
-        (1 << PI_PERF_LD_EXT_CYC) |
-        (1 << PI_PERF_ST_EXT_CYC) |
-        (1 << PI_PERF_TCDM_CONT)
-    );
+  // Enable all performance counters
+  pi_perf_conf(
+      (1 << PI_PERF_CYCLES) | (1 << PI_PERF_INSTR) | (1 << PI_PERF_LD_STALL) |
+      (1 << PI_PERF_JMP_STALL) | (1 << PI_PERF_IMISS) | (1 << PI_PERF_LD) |
+      (1 << PI_PERF_ST) | (1 << PI_PERF_JUMP) | (1 << PI_PERF_BRANCH) |
+      (1 << PI_PERF_TAKEN_BRANCH) | (1 << PI_PERF_RVC) | (1 << PI_PERF_LD_EXT) |
+      (1 << PI_PERF_ST_EXT) | (1 << PI_PERF_LD_EXT_CYC) |
+      (1 << PI_PERF_ST_EXT_CYC) | (1 << PI_PERF_TCDM_CONT));
 }
 
 // Start performance monitoring
 static inline void perf_bench_start() {
-    pi_perf_reset();
-    pi_perf_start();
+  pi_perf_reset();
+  pi_perf_start();
 }
 
 // Stop performance monitoring
-static inline void perf_bench_stop() {
-    pi_perf_stop();
-}
+static inline void perf_bench_stop() { pi_perf_stop(); }
 
 // Read all performance counters into structure
 static inline void perf_bench_read(perf_stats_t *stats) {
-    stats->cycles = pi_perf_read(PI_PERF_CYCLES);
-    stats->instr = pi_perf_read(PI_PERF_INSTR);
-    stats->ld = pi_perf_read(PI_PERF_LD);
-    stats->st = pi_perf_read(PI_PERF_ST);
-    stats->ld_stall = pi_perf_read(PI_PERF_LD_STALL);
-    stats->jmp_stall = pi_perf_read(PI_PERF_JMP_STALL);
-    stats->imiss = pi_perf_read(PI_PERF_IMISS);
-    stats->branch = pi_perf_read(PI_PERF_BRANCH);
-    stats->taken_branch = pi_perf_read(PI_PERF_TAKEN_BRANCH);
-    stats->rvc = pi_perf_read(PI_PERF_RVC);
-    stats->ld_ext = pi_perf_read(PI_PERF_LD_EXT);
-    stats->st_ext = pi_perf_read(PI_PERF_ST_EXT);
-    stats->ld_ext_cyc = pi_perf_read(PI_PERF_LD_EXT_CYC);
-    stats->st_ext_cyc = pi_perf_read(PI_PERF_ST_EXT_CYC);
-    stats->tcdm_cont = pi_perf_read(PI_PERF_TCDM_CONT);
+  stats->cycles = pi_perf_read(PI_PERF_CYCLES);
+  stats->instr = pi_perf_read(PI_PERF_INSTR);
+  stats->ld = pi_perf_read(PI_PERF_LD);
+  stats->st = pi_perf_read(PI_PERF_ST);
+  stats->ld_stall = pi_perf_read(PI_PERF_LD_STALL);
+  stats->jmp_stall = pi_perf_read(PI_PERF_JMP_STALL);
+  stats->imiss = pi_perf_read(PI_PERF_IMISS);
+  stats->branch = pi_perf_read(PI_PERF_BRANCH);
+  stats->taken_branch = pi_perf_read(PI_PERF_TAKEN_BRANCH);
+  stats->rvc = pi_perf_read(PI_PERF_RVC);
+  stats->ld_ext = pi_perf_read(PI_PERF_LD_EXT);
+  stats->st_ext = pi_perf_read(PI_PERF_ST_EXT);
+  stats->ld_ext_cyc = pi_perf_read(PI_PERF_LD_EXT_CYC);
+  stats->st_ext_cyc = pi_perf_read(PI_PERF_ST_EXT_CYC);
+  stats->tcdm_cont = pi_perf_read(PI_PERF_TCDM_CONT);
 }
 
 // Print performance statistics (core 0 only to avoid clutter)
 static inline void perf_bench_print(const char *label, perf_stats_t *stats) {
-    if (pi_core_id() == 0) {
-        printf("\n=== Performance Statistics: %s ===\n", label);
-        printf("Cycles:              %10u\n", stats->cycles);
-        printf("Instructions:        %10u\n", stats->instr);
-        printf("IPC:                 %10.3f\n",
-               stats->cycles > 0 ? (float)stats->instr / stats->cycles : 0.0f);
-        printf("\n--- Instruction Mix ---\n");
-        printf("Loads:               %10u (%.2f%%)\n", stats->ld,
-               stats->instr > 0 ? 100.0f * stats->ld / stats->instr : 0.0f);
-        printf("Stores:              %10u (%.2f%%)\n", stats->st,
-               stats->instr > 0 ? 100.0f * stats->st / stats->instr : 0.0f);
-        printf("Branches:            %10u (%.2f%%)\n", stats->branch,
-               stats->instr > 0 ? 100.0f * stats->branch / stats->instr : 0.0f);
-        printf("Taken Branches:      %10u (%.2f%%)\n", stats->taken_branch,
-               stats->branch > 0 ? 100.0f * stats->taken_branch / stats->branch : 0.0f);
-        printf("Compressed (RVC):    %10u (%.2f%%)\n", stats->rvc,
-               stats->instr > 0 ? 100.0f * stats->rvc / stats->instr : 0.0f);
-        printf("\n--- Stalls & Hazards ---\n");
-        printf("Load Stalls:         %10u\n", stats->ld_stall);
-        printf("Jump Stalls:         %10u\n", stats->jmp_stall);
-        printf("I-cache Misses:      %10u\n", stats->imiss);
-        printf("TCDM Contentions:    %10u\n", stats->tcdm_cont);
-        printf("\n--- Memory Hierarchy ---\n");
-        printf("External Loads:      %10u (%.2f%%)\n", stats->ld_ext,
-               stats->ld > 0 ? 100.0f * stats->ld_ext / stats->ld : 0.0f);
-        printf("External Stores:     %10u (%.2f%%)\n", stats->st_ext,
-               stats->st > 0 ? 100.0f * stats->st_ext / stats->st : 0.0f);
-        printf("Ext Load Cycles:     %10u (avg: %.2f)\n", stats->ld_ext_cyc,
-               stats->ld_ext > 0 ? (float)stats->ld_ext_cyc / stats->ld_ext : 0.0f);
-        printf("Ext Store Cycles:    %10u (avg: %.2f)\n", stats->st_ext_cyc,
-               stats->st_ext > 0 ? (float)stats->st_ext_cyc / stats->st_ext : 0.0f);
-        printf("========================================\n\n");
-    }
+  if (pi_core_id() == 0) {
+    printf("\n=== Performance Statistics: %s ===\n", label);
+    printf("Cycles:              %10u\n", stats->cycles);
+    printf("Instructions:        %10u\n", stats->instr);
+    printf("IPC:                 %10.3f\n",
+           stats->cycles > 0 ? (float)stats->instr / stats->cycles : 0.0f);
+    printf("\n--- Instruction Mix ---\n");
+    printf("Loads:               %10u (%.2f%%)\n", stats->ld,
+           stats->instr > 0 ? 100.0f * stats->ld / stats->instr : 0.0f);
+    printf("Stores:              %10u (%.2f%%)\n", stats->st,
+           stats->instr > 0 ? 100.0f * stats->st / stats->instr : 0.0f);
+    printf("Branches:            %10u (%.2f%%)\n", stats->branch,
+           stats->instr > 0 ? 100.0f * stats->branch / stats->instr : 0.0f);
+    printf("Taken Branches:      %10u (%.2f%%)\n", stats->taken_branch,
+           stats->branch > 0 ? 100.0f * stats->taken_branch / stats->branch
+                             : 0.0f);
+    printf("Compressed (RVC):    %10u (%.2f%%)\n", stats->rvc,
+           stats->instr > 0 ? 100.0f * stats->rvc / stats->instr : 0.0f);
+    printf("\n--- Stalls & Hazards ---\n");
+    printf("Load Stalls:         %10u\n", stats->ld_stall);
+    printf("Jump Stalls:         %10u\n", stats->jmp_stall);
+    printf("I-cache Misses:      %10u\n", stats->imiss);
+    printf("TCDM Contentions:    %10u\n", stats->tcdm_cont);
+    printf("\n--- Memory Hierarchy ---\n");
+    printf("External Loads:      %10u (%.2f%%)\n", stats->ld_ext,
+           stats->ld > 0 ? 100.0f * stats->ld_ext / stats->ld : 0.0f);
+    printf("External Stores:     %10u (%.2f%%)\n", stats->st_ext,
+           stats->st > 0 ? 100.0f * stats->st_ext / stats->st : 0.0f);
+    printf("Ext Load Cycles:     %10u (avg: %.2f)\n", stats->ld_ext_cyc,
+           stats->ld_ext > 0 ? (float)stats->ld_ext_cyc / stats->ld_ext : 0.0f);
+    printf("Ext Store Cycles:    %10u (avg: %.2f)\n", stats->st_ext_cyc,
+           stats->st_ext > 0 ? (float)stats->st_ext_cyc / stats->st_ext : 0.0f);
+    printf("========================================\n\n");
+  }
 }
 
 // Compute difference between two stats (for analyzing specific code sections)
-static inline void perf_bench_diff(perf_stats_t *result,
-                                    perf_stats_t *end,
-                                    perf_stats_t *start) {
-    result->cycles = end->cycles - start->cycles;
-    result->instr = end->instr - start->instr;
-    result->ld = end->ld - start->ld;
-    result->st = end->st - start->st;
-    result->ld_stall = end->ld_stall - start->ld_stall;
-    result->jmp_stall = end->jmp_stall - start->jmp_stall;
-    result->imiss = end->imiss - start->imiss;
-    result->branch = end->branch - start->branch;
-    result->taken_branch = end->taken_branch - start->taken_branch;
-    result->rvc = end->rvc - start->rvc;
-    result->ld_ext = end->ld_ext - start->ld_ext;
-    result->st_ext = end->st_ext - start->st_ext;
-    result->ld_ext_cyc = end->ld_ext_cyc - start->ld_ext_cyc;
-    result->st_ext_cyc = end->st_ext_cyc - start->st_ext_cyc;
-    result->tcdm_cont = end->tcdm_cont - start->tcdm_cont;
+static inline void perf_bench_diff(perf_stats_t *result, perf_stats_t *end,
+                                   perf_stats_t *start) {
+  result->cycles = end->cycles - start->cycles;
+  result->instr = end->instr - start->instr;
+  result->ld = end->ld - start->ld;
+  result->st = end->st - start->st;
+  result->ld_stall = end->ld_stall - start->ld_stall;
+  result->jmp_stall = end->jmp_stall - start->jmp_stall;
+  result->imiss = end->imiss - start->imiss;
+  result->branch = end->branch - start->branch;
+  result->taken_branch = end->taken_branch - start->taken_branch;
+  result->rvc = end->rvc - start->rvc;
+  result->ld_ext = end->ld_ext - start->ld_ext;
+  result->st_ext = end->st_ext - start->st_ext;
+  result->ld_ext_cyc = end->ld_ext_cyc - start->ld_ext_cyc;
+  result->st_ext_cyc = end->st_ext_cyc - start->st_ext_cyc;
+  result->tcdm_cont = end->tcdm_cont - start->tcdm_cont;
 }
 
 #endif // __PERF_UTILS_H__

From 4fe41a8ba2a56e42f841889797c550db06ab3cce Mon Sep 17 00:00:00 2001
From: Run Wang <samanthawangdl@gmail.com>
Date: Mon, 13 Apr 2026 12:05:05 +0000
Subject: [PATCH 10/11] Add microbenchmark tutorial to docs

---
 docs/tutorials/microbenchmark.rst | 84 +++++++++++++++++++++++++++++++
 docs/tutorials/overview.rst       |  1 +
 2 files changed, 85 insertions(+)
 create mode 100644 docs/tutorials/microbenchmark.rst

diff --git a/docs/tutorials/microbenchmark.rst b/docs/tutorials/microbenchmark.rst
new file mode 100644
index 0000000000..9f8d181b88
--- /dev/null
+++ b/docs/tutorials/microbenchmark.rst
@@ -0,0 +1,84 @@
+.. SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+..
+.. SPDX-License-Identifier: Apache-2.0
+
+Per-Layer Microbenchmarking on PULPOpen
+=======================================
+
+Deeploy can wrap each layer in the generated ``RunNetwork`` with PULP performance-counter instrumentation, producing per-layer reports of cycles, instructions, stalls, instruction-cache misses, branch behaviour, and external/TCDM memory traffic. This is intended for profiling individual layers of a deployed network on real hardware or in GVSoC, without modifying any kernel source.
+
+The instrumentation is **off by default** and adds zero overhead unless explicitly enabled.
+
+Enabling
+--------
+
+Pass ``--profileMicrobenchmark`` to any of the runner entry points:
+
+.. code-block:: bash
+
+    python testMVP.py        ... --profileMicrobenchmark
+    python generateNetwork.py ... --profileMicrobenchmark
+    python deeployRunner_siracusa.py -t Tests/Kernels/FP32/Add/Regular --profileMicrobenchmark
+
+The flag flows through :py:attr:`Deeploy.DeeployTypes.CodeGenVerbosity.microbenchmarkProfiling`
+into the :py:class:`Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark.PULPMicrobenchmark`
+code-transformation pass, which is registered at the outermost position of the PULPOpen
+``ForkTransformer`` and ``ClusterTransformer`` chains. Because it runs last, the wrapped region
+covers the full per-layer body, including all tiling, DMA, and memory-management code.
+
+Output Format
+-------------
+
+Each layer emits one block of statistics on ``core 0``:
+
+.. code-block:: text
+
+    === Performance Statistics: Add_0 ===
+    Cycles:                    1442
+    Instructions:               149
+    IPC:                      0.103
+
+    --- Instruction Mix ---
+    Loads:                       24 (16.11%)
+    Stores:                      27 (18.12%)
+    Branches:                     5 (3.36%)
+    Taken Branches:               2 (40.00%)
+    Compressed (RVC):             0 (0.00%)
+
+    --- Stalls & Hazards ---
+    Load Stalls:                  0
+    Jump Stalls:                  0
+    I-cache Misses:             724
+    TCDM Contentions:             0
+
+    --- Memory Hierarchy ---
+    External Loads:               0 (0.00%)
+    External Stores:              0 (0.00%)
+    Ext Load Cycles:              0 (avg: 0.00)
+    Ext Store Cycles:             0 (avg: 0.00)
+    ========================================
+
+Underlying Helpers
+------------------
+
+The C-side helpers live in ``TargetLibraries/PULPOpen/inc/perf_utils.h`` and are included by
+default in PULPOpen builds via ``Platform.py``. The pass injects:
+
+- ``perf_bench_init()`` / ``perf_bench_start()`` / ``perf_bench_read(&start)`` before the layer body
+- ``perf_bench_stop()`` / ``perf_bench_read(&end)`` / ``perf_bench_diff(&total, &end, &start)`` /
+  ``perf_bench_print("<layer>", &total)`` after it
+
+All counters listed in ``perf_stats_t`` are configured at once in ``pi_perf_conf``, so a single
+wrap captures the full event set.
+
+Notes & Caveats
+---------------
+
+- **External memory counters** (``LD_EXT``, ``ST_EXT``, ``LD_EXT_CYC``, ``ST_EXT_CYC``) only show
+  non-zero values when the wrapped region performs L2/L3 traffic. Untiled tests that fit in L1/TCDM
+  will report zero.
+- **TCDM contention** depends on the access pattern — regular, bank-friendly kernels (e.g. element-wise
+  Add) can legitimately report zero contention even with all 8 cores active.
+- Some events may not be modelled by GVSoC; verify on a tiled test (e.g. Siracusa-tiled GEMM) before
+  concluding a counter is broken.
+- Output is printed by ``core 0`` only to keep logs readable.
diff --git a/docs/tutorials/overview.rst b/docs/tutorials/overview.rst
index 0b3d97c761..c0a9660104 100644
--- a/docs/tutorials/overview.rst
+++ b/docs/tutorials/overview.rst
@@ -14,5 +14,6 @@ Each tutorial covers a specific topic and includes code examples to illustrate t
 
    introduction
    debugging
+   microbenchmark
 
 

From d85ac5fc6325a1b516a20f8cd0997a7c5192af11 Mon Sep 17 00:00:00 2001
From: Run Wang <samanthawangdl@gmail.com>
Date: Mon, 13 Apr 2026 12:13:26 +0000
Subject: [PATCH 11/11] Trim microbenchmark tutorial

---
 docs/tutorials/microbenchmark.rst | 78 ++++---------------------------
 1 file changed, 9 insertions(+), 69 deletions(-)

diff --git a/docs/tutorials/microbenchmark.rst b/docs/tutorials/microbenchmark.rst
index 9f8d181b88..c005090020 100644
--- a/docs/tutorials/microbenchmark.rst
+++ b/docs/tutorials/microbenchmark.rst
@@ -2,34 +2,14 @@
 ..
 .. SPDX-License-Identifier: Apache-2.0
 
-Per-Layer Microbenchmarking on PULPOpen
-=======================================
+Microbenchmark
+==============
 
-Deeploy can wrap each layer in the generated ``RunNetwork`` with PULP performance-counter instrumentation, producing per-layer reports of cycles, instructions, stalls, instruction-cache misses, branch behaviour, and external/TCDM memory traffic. This is intended for profiling individual layers of a deployed network on real hardware or in GVSoC, without modifying any kernel source.
+Pass ``--profileMicrobenchmark`` to any PULPOpen runner (``testMVP.py``, ``generateNetwork.py``, ``deeployRunner_*.py``) to wrap each layer in ``RunNetwork`` with PULP performance counters. Profiling is off by default and adds no overhead unless the flag is passed.
 
-The instrumentation is **off by default** and adds zero overhead unless explicitly enabled.
+The flag flows through :py:attr:`Deeploy.DeeployTypes.CodeGenVerbosity.microbenchmarkProfiling` into :py:class:`Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark.PULPMicrobenchmark`, which is registered last in the PULPOpen ``ForkTransformer`` and ``ClusterTransformer`` chains so it covers the full per-layer body (tiling, DMA, memory management). The C-side helpers live in ``TargetLibraries/PULPOpen/inc/perf_utils.h``.
 
-Enabling
---------
-
-Pass ``--profileMicrobenchmark`` to any of the runner entry points:
-
-.. code-block:: bash
-
-    python testMVP.py        ... --profileMicrobenchmark
-    python generateNetwork.py ... --profileMicrobenchmark
-    python deeployRunner_siracusa.py -t Tests/Kernels/FP32/Add/Regular --profileMicrobenchmark
-
-The flag flows through :py:attr:`Deeploy.DeeployTypes.CodeGenVerbosity.microbenchmarkProfiling`
-into the :py:class:`Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark.PULPMicrobenchmark`
-code-transformation pass, which is registered at the outermost position of the PULPOpen
-``ForkTransformer`` and ``ClusterTransformer`` chains. Because it runs last, the wrapped region
-covers the full per-layer body, including all tiling, DMA, and memory-management code.
-
-Output Format
--------------
-
-Each layer emits one block of statistics on ``core 0``:
+Each layer prints one block on ``core 0``:
 
 .. code-block:: text
 
@@ -37,48 +17,8 @@ Each layer emits one block of statistics on ``core 0``:
     Cycles:                    1442
     Instructions:               149
     IPC:                      0.103
+    ... Instruction Mix: Loads / Stores / Branches / Taken Branches / RVC
+    ... Stalls & Hazards: Load Stalls / Jump Stalls / I-cache Misses / TCDM Contentions
+    ... Memory Hierarchy: External Loads / Stores and their cycle counts
 
-    --- Instruction Mix ---
-    Loads:                       24 (16.11%)
-    Stores:                      27 (18.12%)
-    Branches:                     5 (3.36%)
-    Taken Branches:               2 (40.00%)
-    Compressed (RVC):             0 (0.00%)
-
-    --- Stalls & Hazards ---
-    Load Stalls:                  0
-    Jump Stalls:                  0
-    I-cache Misses:             724
-    TCDM Contentions:             0
-
-    --- Memory Hierarchy ---
-    External Loads:               0 (0.00%)
-    External Stores:              0 (0.00%)
-    Ext Load Cycles:              0 (avg: 0.00)
-    Ext Store Cycles:             0 (avg: 0.00)
-    ========================================
-
-Underlying Helpers
-------------------
-
-The C-side helpers live in ``TargetLibraries/PULPOpen/inc/perf_utils.h`` and are included by
-default in PULPOpen builds via ``Platform.py``. The pass injects:
-
-- ``perf_bench_init()`` / ``perf_bench_start()`` / ``perf_bench_read(&start)`` before the layer body
-- ``perf_bench_stop()`` / ``perf_bench_read(&end)`` / ``perf_bench_diff(&total, &end, &start)`` /
-  ``perf_bench_print("<layer>", &total)`` after it
-
-All counters listed in ``perf_stats_t`` are configured at once in ``pi_perf_conf``, so a single
-wrap captures the full event set.
-
-Notes & Caveats
----------------
-
-- **External memory counters** (``LD_EXT``, ``ST_EXT``, ``LD_EXT_CYC``, ``ST_EXT_CYC``) only show
-  non-zero values when the wrapped region performs L2/L3 traffic. Untiled tests that fit in L1/TCDM
-  will report zero.
-- **TCDM contention** depends on the access pattern — regular, bank-friendly kernels (e.g. element-wise
-  Add) can legitimately report zero contention even with all 8 cores active.
-- Some events may not be modelled by GVSoC; verify on a tiled test (e.g. Siracusa-tiled GEMM) before
-  concluding a counter is broken.
-- Output is printed by ``core 0`` only to keep logs readable.
+The external-memory counters read zero when the wrapped region has no L2/L3 traffic (e.g. small untiled kernels that fit in L1), and the TCDM-contention counter reads zero when the access pattern causes no bank conflicts. Some events may not be modelled by GVSoC — verify on a tiled test before assuming a counter is broken.