diff --git a/.github/workflows/_runner-chimera.yml b/.github/workflows/_runner-chimera.yml
index 14e80631d1..c642bfe6d2 100644
--- a/.github/workflows/_runner-chimera.yml
+++ b/.github/workflows/_runner-chimera.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-cortexm.yml b/.github/workflows/_runner-cortexm.yml
index 3fbdf0ee16..c6be8af465 100644
--- a/.github/workflows/_runner-cortexm.yml
+++ b/.github/workflows/_runner-cortexm.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-gap9-tiled.yml b/.github/workflows/_runner-gap9-tiled.yml
index a5c8b3ac98..6934014447 100644
--- a/.github/workflows/_runner-gap9-tiled.yml
+++ b/.github/workflows/_runner-gap9-tiled.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-gap9.yml b/.github/workflows/_runner-gap9.yml
index e1d6e452a6..cc790d3d33 100644
--- a/.github/workflows/_runner-gap9.yml
+++ b/.github/workflows/_runner-gap9.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-generic.yml b/.github/workflows/_runner-generic.yml
index 6681cbac96..b44b47f73d 100644
--- a/.github/workflows/_runner-generic.yml
+++ b/.github/workflows/_runner-generic.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-mempool.yml b/.github/workflows/_runner-mempool.yml
index deb4809330..b2f0ae4f7a 100644
--- a/.github/workflows/_runner-mempool.yml
+++ b/.github/workflows/_runner-mempool.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-siracusa-neureka-tiled.yml b/.github/workflows/_runner-siracusa-neureka-tiled.yml
index b1f5f2fcb3..664d5f01be 100644
--- a/.github/workflows/_runner-siracusa-neureka-tiled.yml
+++ b/.github/workflows/_runner-siracusa-neureka-tiled.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml
index ea9c8989af..cc09f234e0 100644
--- a/.github/workflows/_runner-siracusa-tiled.yml
+++ b/.github/workflows/_runner-siracusa-tiled.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-siracusa.yml b/.github/workflows/_runner-siracusa.yml
index ea8fe5d405..1c51333f7a 100644
--- a/.github/workflows/_runner-siracusa.yml
+++ b/.github/workflows/_runner-siracusa.yml
@@ -25,6 +25,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-snitch-tiled-sequential.yml b/.github/workflows/_runner-snitch-tiled-sequential.yml
index fbd5195b08..bcdd58a166 100644
--- a/.github/workflows/_runner-snitch-tiled-sequential.yml
+++ b/.github/workflows/_runner-snitch-tiled-sequential.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-snitch.yml b/.github/workflows/_runner-snitch.yml
index bc599e4fe7..48130ea26a 100644
--- a/.github/workflows/_runner-snitch.yml
+++ b/.github/workflows/_runner-snitch.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/_runner-softhier.yml b/.github/workflows/_runner-softhier.yml
index b067664f40..2624cbe15d 100644
--- a/.github/workflows/_runner-softhier.yml
+++ b/.github/workflows/_runner-softhier.yml
@@ -24,6 +24,8 @@ jobs:
     container:
       image: ${{ inputs.docker-image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/ci-deeploy.yml b/.github/workflows/ci-deeploy.yml
index fc468306b1..84f2779e4c 100644
--- a/.github/workflows/ci-deeploy.yml
+++ b/.github/workflows/ci-deeploy.yml
@@ -35,6 +35,8 @@ jobs:
     container:
       image: ${{ needs.select-env.outputs.image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
@@ -49,6 +51,8 @@ jobs:
     container:
       image: ${{ needs.select-env.outputs.image }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/infra-generate-ccache-gap9.yml b/.github/workflows/infra-generate-ccache-gap9.yml
index f9010adb1d..5b456e1d64 100644
--- a/.github/workflows/infra-generate-ccache-gap9.yml
+++ b/.github/workflows/infra-generate-ccache-gap9.yml
@@ -23,6 +23,8 @@ jobs:
     container:
       image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/infra-generate-ccache.yml b/.github/workflows/infra-generate-ccache.yml
index bd6e2c7787..e2d54eaa83 100644
--- a/.github/workflows/infra-generate-ccache.yml
+++ b/.github/workflows/infra-generate-ccache.yml
@@ -22,6 +22,8 @@ jobs:
     container:
       image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy:devel' }}
     steps:
+      - name: Mark workspace as safe
+        run: git config --global --add safe.directory '*'
       - name: Checkout Repo
         uses: actions/checkout@v4
         with:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 38c5b3ce35..42281c6f0a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 
 
 ### List of Pull Requests
+- Add Microbenchmarking Infrastructure and CI Using GVSoC CSR [#162](https://github.com/pulp-platform/Deeploy/pull/162)
 - Fix CI Cache Generation [#176](https://github.com/pulp-platform/Deeploy/pull/176)
 - Fix Broken CI [#175](https://github.com/pulp-platform/Deeploy/pull/175)
 - Improve Docstring and Debugging [#160](https://github.com/pulp-platform/Deeploy/pull/160)
@@ -23,6 +24,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - Shell Format pre-commit hook
 - Add integer MaxPool1D for Generic platform and RQSConv1D support for PULPOpen, with corresponding kernel tests.
 - Added GAP9 Platform Support: Deployer, Bindings, Templates, Tiler, DMA (L3Dma/MchanDma), target library, CI workflows
+- Per-layer microbenchmarking on PULPOpen via `--profileMicrobenchmark`: new `PULPMicrobenchmark` code-transformation pass + `perf_utils.h` helpers report cycles, instructions, stalls and cache misses per layer in `RunNetwork`
 
 ### Changed
 - Use by default `devel` container for GAP9 CI
diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py
index 797bd44c47..de5a66aae9 100644
--- a/Deeploy/DeeployTypes.py
+++ b/Deeploy/DeeployTypes.py
@@ -53,6 +53,7 @@ class CodeGenVerbosity:
 
     tilingProfiling: Optional[bool] = False  # Specifies if we should profile the tiling code
     untiledProfiling: Optional[bool] = None  #  Specifies if we should profile the untilied code
+    microbenchmarkProfiling: Optional[bool] = False  # Wrap each layer with PULP perf-counter microbenchmark
 
 
 _NoVerbosity = CodeGenVerbosity(None)
diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py
index 5d7b02ae62..2c78978e23 100644
--- a/Deeploy/Targets/PULPOpen/Bindings.py
+++ b/Deeploy/Targets/PULPOpen/Bindings.py
@@ -24,6 +24,7 @@
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling
+from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark import PULPMicrobenchmark
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPProfileUntiled import PULPProfileUntiled
 from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture
 from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack
@@ -115,6 +116,7 @@
     MemoryManagementGeneration("L2"),
     MemoryManagementGeneration("L3.*"),
     MemoryManagementGeneration(),
+    PULPMicrobenchmark(),
 ])
 
 ClusterTransformer = CodeTransformation([
@@ -133,6 +135,7 @@
     MemoryManagementGeneration("L2"),
     MemoryManagementGeneration("L3.*"),
     MemoryManagementGeneration(),
+    PULPMicrobenchmark(),
 ])
 
 SimpleTransformer = CodeTransformation([
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py
new file mode 100644
index 0000000000..bb35f32d47
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPMicrobenchmark.py
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Tuple
+
+from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \
+    NodeTemplate, _NoVerbosity
+
+
+class PULPMicrobenchmark(CodeTransformationPass):
+    """Bracket a layer's code with PULP perf-counter setup, readout and printing on core 0."""
+
+    _preTemplate = NodeTemplate("""
+    perf_stats_t ${op}_perf_start, ${op}_perf_end, ${op}_perf_total;
+    if (pi_core_id() == 0) {
+        perf_bench_init();
+        perf_bench_start();
+        perf_bench_read(&${op}_perf_start);
+    }
+    """)
+
+    _postTemplate = NodeTemplate("""
+    if (pi_core_id() == 0) {
+        perf_bench_stop();
+        perf_bench_read(&${op}_perf_end);
+        perf_bench_diff(&${op}_perf_total, &${op}_perf_end, &${op}_perf_start);
+        perf_bench_print("${op}", &${op}_perf_total);
+    }
+    """)
+
+    def apply(self,
+              ctxt: NetworkContext,
+              executionBlock: ExecutionBlock,
+              name: str,
+              verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]:
+        # Instrument only on request; ``name`` prefixes the C locals and labels the printed report.
+        if verbose.microbenchmarkProfiling:
+            executionBlock.addLeft(self._preTemplate, {"op": name})
+            executionBlock.addRight(self._postTemplate, {"op": name})
+
+        return ctxt, executionBlock
diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py
index 7456dd9e1b..f13e6451fb 100644
--- a/Deeploy/Targets/PULPOpen/Platform.py
+++ b/Deeploy/Targets/PULPOpen/Platform.py
@@ -248,7 +248,8 @@ class PULPStructBuffer(StructBuffer):
 
 # SCHEREMO: stdint is included before pulp_nn_kernels.h because it is supposed to be included in there, but isn't...
 _includeList = [
-    "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h"
+    "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h",
+    "perf_utils.h"
 ]
 
 
diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py
index f029be7361..0b25bc6bbe 100644
--- a/DeeployTest/generateNetwork.py
+++ b/DeeployTest/generateNetwork.py
@@ -141,6 +141,7 @@ def generateNetwork(args):
     verbosityCfg = _NoVerbosity
     if isinstance(platform, PULPPlatform):
         verbosityCfg.untiledProfiling = args.profileUntiled
+        verbosityCfg.microbenchmarkProfiling = args.profileMicrobenchmark
 
     # Parse graph and infer output levels and signedness
     _ = deployer.prepare(verbosityCfg)
@@ -172,6 +173,11 @@ def generateNetwork(args):
                         dest = 'profileUntiled',
                         default = False,
                         help = 'Profile Untiled for L2\n')
+    parser.add_argument('--profileMicrobenchmark',
+                        action = 'store_true',
+                        dest = 'profileMicrobenchmark',
+                        default = False,
+                        help = 'Wrap each layer with PULP perf-counter microbenchmark\n')
     parser.add_argument('--input-type-map',
                         nargs = '*',
                         default = [],
diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py
index 69e04343ff..9678bc4e4f 100644
--- a/DeeployTest/testMVP.py
+++ b/DeeployTest/testMVP.py
@@ -202,7 +202,10 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
                             - max: Initalize all variables at their maximal value.
                             - min: Initalize all variables at their minimal value.
                         """)
-    parser.add_argument('--profileTiling', action = "store_true")
+    parser.add_argument('--profileTiling', action = "store_true", help = 'Enable tiling profiling')
+    parser.add_argument('--profileMicrobenchmark',
+                        action = "store_true",
+                        help = 'Wrap each layer with PULP perf-counter microbenchmark instrumentation')
     parser.add_argument('--plotMemAlloc',
                         action = 'store_true',
                         help = 'Turn on plotting of the memory allocation and save it in the deeployState folder\n')
@@ -224,6 +227,9 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg
     if args.profileTiling:
         verbosityCfg.tilingProfiling = True
 
+    if args.profileMicrobenchmark:
+        verbosityCfg.microbenchmarkProfiling = True
+
     onnx_graph = onnx.load_model(f'{args.dir}/network.onnx')
     graph = gs.import_onnx(onnx_graph)
 
diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py
index a5a8d70ef3..fbbd95703e 100644
--- a/DeeployTest/testUtils/deeployRunner.py
+++ b/DeeployTest/testUtils/deeployRunner.py
@@ -94,6 +94,12 @@ def __init__(self,
                           action = 'store_true',
                           default = False,
                           help = 'Enable untiled profiling (Siracusa only)\n')
+        self.add_argument('--profileMicrobenchmark',
+                          '--profile-microbenchmark',
+                          dest = 'profileMicrobenchmark',
+                          action = 'store_true',
+                          default = False,
+                          help = 'Wrap each layer with PULP perf-counter microbenchmark\n')
         self.add_argument('--toolchain',
                           metavar = '<LLVM|GCC>',
                           dest = 'toolchain',
@@ -235,6 +241,9 @@ def create_config_from_args(args: argparse.Namespace,
     if not tiling and getattr(args, 'profileUntiled', False):
         gen_args_list.append("--profileUntiled")
 
+    if getattr(args, 'profileMicrobenchmark', False):
+        gen_args_list.append("--profileMicrobenchmark")
+
     config = DeeployTestConfig(
         test_name = test_name,
         test_dir = test_dir_abs,
diff --git a/DeeployTest/testUtils/pytestRunner.py b/DeeployTest/testUtils/pytestRunner.py
index 472d8cfed9..29119bba6f 100644
--- a/DeeployTest/testUtils/pytestRunner.py
+++ b/DeeployTest/testUtils/pytestRunner.py
@@ -45,6 +45,7 @@ def create_test_config(
     mem_alloc_strategy: str = "MiniMalloc",
     search_strategy: str = "random-max",
     profile_tiling: bool = False,
+    profile_microbenchmark: bool = False,
     plot_mem_alloc: bool = False,
     randomized_mem_scheduler: bool = False,
     profile_untiled: bool = False,
@@ -86,6 +87,8 @@ def create_test_config(
             gen_args_list.append(f"--searchStrategy={search_strategy}")
         if profile_tiling:
             gen_args_list.append("--profileTiling")
+        if profile_microbenchmark:
+            gen_args_list.append("--profileMicrobenchmark")
         if plot_mem_alloc:
             gen_args_list.append("--plotMemAlloc")
         if randomized_mem_scheduler:
diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py
index 9578c2f26c..18a4f870ac 100644
--- a/DeeployTest/testUtils/testRunner.py
+++ b/DeeployTest/testUtils/testRunner.py
@@ -211,6 +211,9 @@ def __init__(self, tiling_arguments: bool, description = None):
                               action = "store_true",
                               help = 'Enable randomized memory scheduler\n')
             self.add_argument('--profileTiling', action = 'store_true', help = 'Enable tiling profiling\n')
+            self.add_argument('--profileMicrobenchmark',
+                              action = 'store_true',
+                              help = 'Wrap each layer with PULP perf-counter microbenchmark\n')
             self.add_argument('--memAllocStrategy',
                               metavar = 'memAllocStrategy',
                               dest = 'memAllocStrategy',
@@ -271,6 +274,8 @@ def generate_cmd_args(self) -> str:
                 command += " --randomizedMemoryScheduler"
             if self.args.profileTiling:
                 command += f" --profileTiling"
+            if self.args.profileMicrobenchmark:
+                command += f" --profileMicrobenchmark"
             if self.args.memAllocStrategy:
                 command += f" --memAllocStrategy={self.args.memAllocStrategy}"
             if self.args.plotMemAlloc:
diff --git a/TargetLibraries/PULPOpen/inc/perf_utils.h b/TargetLibraries/PULPOpen/inc/perf_utils.h
new file mode 100644
index 0000000000..c710402ed2
--- /dev/null
+++ b/TargetLibraries/PULPOpen/inc/perf_utils.h
@@ -0,0 +1,149 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Performance Counter Utilities for PULP Benchmarking
+ */
+
+#ifndef __PERF_UTILS_H__
+#define __PERF_UTILS_H__
+
+#include "pmsis.h"
+
+// Aliases for CSR_PCER_* event indices (mask bit positions, read selectors)
+#define PI_PERF_CYCLES CSR_PCER_CYCLES
+#define PI_PERF_INSTR CSR_PCER_INSTR
+#define PI_PERF_LD_STALL CSR_PCER_LD_STALL
+#define PI_PERF_JMP_STALL CSR_PCER_JMP_STALL
+#define PI_PERF_IMISS CSR_PCER_IMISS
+#define PI_PERF_LD CSR_PCER_LD
+#define PI_PERF_ST CSR_PCER_ST
+#define PI_PERF_JUMP CSR_PCER_JUMP
+#define PI_PERF_BRANCH CSR_PCER_BRANCH
+#define PI_PERF_TAKEN_BRANCH CSR_PCER_TAKEN_BRANCH
+#define PI_PERF_RVC CSR_PCER_RVC
+#define PI_PERF_LD_EXT CSR_PCER_LD_EXT
+#define PI_PERF_ST_EXT CSR_PCER_ST_EXT
+#define PI_PERF_LD_EXT_CYC CSR_PCER_LD_EXT_CYC
+#define PI_PERF_ST_EXT_CYC CSR_PCER_ST_EXT_CYC
+#define PI_PERF_TCDM_CONT CSR_PCER_TCDM_CONT
+
+// Counter snapshot filled by perf_bench_read(); PI_PERF_JUMP has no field here
+typedef struct {
+  unsigned int cycles;       // PI_PERF_CYCLES
+  unsigned int instr;        // PI_PERF_INSTR
+  unsigned int ld;           // PI_PERF_LD
+  unsigned int st;           // PI_PERF_ST
+  unsigned int ld_stall;     // PI_PERF_LD_STALL
+  unsigned int jmp_stall;    // PI_PERF_JMP_STALL
+  unsigned int imiss;        // PI_PERF_IMISS
+  unsigned int branch;       // PI_PERF_BRANCH
+  unsigned int taken_branch; // PI_PERF_TAKEN_BRANCH
+  unsigned int rvc;          // PI_PERF_RVC
+  unsigned int ld_ext;       // PI_PERF_LD_EXT
+  unsigned int st_ext;       // PI_PERF_ST_EXT
+  unsigned int ld_ext_cyc;   // PI_PERF_LD_EXT_CYC
+  unsigned int st_ext_cyc;   // PI_PERF_ST_EXT_CYC
+  unsigned int tcdm_cont;    // PI_PERF_TCDM_CONT
+} perf_stats_t;
+
+// Select the events to count: one mask bit per PI_PERF_* index defined above
+static inline void perf_bench_init(void) {
+  // NOTE(review): signed `1 << idx` assumes every event index is < 31 - confirm
+  pi_perf_conf(
+      (1 << PI_PERF_CYCLES) | (1 << PI_PERF_INSTR) | (1 << PI_PERF_LD_STALL) |
+      (1 << PI_PERF_JMP_STALL) | (1 << PI_PERF_IMISS) | (1 << PI_PERF_LD) |
+      (1 << PI_PERF_ST) | (1 << PI_PERF_JUMP) | (1 << PI_PERF_BRANCH) |
+      (1 << PI_PERF_TAKEN_BRANCH) | (1 << PI_PERF_RVC) | (1 << PI_PERF_LD_EXT) |
+      (1 << PI_PERF_ST_EXT) | (1 << PI_PERF_LD_EXT_CYC) |
+      (1 << PI_PERF_ST_EXT_CYC) | (1 << PI_PERF_TCDM_CONT));
+}
+
+// Reset, then start, the counters via pi_perf_reset() / pi_perf_start()
+static inline void perf_bench_start(void) {
+  pi_perf_reset();
+  pi_perf_start();
+}
+
+// Stop the counters via pi_perf_stop()
+static inline void perf_bench_stop(void) { pi_perf_stop(); }
+
+// Copy the current value of every tracked counter into *stats
+static inline void perf_bench_read(perf_stats_t *stats) {
+  stats->cycles = pi_perf_read(PI_PERF_CYCLES);
+  stats->instr = pi_perf_read(PI_PERF_INSTR);
+  stats->ld = pi_perf_read(PI_PERF_LD);
+  stats->st = pi_perf_read(PI_PERF_ST);
+  stats->ld_stall = pi_perf_read(PI_PERF_LD_STALL);
+  stats->jmp_stall = pi_perf_read(PI_PERF_JMP_STALL);
+  stats->imiss = pi_perf_read(PI_PERF_IMISS);
+  stats->branch = pi_perf_read(PI_PERF_BRANCH);
+  stats->taken_branch = pi_perf_read(PI_PERF_TAKEN_BRANCH);
+  stats->rvc = pi_perf_read(PI_PERF_RVC);
+  stats->ld_ext = pi_perf_read(PI_PERF_LD_EXT);
+  stats->st_ext = pi_perf_read(PI_PERF_ST_EXT);
+  stats->ld_ext_cyc = pi_perf_read(PI_PERF_LD_EXT_CYC);
+  stats->st_ext_cyc = pi_perf_read(PI_PERF_ST_EXT_CYC);
+  stats->tcdm_cont = pi_perf_read(PI_PERF_TCDM_CONT);
+}
+
+// Print a formatted report of *stats under `label`; no-op unless on core 0
+static inline void perf_bench_print(const char *label, perf_stats_t *stats) {
+  if (pi_core_id() != 0)
+    return;
+  printf("\n=== Performance Statistics: %s ===\n", label);
+  printf("Cycles:              %10u\n", stats->cycles);
+  printf("Instructions:        %10u\n", stats->instr);
+  printf("IPC:                 %10.3f\n",
+         stats->cycles > 0 ? (float)stats->instr / stats->cycles : 0.0f);
+  printf("\n--- Instruction Mix ---\n");
+  printf("Loads:               %10u (%.2f%%)\n", stats->ld,
+         stats->instr > 0 ? 100.0f * stats->ld / stats->instr : 0.0f);
+  printf("Stores:              %10u (%.2f%%)\n", stats->st,
+         stats->instr > 0 ? 100.0f * stats->st / stats->instr : 0.0f);
+  printf("Branches:            %10u (%.2f%%)\n", stats->branch,
+         stats->instr > 0 ? 100.0f * stats->branch / stats->instr : 0.0f);
+  printf("Taken Branches:      %10u (%.2f%%)\n", stats->taken_branch,
+         stats->branch > 0 ? 100.0f * stats->taken_branch / stats->branch
+                           : 0.0f);
+  printf("Compressed (RVC):    %10u (%.2f%%)\n", stats->rvc,
+         stats->instr > 0 ? 100.0f * stats->rvc / stats->instr : 0.0f);
+  printf("\n--- Stalls & Hazards ---\n");
+  printf("Load Stalls:         %10u\n", stats->ld_stall);
+  printf("Jump Stalls:         %10u\n", stats->jmp_stall);
+  printf("I-cache Misses:      %10u\n", stats->imiss);
+  printf("TCDM Contentions:    %10u\n", stats->tcdm_cont);
+  printf("\n--- Memory Hierarchy ---\n");
+  printf("External Loads:      %10u (%.2f%%)\n", stats->ld_ext,
+         stats->ld > 0 ? 100.0f * stats->ld_ext / stats->ld : 0.0f);
+  printf("External Stores:     %10u (%.2f%%)\n", stats->st_ext,
+         stats->st > 0 ? 100.0f * stats->st_ext / stats->st : 0.0f);
+  printf("Ext Load Cycles:     %10u (avg: %.2f)\n", stats->ld_ext_cyc,
+         stats->ld_ext > 0 ? (float)stats->ld_ext_cyc / stats->ld_ext : 0.0f);
+  printf("Ext Store Cycles:    %10u (avg: %.2f)\n", stats->st_ext_cyc,
+         stats->st_ext > 0 ? (float)stats->st_ext_cyc / stats->st_ext : 0.0f);
+  printf("========================================\n\n");
+}
+
+// Store end - start for each field in *result (unsigned, wraps modulo 2^N)
+static inline void perf_bench_diff(perf_stats_t *result, perf_stats_t *end,
+                                   perf_stats_t *start) {
+  result->cycles = end->cycles - start->cycles;
+  result->instr = end->instr - start->instr;
+  result->ld = end->ld - start->ld;
+  result->st = end->st - start->st;
+  result->ld_stall = end->ld_stall - start->ld_stall;
+  result->jmp_stall = end->jmp_stall - start->jmp_stall;
+  result->imiss = end->imiss - start->imiss;
+  result->branch = end->branch - start->branch;
+  result->taken_branch = end->taken_branch - start->taken_branch;
+  result->rvc = end->rvc - start->rvc;
+  result->ld_ext = end->ld_ext - start->ld_ext;
+  result->st_ext = end->st_ext - start->st_ext;
+  result->ld_ext_cyc = end->ld_ext_cyc - start->ld_ext_cyc;
+  result->st_ext_cyc = end->st_ext_cyc - start->st_ext_cyc;
+  result->tcdm_cont = end->tcdm_cont - start->tcdm_cont;
+}
+
+#endif // __PERF_UTILS_H__
diff --git a/docs/tutorials/microbenchmark.rst b/docs/tutorials/microbenchmark.rst
new file mode 100644
index 0000000000..c005090020
--- /dev/null
+++ b/docs/tutorials/microbenchmark.rst
@@ -0,0 +1,24 @@
+.. SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+..
+.. SPDX-License-Identifier: Apache-2.0
+
+Microbenchmark
+==============
+
+Pass ``--profileMicrobenchmark`` to any PULPOpen runner (``testMVP.py``, ``generateNetwork.py``, ``deeployRunner_*.py``) to wrap each layer in ``RunNetwork`` with PULP performance counters. Off by default; when the flag is absent the pass returns the layer code unchanged, so no instrumentation is emitted.
+
+The flag flows through :py:attr:`Deeploy.DeeployTypes.CodeGenVerbosity.microbenchmarkProfiling` into :py:class:`Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPMicrobenchmark.PULPMicrobenchmark`, which is registered last in the PULPOpen ``ForkTransformer`` and ``ClusterTransformer`` chains so it covers the full per-layer body (tiling, DMA, memory management). The C-side helpers live in ``TargetLibraries/PULPOpen/inc/perf_utils.h``.
+
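+Each layer prints one block from ``core 0``. The listing below is abridged: its last three lines summarise the per-counter rows that are printed after the IPC line: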
+
+.. code-block:: text
+
+    === Performance Statistics: Add_0 ===
+    Cycles:                    1442
+    Instructions:               149
+    IPC:                      0.103
+    Loads / Stores / Branches / Taken Branches / RVC
+    Load Stalls / Jump Stalls / I-cache Misses / TCDM Contentions
+    External Loads / Stores and their cycle counts
+
+External-memory and TCDM-contention counters are zero when the wrapped region has no L2/L3 traffic or no bank conflicts (e.g. small untiled kernels that fit in L1). Some events may not be modelled by GVSoC — verify on a tiled test before assuming a counter is broken.
diff --git a/docs/tutorials/overview.rst b/docs/tutorials/overview.rst
index 0b3d97c761..c0a9660104 100644
--- a/docs/tutorials/overview.rst
+++ b/docs/tutorials/overview.rst
@@ -14,5 +14,6 @@ Each tutorial covers a specific topic and includes code examples to illustrate t
 
    introduction
    debugging
+   microbenchmark