pytorch · wwwind · May 6, 2026 · May 20, 2026 · digantdesai · May 21, 2026
@@ -375,6 +375,39 @@ List of model specific and optional passes:
     - `graph_module = ToDevicePass("cpu")(graph_module).graph_module`
     - backends/arm/test/misc/test_post_quant_device_switch.py
 
+## Profiling of VGF Backend
+
+VGF profiling now emits both host-side ExecuTorch event tracer ranges and Vulkan timestamp-query measurements. The host ranges split init into `VGF_INIT_*` phases, including `VGF_INIT_CREATE_DATA_GRAPH_PIPELINE`, and split execute into `VGF_COPY_INPUTS`, `VGF_QUEUE_SUBMIT`, `VGF_QUEUE_WAIT_IDLE`, `VGF_TIMESTAMP_QUERY_READBACK`, `VGF_DISPATCH_AND_WAIT`, and `VGF_COPY_OUTPUTS`. Vulkan timestamp queries are inserted into the recorded VGF command buffer around `vkCmdDispatchDataGraphARM()`, producing `VGF_DATA_GRAPH_DEVICE_TIME`, which measures device-side elapsed time for the submitted data-graph command buffer region. To collect a profile, build the VGF runner with event tracing enabled, run the model with an ETDump path, then convert the ETDump to Chrome trace JSON:
+
+```bash
+mkdir -p etdumps traces
+
+./cmake-out-vgf/executor_runner \
+  --model_path vgf_mobilenetv2_out/mobilenet_v2_vgf_int8.pte \
+  --num_executions 10 \
+  --etdump_path ./etdumps/vgf_timestamps.etdp \
+  --print_output none
+
+python ./backends/arm/scripts/etdump_to_chrome_trace.py \
+  --etdump_path ./etdumps/vgf_timestamps.etdp \
+  --output ./etdumps/vgf_timestamps_trace.json
+```
+
+Open the result in Chrome by navigating to `chrome://tracing`, selecting **Load**, and choosing `./traces/vgf_timestamps_trace.json`. The key fields to inspect are `VGF_INIT_CREATE_DATA_GRAPH_PIPELINE` for pipeline creation/init cost, `VGF_QUEUE_SUBMIT` and `VGF_QUEUE_WAIT_IDLE` for host-side submission/wait overhead, and `VGF_DATA_GRAPH_DEVICE_TIME` for device-side data-graph execution time.
+
+VGF profiling can emit optional Vulkan timestamp-query measurements. Vulkan timestamp queries are controlled by the `EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES` environment variable. Set it to `1` to insert timestamp queries into the recorded VGF command buffer around `vkCmdDispatchDataGraphARM()`. When enabled, the backend emits `VGF_DATA_GRAPH_DEVICE_TIME`, which measures device-side elapsed time for the submitted data-graph command buffer region. If `EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES` is unset or set to `0`, only host-side ExecuTorch event tracer ranges are collected and no Vulkan timestamp-query readback is performed. Note that the timestamp-query measurements will be printed out and not included into `.etdp`.
+
+So, in this case the command is:
+
+```bash
+EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES=1 \
+./cmake-out-vgf/executor_runner \
+  --model_path vgf_mobilenetv2_out/mobilenet_v2_vgf_int8.pte \
+  --num_executions 10 \
+  --etdump_path ./etdumps/vgf_timestamps.etdp \
+  --print_output none
+```
+
 ## Help & Improvements
 
 If you have problems or questions, or have suggestions for ways to improve the Arm backend, please reach out

@@ -6,13 +6,20 @@
  */
 
 #include <cinttypes>
+#include <list>
+#include <numeric>
+
 using namespace std;
 
 #include <c10/util/safe_numerics.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
 
+#ifdef ET_EVENT_TRACER_ENABLED
+#include <executorch/runtime/core/event_tracer_hooks_delegate.h>
+#endif
+
 using executorch::aten::Tensor;
 using executorch::runtime::ArrayRef;
 using executorch::runtime::Backend;
@@ -27,6 +34,13 @@ using executorch::runtime::MemoryAllocator;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 
+#ifdef ET_EVENT_TRACER_ENABLED
+using executorch::runtime::event_tracer_end_profiling_delegate;
+using executorch::runtime::event_tracer_start_profiling_delegate;
+using executorch::runtime::EventTracer;
+using executorch::runtime::EventTracerEntry;
+#endif
+
 // We use the platform and runtime environment provided by the Vulkan delegate
 #include <executorch/backends/vulkan/runtime/vk_api/vk_api.h>
 
@@ -69,7 +83,8 @@ VkResult vkml_allocate_basics(
     VkPhysicalDevice* physical_device,
     VkDevice* device,
     VkQueue* queue,
-    VkCommandPool* command_pool);
+    VkCommandPool* command_pool,
+    uint32_t* queue_family_index);
 
 void vkml_free_basics(
     VkInstance* instance,
@@ -104,7 +119,8 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
         &vk_physical_device,
         &vk_device,
         &vk_queue,
-        &vk_command_pool);
+        &vk_command_pool,
+        &vk_queue_family_index);
     if (result != VK_SUCCESS) {
       ET_LOG(
           Error, "Failed to initialize the Vulkan device error 0x%08X", result);
@@ -142,8 +158,31 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
       ArrayRef<CompileSpec> compile_specs) const override {
     ET_LOG(Info, "Entered VGF init");
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    EventTracer* event_tracer = context.event_tracer();
+
+    EventTracerEntry init_total_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_INIT_TOTAL",
+        /*delegate_debug_id=*/-1);
+
+    EventTracerEntry ensure_initialized_event =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "VGF_INIT_ENSURE_INITIALIZED",
+            /*delegate_debug_id=*/-1);
+#endif
+
     const_cast<VGFBackend*>(this)->ensure_initialized();
+
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, ensure_initialized_event);
+#endif
+
     if (!is_initialized_) {
+#ifdef ET_EVENT_TRACER_ENABLED
+      event_tracer_end_profiling_delegate(event_tracer, init_total_event);
+#endif
       ET_LOG(
           Error,
           "VGF backend is unavailable because Vulkan initialization failed");
@@ -152,30 +191,89 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
 
     const char* vgf_data = reinterpret_cast<const char*>(processed->data());
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    EventTracerEntry allocate_repr_event =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "VGF_INIT_ALLOCATE_REPR",
+            /*delegate_debug_id=*/-1);
+#endif
+
     MemoryAllocator* allocator = context.get_runtime_allocator();
     VgfRepr* repr = allocator->allocateInstance<VgfRepr>();
     new (repr) VgfRepr(
-        vk_instance, vk_physical_device, vk_device, vk_queue, vk_command_pool);
+        vk_instance,
+        vk_physical_device,
+        vk_device,
+        vk_queue,
+        vk_command_pool,
+        vk_queue_family_index);
+
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, allocate_repr_event);
+
+    EventTracerEntry process_vgf_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_INIT_PROCESS_VGF_BACKEND",
+        /*delegate_debug_id=*/-1);
+#endif
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    auto valid_vgf = repr->process_vgf(
+        vgf_data, processed->size(), compile_specs, event_tracer);
+#else
     auto valid_vgf =
         repr->process_vgf(vgf_data, processed->size(), compile_specs);
+#endif
+
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, process_vgf_event);
+#endif
+
     if (!valid_vgf) {
+#ifdef ET_EVENT_TRACER_ENABLED
+      event_tracer_end_profiling_delegate(event_tracer, init_total_event);
+#endif
       ET_LOG(Error, "Failed to process VGF blob.");
       return Error::Internal;
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, init_total_event);
+#endif
+
     return repr;
   }
 
   Error execute(
-      ET_UNUSED BackendExecutionContext& context,
+      BackendExecutionContext& context,
       DelegateHandle* handle,
       Span<EValue*> args) const override {
     VgfRepr* repr = static_cast<VgfRepr*>(handle);
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    EventTracer* event_tracer = context.event_tracer();
+
+    EventTracerEntry vgf_execute_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_EXECUTE",
+        /*delegate_debug_id=*/-1);
+
+    EventTracerEntry copy_inputs_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_COPY_INPUTS",
+        /*delegate_debug_id=*/-1);
+#else
+    (void)context;
+#endif
+
     // Copy all inputs from EValue to VkDeviceMemory
     for (int i = 0; i < repr->IOs.size(); i++) {
       if (!args[i]->isTensor()) {
+#ifdef ET_EVENT_TRACER_ENABLED
+        event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event);
+        event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
         ET_LOG(
             Error,
             "Expected EValue %d to be tensor, got %d",
@@ -206,22 +304,59 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
 
       void* data;
       if (!repr->map_io(io, &data)) {
+#ifdef ET_EVENT_TRACER_ENABLED
+        event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event);
+        event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
         ET_LOG(Error, "Failed to map Vulkan IO memory");
         return Error::Internal;
       }
       memcpy(data, tensor->mutable_data_ptr(), io_size);
       repr->unmap_io(io);
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event);
+
+    EventTracerEntry dispatch_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_DISPATCH_AND_WAIT",
+        /*delegate_debug_id=*/-1);
+#endif
+
     // Execute the workload
-    if (!repr->execute_vgf()) {
+    bool execute_ok = false;
+#ifdef ET_EVENT_TRACER_ENABLED
+    execute_ok = repr->execute_vgf(event_tracer);
+#else
+    execute_ok = repr->execute_vgf();
+#endif
+
+    if (!execute_ok) {
+#ifdef ET_EVENT_TRACER_ENABLED
+      event_tracer_end_profiling_delegate(event_tracer, dispatch_event);
+      event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
       ET_LOG(Error, "Failed to execute the VGF representation");
       return Error::Internal;
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, dispatch_event);
+
+    EventTracerEntry copy_outputs_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_COPY_OUTPUTS",
+        /*delegate_debug_id=*/-1);
+#endif
+
     // Copy all outputs from VKDeviceMemory to EValue
     for (int i = 0; i < repr->IOs.size(); i++) {
       if (!args[i]->isTensor()) {
+#ifdef ET_EVENT_TRACER_ENABLED
+        event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event);
+        event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
         ET_LOG(
             Error,
             "Expected EValue %d to be tensor, got %d",
@@ -251,13 +386,22 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
 
       void* data;
       if (!repr->map_io(io, &data)) {
+#ifdef ET_EVENT_TRACER_ENABLED
+        event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event);
+        event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
         ET_LOG(Error, "Failed to map Vulkan IO memory");
         return Error::Internal;
       }
       memcpy(tensor->mutable_data_ptr(), data, io_size);
       repr->unmap_io(io);
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event);
+    event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
+
     return Error::Ok;
   }
 
@@ -272,6 +416,7 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
   VkDevice vk_device = VK_NULL_HANDLE;
   VkQueue vk_queue = VK_NULL_HANDLE;
   VkCommandPool vk_command_pool = VK_NULL_HANDLE;
+  uint32_t vk_queue_family_index = UINT32_MAX;
   bool is_initialized_ = false;
 };
 
@@ -286,7 +431,8 @@ VkResult vkml_allocate_basics(
     VkPhysicalDevice* physical_device,
     VkDevice* device,
     VkQueue* queue,
-    VkCommandPool* command_pool) {
+    VkCommandPool* command_pool,
+    uint32_t* queue_family_index) {
   VkResult result;
 
   if (VK_SUCCESS != volkInitialize()) {
@@ -408,6 +554,9 @@ VkResult vkml_allocate_basics(
     ET_LOG(Error, "Failed to find suitable queue");
     return VK_ERROR_UNKNOWN;
   }
+  if (queue_family_index != nullptr) {
+    *queue_family_index = qf;
+  }
 
   // Device with ML tensor extension
   float qp = 1.0f;
@@ -544,4 +693,4 @@ VkResult vkml_allocate_basics(
 
 } // namespace vgf
 } // namespace backends
-} // namespace executorch
+} // namespace executorch