diff --git a/backends/arm/README.md b/backends/arm/README.md
index e9afa5a928d..965566d1458 100644
--- a/backends/arm/README.md
+++ b/backends/arm/README.md
@@ -375,6 +375,39 @@ List of model specific and optional passes:
     - `graph_module = ToDevicePass("cpu")(graph_module).graph_module`
     - backends/arm/test/misc/test_post_quant_device_switch.py
 
+## Profiling of VGF Backend
+
+VGF profiling emits host-side ExecuTorch event tracer ranges and, optionally, Vulkan timestamp-query measurements. The host ranges split init into `VGF_INIT_*` phases, including `VGF_INIT_CREATE_DATA_GRAPH_PIPELINE`, and split execute into `VGF_COPY_INPUTS`, `VGF_QUEUE_SUBMIT`, `VGF_QUEUE_WAIT_IDLE`, `VGF_TIMESTAMP_QUERY_READBACK`, `VGF_DISPATCH_AND_WAIT`, and `VGF_COPY_OUTPUTS`. When timestamp queries are enabled (see below), they are inserted into the recorded VGF command buffer around `vkCmdDispatchDataGraphARM()` and produce `VGF_DATA_GRAPH_DEVICE_TIME`, which measures device-side elapsed time for the submitted data-graph command buffer region. To collect a profile, build the VGF runner with event tracing enabled, run the model with an ETDump path, then convert the ETDump to Chrome trace JSON:
+
+```bash
+mkdir -p etdumps traces
+
+./cmake-out-vgf/executor_runner \
+  --model_path vgf_mobilenetv2_out/mobilenet_v2_vgf_int8.pte \
+  --num_executions 10 \
+  --etdump_path ./etdumps/vgf_timestamps.etdp \
+  --print_output none
+
+python ./backends/arm/scripts/etdump_to_chrome_trace.py \
+  --etdump_path ./etdumps/vgf_timestamps.etdp \
+  --output ./traces/vgf_timestamps_trace.json
+```
+
+Open the result in Chrome by navigating to `chrome://tracing`, selecting **Load**, and choosing `./traces/vgf_timestamps_trace.json`. The key fields to inspect are `VGF_INIT_CREATE_DATA_GRAPH_PIPELINE` for pipeline creation/init cost, `VGF_QUEUE_SUBMIT` and `VGF_QUEUE_WAIT_IDLE` for host-side submission/wait overhead, and `VGF_DATA_GRAPH_DEVICE_TIME` for device-side data-graph execution time.
+
+Vulkan timestamp-query measurements are optional and are controlled by the `EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES` environment variable. Set it to `1` to insert timestamp queries into the recorded VGF command buffer around `vkCmdDispatchDataGraphARM()`. When enabled, the backend emits `VGF_DATA_GRAPH_DEVICE_TIME`, which measures device-side elapsed time for the submitted data-graph command buffer region. If `EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES` is unset or set to `0`, only host-side ExecuTorch event tracer ranges are collected and no Vulkan timestamp-query readback is performed. Note that the timestamp-query measurements are printed to the log output and are not included in the `.etdp` file.
+
+To collect a profile with timestamp queries enabled, run:
+
+```bash
+EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES=1 \
+./cmake-out-vgf/executor_runner \
+  --model_path vgf_mobilenetv2_out/mobilenet_v2_vgf_int8.pte \
+  --num_executions 10 \
+  --etdump_path ./etdumps/vgf_timestamps.etdp \
+  --print_output none
+```
+
 ## Help & Improvements
 
 If you have problems or questions, or have suggestions for ways to improve the Arm backend, please reach out
diff --git a/backends/arm/runtime/VGFBackend.cpp b/backends/arm/runtime/VGFBackend.cpp
index 8ac804f7744..f6a7002d862 100644
--- a/backends/arm/runtime/VGFBackend.cpp
+++ b/backends/arm/runtime/VGFBackend.cpp
@@ -6,6 +6,9 @@
  */
 
 #include <cinttypes>
+#include <list>
+#include <numeric>
+
 using namespace std;
 
 #include <c10/util/safe_numerics.h>
@@ -13,6 +16,10 @@ using namespace std;
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
 
+#ifdef ET_EVENT_TRACER_ENABLED
+#include <executorch/runtime/core/event_tracer_hooks_delegate.h>
+#endif
+
 using executorch::aten::Tensor;
 using executorch::runtime::ArrayRef;
 using executorch::runtime::Backend;
@@ -27,6 +34,13 @@ using executorch::runtime::MemoryAllocator;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 
+#ifdef ET_EVENT_TRACER_ENABLED
+using executorch::runtime::event_tracer_end_profiling_delegate;
+using executorch::runtime::event_tracer_start_profiling_delegate;
+using executorch::runtime::EventTracer;
+using executorch::runtime::EventTracerEntry;
+#endif
+
 // We use the platform and runtime environment provided by the Vulkan delegate
 #include <executorch/backends/vulkan/runtime/vk_api/vk_api.h>
 
@@ -69,7 +83,8 @@ VkResult vkml_allocate_basics(
     VkPhysicalDevice* physical_device,
     VkDevice* device,
     VkQueue* queue,
-    VkCommandPool* command_pool);
+    VkCommandPool* command_pool,
+    uint32_t* queue_family_index);
 
 void vkml_free_basics(
     VkInstance* instance,
@@ -104,7 +119,8 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
         &vk_physical_device,
         &vk_device,
         &vk_queue,
-        &vk_command_pool);
+        &vk_command_pool,
+        &vk_queue_family_index);
     if (result != VK_SUCCESS) {
       ET_LOG(
           Error, "Failed to initialize the Vulkan device error 0x%08X", result);
@@ -142,8 +158,31 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
       ArrayRef<CompileSpec> compile_specs) const override {
     ET_LOG(Info, "Entered VGF init");
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    EventTracer* event_tracer = context.event_tracer();
+
+    EventTracerEntry init_total_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_INIT_TOTAL",
+        /*delegate_debug_id=*/-1);
+
+    EventTracerEntry ensure_initialized_event =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "VGF_INIT_ENSURE_INITIALIZED",
+            /*delegate_debug_id=*/-1);
+#endif
+
     const_cast<VGFBackend*>(this)->ensure_initialized();
+
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, ensure_initialized_event);
+#endif
+
     if (!is_initialized_) {
+#ifdef ET_EVENT_TRACER_ENABLED
+      event_tracer_end_profiling_delegate(event_tracer, init_total_event);
+#endif
       ET_LOG(
           Error,
           "VGF backend is unavailable because Vulkan initialization failed");
@@ -152,30 +191,89 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
 
     const char* vgf_data = reinterpret_cast<const char*>(processed->data());
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    EventTracerEntry allocate_repr_event =
+        event_tracer_start_profiling_delegate(
+            event_tracer,
+            "VGF_INIT_ALLOCATE_REPR",
+            /*delegate_debug_id=*/-1);
+#endif
+
     MemoryAllocator* allocator = context.get_runtime_allocator();
     VgfRepr* repr = allocator->allocateInstance<VgfRepr>();
     new (repr) VgfRepr(
-        vk_instance, vk_physical_device, vk_device, vk_queue, vk_command_pool);
+        vk_instance,
+        vk_physical_device,
+        vk_device,
+        vk_queue,
+        vk_command_pool,
+        vk_queue_family_index);
+
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, allocate_repr_event);
+
+    EventTracerEntry process_vgf_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_INIT_PROCESS_VGF_BACKEND",
+        /*delegate_debug_id=*/-1);
+#endif
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    auto valid_vgf = repr->process_vgf(
+        vgf_data, processed->size(), compile_specs, event_tracer);
+#else
     auto valid_vgf =
         repr->process_vgf(vgf_data, processed->size(), compile_specs);
+#endif
+
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, process_vgf_event);
+#endif
+
     if (!valid_vgf) {
+#ifdef ET_EVENT_TRACER_ENABLED
+      event_tracer_end_profiling_delegate(event_tracer, init_total_event);
+#endif
       ET_LOG(Error, "Failed to process VGF blob.");
       return Error::Internal;
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, init_total_event);
+#endif
+
     return repr;
   }
 
   Error execute(
-      ET_UNUSED BackendExecutionContext& context,
+      BackendExecutionContext& context,
       DelegateHandle* handle,
       Span<EValue*> args) const override {
     VgfRepr* repr = static_cast<VgfRepr*>(handle);
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    EventTracer* event_tracer = context.event_tracer();
+
+    EventTracerEntry vgf_execute_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_EXECUTE",
+        /*delegate_debug_id=*/-1);
+
+    EventTracerEntry copy_inputs_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_COPY_INPUTS",
+        /*delegate_debug_id=*/-1);
+#else
+    (void)context;
+#endif
+
     // Copy all inputs from EValue to VkDeviceMemory
     for (int i = 0; i < repr->IOs.size(); i++) {
       if (!args[i]->isTensor()) {
+#ifdef ET_EVENT_TRACER_ENABLED
+        event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event);
+        event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
         ET_LOG(
             Error,
             "Expected EValue %d to be tensor, got %d",
@@ -206,6 +304,10 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
 
       void* data;
       if (!repr->map_io(io, &data)) {
+#ifdef ET_EVENT_TRACER_ENABLED
+        event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event);
+        event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
         ET_LOG(Error, "Failed to map Vulkan IO memory");
         return Error::Internal;
       }
@@ -213,15 +315,48 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
       repr->unmap_io(io);
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event);
+
+    EventTracerEntry dispatch_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_DISPATCH_AND_WAIT",
+        /*delegate_debug_id=*/-1);
+#endif
+
     // Execute the workload
-    if (!repr->execute_vgf()) {
+    bool execute_ok = false;
+#ifdef ET_EVENT_TRACER_ENABLED
+    execute_ok = repr->execute_vgf(event_tracer);
+#else
+    execute_ok = repr->execute_vgf();
+#endif
+
+    if (!execute_ok) {
+#ifdef ET_EVENT_TRACER_ENABLED
+      event_tracer_end_profiling_delegate(event_tracer, dispatch_event);
+      event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
       ET_LOG(Error, "Failed to execute the VGF representation");
       return Error::Internal;
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, dispatch_event);
+
+    EventTracerEntry copy_outputs_event = event_tracer_start_profiling_delegate(
+        event_tracer,
+        "VGF_COPY_OUTPUTS",
+        /*delegate_debug_id=*/-1);
+#endif
+
     // Copy all outputs from VKDeviceMemory to EValue
     for (int i = 0; i < repr->IOs.size(); i++) {
       if (!args[i]->isTensor()) {
+#ifdef ET_EVENT_TRACER_ENABLED
+        event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event);
+        event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
         ET_LOG(
             Error,
             "Expected EValue %d to be tensor, got %d",
@@ -251,6 +386,10 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
 
       void* data;
       if (!repr->map_io(io, &data)) {
+#ifdef ET_EVENT_TRACER_ENABLED
+        event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event);
+        event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
         ET_LOG(Error, "Failed to map Vulkan IO memory");
         return Error::Internal;
       }
@@ -258,6 +397,11 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
       repr->unmap_io(io);
     }
 
+#ifdef ET_EVENT_TRACER_ENABLED
+    event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event);
+    event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
+#endif
+
     return Error::Ok;
   }
 
@@ -272,6 +416,7 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
   VkDevice vk_device = VK_NULL_HANDLE;
   VkQueue vk_queue = VK_NULL_HANDLE;
   VkCommandPool vk_command_pool = VK_NULL_HANDLE;
+  uint32_t vk_queue_family_index = UINT32_MAX;
   bool is_initialized_ = false;
 };
 
@@ -286,7 +431,8 @@ VkResult vkml_allocate_basics(
     VkPhysicalDevice* physical_device,
     VkDevice* device,
     VkQueue* queue,
-    VkCommandPool* command_pool) {
+    VkCommandPool* command_pool,
+    uint32_t* queue_family_index) {
   VkResult result;
 
   if (VK_SUCCESS != volkInitialize()) {
@@ -408,6 +554,9 @@ VkResult vkml_allocate_basics(
     ET_LOG(Error, "Failed to find suitable queue");
     return VK_ERROR_UNKNOWN;
   }
+  if (queue_family_index != nullptr) {
+    *queue_family_index = qf;
+  }
 
   // Device with ML tensor extension
   float qp = 1.0f;
@@ -544,4 +693,4 @@ VkResult vkml_allocate_basics(
 
 } // namespace vgf
 } // namespace backends
-} // namespace executorch
+} // namespace executorch
\ No newline at end of file
diff --git a/backends/arm/runtime/VGFSetup.cpp b/backends/arm/runtime/VGFSetup.cpp
index b62a6b2ec23..3e62ce1735f 100644
--- a/backends/arm/runtime/VGFSetup.cpp
+++ b/backends/arm/runtime/VGFSetup.cpp
@@ -12,6 +12,13 @@
 
 #include <executorch/backends/arm/runtime/VGFSetup.h>
 
+#include <cstdlib>
+#include <limits>
+
+#ifdef ET_EVENT_TRACER_ENABLED
+#include <executorch/runtime/core/event_tracer_hooks_delegate.h>
+#endif
+
 #include <vgf/decoder.hpp>
 #include <vgf/vulkan_helpers.generated.hpp>
 
@@ -31,8 +38,41 @@ namespace {
 constexpr int64_t kScalarSentinelDimension = 1;
 }
 
-#if defined(ET_ARM_VGF_DEBUG)
+#ifdef ET_EVENT_TRACER_ENABLED
+namespace {
+// Scope guard: starts a delegate profiling range, ends it on destruction.
+class ScopedVgfProfileEvent {
+ public:
+  ScopedVgfProfileEvent(
+      executorch::runtime::EventTracer* event_tracer,
+      const char* name)
+      : event_tracer_(event_tracer),
+        entry_(executorch::runtime::event_tracer_start_profiling_delegate(
+            event_tracer_,
+            name,
+            /*delegate_debug_id=*/-1)) {}
+  ~ScopedVgfProfileEvent() {
+    executorch::runtime::event_tracer_end_profiling_delegate(
+        event_tracer_, entry_);
+  }
+
+ private:
+  executorch::runtime::EventTracer* event_tracer_;
+  executorch::runtime::EventTracerEntry entry_;
+};
+} // namespace
+// VGF_PROFILE_SCOPE: line-unique scoped event, or a no-op without the tracer.
+#define VGF_CONCAT_INNER(a, b) a##b
+#define VGF_CONCAT(a, b) VGF_CONCAT_INNER(a, b)
+#define VGF_PROFILE_SCOPE(event_tracer, name)                      \
+  ScopedVgfProfileEvent VGF_CONCAT(_vgf_profile_scope_, __LINE__)( \
+      event_tracer, name)
+#else
+#define VGF_PROFILE_SCOPE(event_tracer, name) (void)(event_tracer)
+#endif
+
 // Debug function to inspect memory properties
+#if defined(ET_ARM_VGF_DEBUG)
 static string memory_flags_to_string(VkMemoryPropertyFlags flags) {
   if (flags == 0)
     return "0";
@@ -102,6 +142,153 @@ uint32_t get_memory_index(
   return memory_type;
 }
 
+// Creates the two-entry timestamp query pool when the user opts in through
+// EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES; unset, empty, or "0" keeps it off.
+// Lack of device support and pool-creation failure only disable the queries,
+// so this function always returns true.
+bool VgfRepr::init_timestamp_queries() {
+  const char* enable = std::getenv("EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES");
+  if (enable == nullptr || enable[0] == '\0' ||
+      (enable[0] == '0' && enable[1] == '\0')) {
+    ET_LOG(Info, "VGF timestamp queries disabled");
+    return true;
+  }
+  // Queries (or the pool) are already set up; do not create a second pool.
+  if (timestamp_queries_enabled || vk_timestamp_query_pool != VK_NULL_HANDLE) {
+    return true;
+  }
+  // UINT32_MAX marks an unknown queue family index.
+  if (vk_queue_family_index == UINT32_MAX) {
+    ET_LOG(Info, "VGF timestamp queries disabled: unknown queue family index");
+    return true;
+  }
+
+  uint32_t queue_family_count = 0;
+  vkGetPhysicalDeviceQueueFamilyProperties(
+      vk_physical, &queue_family_count, nullptr);
+  if (vk_queue_family_index >= queue_family_count) {
+    ET_LOG(
+        Info,
+        "VGF timestamp queries disabled: queue family index %u is out of range",
+        vk_queue_family_index);
+    return true;
+  }
+
+  vector<VkQueueFamilyProperties> queue_family_properties(queue_family_count);
+  vkGetPhysicalDeviceQueueFamilyProperties(
+      vk_physical, &queue_family_count, queue_family_properties.data());
+  timestamp_valid_bits =
+      queue_family_properties[vk_queue_family_index].timestampValidBits;
+  // Zero valid bits: this queue family cannot record timestamps.
+  if (timestamp_valid_bits == 0) {
+    ET_LOG(
+        Info,
+        "VGF timestamp queries disabled: queue family %u does not support timestamps",
+        vk_queue_family_index);
+    return true;
+  }
+
+  VkPhysicalDeviceProperties physical_device_properties;
+  vkGetPhysicalDeviceProperties(vk_physical, &physical_device_properties);
+  timestamp_period_ns =
+      static_cast<double>(physical_device_properties.limits.timestampPeriod);
+  if (timestamp_period_ns <= 0.0) {
+    ET_LOG(
+        Info,
+        "VGF timestampPeriod is %.6f; using fallback 52.0 ns/tick",
+        timestamp_period_ns);
+    timestamp_period_ns = 52.0;
+  }
+  // Two queries: read back later as the start and end timestamps.
+  VkQueryPoolCreateInfo query_pool_info{
+      .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
+      .pNext = nullptr,
+      .flags = 0,
+      .queryType = VK_QUERY_TYPE_TIMESTAMP,
+      .queryCount = 2,
+      .pipelineStatistics = 0,
+  };
+
+  VkResult result = vkCreateQueryPool(
+      vk_device, &query_pool_info, nullptr, &vk_timestamp_query_pool);
+  if (result != VK_SUCCESS) {
+    ET_LOG(
+        Info,
+        "VGF timestamp queries disabled: vkCreateQueryPool failed with %d",
+        result);
+    vk_timestamp_query_pool = VK_NULL_HANDLE;
+    return true;
+  }
+
+  timestamp_queries_enabled = true;
+
+  ET_LOG(
+      Info,
+      "VGF timestamp queries enabled: queue_family=%u valid_bits=%u period_ns=%.6f",
+      vk_queue_family_index,
+      timestamp_valid_bits,
+      timestamp_period_ns);
+
+  return true;
+}
+
+// Reads both timestamp queries and logs the elapsed device time via ET_LOG.
+// No-op if queries are disabled; event_tracer only scopes the readback.
+void VgfRepr::read_timestamp_queries(
+    executorch::runtime::EventTracer* event_tracer) {
+  if (!timestamp_queries_enabled || vk_timestamp_query_pool == VK_NULL_HANDLE) {
+    return;
+  }
+
+  uint64_t timestamps[2] = {0, 0};
+  VkResult result;
+
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_TIMESTAMP_QUERY_READBACK");
+    result = vkGetQueryPoolResults(
+        vk_device,
+        vk_timestamp_query_pool,
+        0,
+        2,
+        sizeof(timestamps),
+        timestamps,
+        sizeof(uint64_t),
+        VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
+  }
+
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to read VGF timestamp query results: %d", result);
+    return;
+  }
+
+  uint64_t start = timestamps[0];
+  uint64_t end = timestamps[1];
+  // Keep only the low timestamp_valid_bits bits of each raw value.
+  uint64_t mask = std::numeric_limits<uint64_t>::max();
+  if (timestamp_valid_bits < 64) {
+    mask = (1ULL << timestamp_valid_bits) - 1ULL;
+    start &= mask;
+    end &= mask;
+  }
+  // end < start is treated as a single wrap of the masked counter.
+  uint64_t delta_ticks;
+  if (end >= start) {
+    delta_ticks = end - start;
+  } else {
+    delta_ticks = (mask - start) + end + 1ULL;
+  }
+  // timestamp_period_ns converts the tick delta to nanoseconds.
+  const double duration_ns =
+      static_cast<double>(delta_ticks) * timestamp_period_ns;
+  const double duration_ms = duration_ns / 1000000.0;
+  ET_LOG(
+      Info,
+      "VGF_DATA_GRAPH_DEVICE_TIME ticks=%llu duration_ns=%.3f duration_ms=%.6f",
+      static_cast<unsigned long long>(delta_ticks),
+      duration_ns,
+      duration_ms);
+}
+
 /**
  * Tensor allocation helper function
  */
@@ -339,41 +526,51 @@ static void debug_print_modules(
 bool VgfRepr::process_vgf(
     const char* vgf_data,
     size_t vgf_size,
-    ArrayRef<CompileSpec> specs) {
+    ArrayRef<CompileSpec> specs,
+    executorch::runtime::EventTracer* event_tracer) {
+  VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_PROCESS_VGF");
+  (void)specs;
+
   ET_LOG(Info, "Preparing VGF as Vulkan objects");
 
   VkResult result;
 
-  // Prepare temporary decoders
-  unique_ptr<vgflib::HeaderDecoder> header_decoder =
-      vgflib::CreateHeaderDecoder(vgf_data, vgflib::HeaderSize(), vgf_size);
-  if (!header_decoder) {
-    ET_LOG(Error, "Failed to create VGF header decoder");
-    return false;
-  }
+  unique_ptr<vgflib::HeaderDecoder> header_decoder;
+  unique_ptr<vgflib::ModelSequenceTableDecoder> sequence_decoder;
+  unique_ptr<vgflib::ModuleTableDecoder> module_decoder;
+  unique_ptr<vgflib::ModelResourceTableDecoder> resource_decoder;
+  unique_ptr<vgflib::ConstantDecoder> constant_decoder;
 
-  unique_ptr<vgflib::ModelSequenceTableDecoder> sequence_decoder =
-      vgflib::CreateModelSequenceTableDecoder(
-          vgf_data + header_decoder->GetModelSequenceTableOffset(),
-          header_decoder->GetModelSequenceTableSize());
-  unique_ptr<vgflib::ModuleTableDecoder> module_decoder =
-      vgflib::CreateModuleTableDecoder(
-          vgf_data + header_decoder->GetModuleTableOffset(),
-          header_decoder->GetModuleTableSize());
-  unique_ptr<vgflib::ModelResourceTableDecoder> resource_decoder =
-      vgflib::CreateModelResourceTableDecoder(
-          vgf_data + header_decoder->GetModelResourceTableOffset(),
-          header_decoder->GetModelResourceTableSize());
-  unique_ptr<vgflib::ConstantDecoder> constant_decoder =
-      vgflib::CreateConstantDecoder(
-          vgf_data + header_decoder->GetConstantsOffset(),
-          header_decoder->GetConstantsSize());
-  // Check the VGF decoders
-  if (not(header_decoder && module_decoder && sequence_decoder &&
-          resource_decoder && constant_decoder && header_decoder->IsValid() &&
-          header_decoder->CheckVersion())) {
-    ET_LOG(Error, "Failed to process VGF file internalsr");
-    return false;
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_DECODE_TABLES");
+
+    // Prepare temporary decoders
+    header_decoder =
+        vgflib::CreateHeaderDecoder(vgf_data, vgflib::HeaderSize(), vgf_size);
+    if (!header_decoder) {
+      ET_LOG(Error, "Failed to create VGF header decoder");
+      return false;
+    }
+
+    sequence_decoder = vgflib::CreateModelSequenceTableDecoder(
+        vgf_data + header_decoder->GetModelSequenceTableOffset(),
+        header_decoder->GetModelSequenceTableSize());
+    module_decoder = vgflib::CreateModuleTableDecoder(
+        vgf_data + header_decoder->GetModuleTableOffset(),
+        header_decoder->GetModuleTableSize());
+    resource_decoder = vgflib::CreateModelResourceTableDecoder(
+        vgf_data + header_decoder->GetModelResourceTableOffset(),
+        header_decoder->GetModelResourceTableSize());
+    constant_decoder = vgflib::CreateConstantDecoder(
+        vgf_data + header_decoder->GetConstantsOffset(),
+        header_decoder->GetConstantsSize());
+    // Check the VGF decoders
+    if (not(header_decoder && module_decoder && sequence_decoder &&
+            resource_decoder && constant_decoder && header_decoder->IsValid() &&
+            header_decoder->CheckVersion())) {
+      ET_LOG(Error, "Failed to process VGF file internalsr");
+      return false;
+    }
   }
 
   // Parse the sequences in the VGF (while there can be multiple sequences of
@@ -381,22 +578,27 @@ bool VgfRepr::process_vgf(
   // GRAPH segment to be present.
   const int segment_id = 0;
 
-  debug_print_sequence(sequence_decoder);
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_PARSE_SEQUENCE_AND_MODULE");
+
+    debug_print_sequence(sequence_decoder);
 #if defined(ET_ARM_VGF_DEBUG)
-  debug_print_resources(resource_decoder);
+    debug_print_resources(resource_decoder);
 #endif
-  if (sequence_decoder->modelSequenceTableSize() != 1) {
-    ET_LOG(Error, "Expected sequence length 1");
-    return false;
-  }
-  if (sequence_decoder->getSegmentType(segment_id) !=
-      vgflib::ModuleType::GRAPH) {
-    ET_LOG(Error, "Expected segment to be of type GRAPH");
-    return false;
+    if (sequence_decoder->modelSequenceTableSize() != 1) {
+      ET_LOG(Error, "Expected sequence length 1");
+      return false;
+    }
+    if (sequence_decoder->getSegmentType(segment_id) !=
+        vgflib::ModuleType::GRAPH) {
+      ET_LOG(Error, "Expected segment to be of type GRAPH");
+      return false;
+    }
+
+    // Extract first segment and its associated module
+    debug_print_modules(module_decoder);
   }
 
-  // Extract first segment and it's associated module
-  debug_print_modules(module_decoder);
   auto segment_name = string(sequence_decoder->getSegmentName(segment_id));
   auto segment_module = sequence_decoder->getSegmentModuleIndex(segment_id);
 
@@ -405,18 +607,22 @@ bool VgfRepr::process_vgf(
       string(module_decoder->getModuleEntryPoint(segment_module));
   auto segment_m_spirv = module_decoder->getModuleCode(segment_module);
 
-  // Build a shader from the module
-  VkShaderModuleCreateInfo smci{
-      .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
-      .pNext = nullptr,
-      .flags = 0,
-      .codeSize = segment_m_spirv.size() * sizeof(uint32_t),
-      .pCode = segment_m_spirv.begin(),
-  };
-  result = vkCreateShaderModule(vk_device, &smci, nullptr, &vk_shader);
-  if (result != VK_SUCCESS) {
-    ET_LOG(Error, "Failed to load shader from segment %d", segment_module);
-    return false;
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_SHADER_MODULE");
+
+    // Build a shader from the module
+    VkShaderModuleCreateInfo smci{
+        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .codeSize = segment_m_spirv.size() * sizeof(uint32_t),
+        .pCode = segment_m_spirv.begin(),
+    };
+    result = vkCreateShaderModule(vk_device, &smci, nullptr, &vk_shader);
+    if (result != VK_SUCCESS) {
+      ET_LOG(Error, "Failed to load shader from segment %d", segment_module);
+      return false;
+    }
   }
 
   // Record our shader and entrypoint string
@@ -428,538 +634,674 @@ bool VgfRepr::process_vgf(
   vector<tuple<VkTensorARM, VkTensorViewARM>> resources;
   vector<VkDataGraphPipelineConstantARM> constants;
 
-  int IO_count = resource_decoder->size();
-  for (int i = 0; i < IO_count; i++) {
-    auto resource_type = resource_decoder->getDescriptorType(i).value_or(0);
-    auto resource_format = vgflib::ToVkFormat(resource_decoder->getVkFormat(i));
-
-    // Get tensor shape and strides
-    auto shape = resource_decoder->getTensorShape(i);
-    auto stride = resource_decoder->getTensorStride(i);
-    const auto shape_size = shape.size();
-
-    switch (resource_decoder->getCategory(i)) {
-      case vgflib::ResourceCategory::INPUT:
-      case vgflib::ResourceCategory::OUTPUT: {
-        // Expect IO to be a tensor type
-        if (resource_type != VK_DESCRIPTOR_TYPE_TENSOR_ARM) {
-          ET_LOG(
-              Error,
-              "Expected tensor type descriptor %u got %u",
-              VK_DESCRIPTOR_TYPE_TENSOR_ARM,
-              resource_type);
-          return false;
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_RESOURCE_TABLE");
+
+    int IO_count = resource_decoder->size();
+    for (int i = 0; i < IO_count; i++) {
+      auto resource_type = resource_decoder->getDescriptorType(i).value_or(0);
+      auto resource_format =
+          vgflib::ToVkFormat(resource_decoder->getVkFormat(i));
+
+      // Get tensor shape and strides
+      auto shape = resource_decoder->getTensorShape(i);
+      auto stride = resource_decoder->getTensorStride(i);
+      const auto shape_size = shape.size();
+
+      switch (resource_decoder->getCategory(i)) {
+        case vgflib::ResourceCategory::INPUT:
+        case vgflib::ResourceCategory::OUTPUT: {
+          // Expect IO to be a tensor type
+          if (resource_type != VK_DESCRIPTOR_TYPE_TENSOR_ARM) {
+            ET_LOG(
+                Error,
+                "Expected tensor type descriptor %u got %u",
+                VK_DESCRIPTOR_TYPE_TENSOR_ARM,
+                resource_type);
+            return false;
+          }
+
+          // Allocate a tensor with backing memory
+          VkTensorARM tensor;
+          VkTensorViewARM tensor_view;
+          VkDeviceMemory tensor_memory;
+          VkTensorDescriptionARM tensor_description;
+
+          {
+            VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_ALLOCATE_IO_TENSOR");
+
+            result = allocate_tensor(
+                vk_physical,
+                vk_device,
+                resource_format,
+                shape_size == 0 ? 1 : static_cast<uint32_t>(shape_size),
+                shape_size == 0 ? &kScalarSentinelDimension : shape.begin(),
+                static_cast<uint32_t>(stride.size()),
+                stride.begin(),
+                &tensor_description,
+                &tensor_view,
+                &tensor,
+                &tensor_memory);
+          }
+
+          if (result != VK_SUCCESS) {
+            ET_LOG(Error, "Failed to allocate tensor for VGF resource %d", i);
+            return false;
+          }
+          size_t e_size = get_format_size(resource_format);
+          if (0 == e_size) {
+            ET_LOG(Error, "failed to get element size of VkFormat");
+            return false;
+          }
+
+          bool is_in = resource_decoder->getCategory(i) ==
+              vgflib::ResourceCategory::INPUT;
+          IOs.push_back(
+              IO{vector<int64_t>(shape.begin(), shape.end()),
+                 vector<int64_t>(stride.begin(), stride.end()),
+                 e_size,
+                 tensor,
+                 tensor_view,
+                 tensor_memory,
+                 is_in});
+          resources.push_back({tensor, tensor_view});
+          descriptors.push_back(tensor_description);
+          break;
         }
-
-        // Allocate a tensor with backing memory
-        VkTensorARM tensor;
-        VkTensorViewARM tensor_view;
-        VkDeviceMemory tensor_memory;
-        VkTensorDescriptionARM tensor_description;
-        result = allocate_tensor(
-            vk_physical,
-            vk_device,
-            resource_format,
-            shape_size == 0 ? 1 : static_cast<uint32_t>(shape_size),
-            shape_size == 0 ? &kScalarSentinelDimension : shape.begin(),
-            static_cast<uint32_t>(stride.size()),
-            stride.begin(),
-            &tensor_description,
-            &tensor_view,
-            &tensor,
-            &tensor_memory);
-        if (result != VK_SUCCESS) {
-          ET_LOG(Error, "Failed to allocate tensor for VGF resource %d", i);
+        case vgflib::ResourceCategory::CONSTANT:
+          // Constants just need a descriptor
+          descriptors.push_back(VkTensorDescriptionARM{
+              .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM,
+              .pNext = nullptr,
+              .tiling = VK_TENSOR_TILING_LINEAR_ARM,
+              .format = resource_format,
+              .dimensionCount =
+                  shape_size == 0 ? 1 : static_cast<uint32_t>(shape_size),
+              .pDimensions =
+                  shape_size == 0 ? &kScalarSentinelDimension : shape.begin(),
+              // Note: stride_data of 0's causes size==0, null means
+              // stride==size
+              .pStrides = (0 == stride.size() ? nullptr : stride.begin()),
+              .usage = VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM,
+          });
+          break;
+        case vgflib::ResourceCategory::INTERMEDIATE:
+          ET_LOG(Error, "Unsupported resource category INTERMEDIATE");
           return false;
-        }
-        size_t e_size = get_format_size(resource_format);
-        if (0 == e_size) {
-          ET_LOG(Error, "failed to get element size of VkFormat");
+        default:
+          ET_LOG(Info, "Unsupported resource category UNKNOWN");
           return false;
-        }
-
-        bool is_in =
-            resource_decoder->getCategory(i) == vgflib::ResourceCategory::INPUT;
-        IOs.push_back(
-            IO{vector<int64_t>(shape.begin(), shape.end()),
-               vector<int64_t>(stride.begin(), stride.end()),
-               e_size,
-               tensor,
-               tensor_view,
-               tensor_memory,
-               is_in});
-        resources.push_back({tensor, tensor_view});
-        descriptors.push_back(tensor_description);
-        break;
       }
-      case vgflib::ResourceCategory::CONSTANT:
-        // Constants just need a descriptor
-        descriptors.push_back(VkTensorDescriptionARM{
-            .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM,
-            .pNext = nullptr,
-            .tiling = VK_TENSOR_TILING_LINEAR_ARM,
-            .format = resource_format,
-            .dimensionCount =
-                shape_size == 0 ? 1 : static_cast<uint32_t>(shape_size),
-            .pDimensions =
-                shape_size == 0 ? &kScalarSentinelDimension : shape.begin(),
-            // Note: stride_data of 0's causes size==0, null means stride==size
-            .pStrides = (0 == stride.size() ? nullptr : stride.begin()),
-            .usage = VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM,
-        });
-        break;
-      case vgflib::ResourceCategory::INTERMEDIATE:
-        ET_LOG(Error, "Unsupported resource category INTERMEDIATE");
-        return false;
-      default:
-        ET_LOG(Info, "Unsupported resource category UNKNOWN");
-        return false;
     }
   }
 
-  // Constants table - mapping of shader bindings to MRT's and their descriptors
-  auto constant_indexes =
-      sequence_decoder->getSegmentConstantIndexes(segment_id);
-  for (uint32_t i : constant_indexes) {
-    auto mrt_i = constant_decoder->getConstantMrtIndex(i);
-    auto constant_data = constant_decoder->getConstant(i);
-    constants.push_back(VkDataGraphPipelineConstantARM{
-        .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CONSTANT_ARM,
-        .pNext = &descriptors[mrt_i],
-        .id = i,
-        .pConstantData = constant_data.begin(),
-    });
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CONSTANT_TABLE");
+
+    // Constants table - mapping of shader bindings to MRTs and their
+    // descriptors
+    auto constant_indexes =
+        sequence_decoder->getSegmentConstantIndexes(segment_id);
+    for (uint32_t i : constant_indexes) {
+      auto mrt_i = constant_decoder->getConstantMrtIndex(i);
+      auto constant_data = constant_decoder->getConstant(i);
+      constants.push_back(VkDataGraphPipelineConstantARM{
+          .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CONSTANT_ARM,
+          .pNext = &descriptors[mrt_i],
+          .id = i,
+          .pConstantData = constant_data.begin(),
+      });
+    }
   }
 
   // Prepare our layout bindings from the segment's information
   vector<VkDescriptorSetLayoutBinding> layout_bindings;
   vector<VkDataGraphPipelineResourceInfoARM> data_graph_resources;
 
-  auto set_count =
-      sequence_decoder->getSegmentDescriptorSetInfosSize(segment_id);
-  for (uint32_t d_idx = 0; d_idx < set_count; d_idx++) {
-    auto handle =
-        sequence_decoder->getDescriptorBindingSlotsHandle(segment_id, d_idx);
-    auto binding_count = sequence_decoder->getBindingsSize(handle);
-    for (int binding = 0; binding < binding_count; binding++) {
-      auto binding_index =
-          sequence_decoder->getBindingSlotBinding(handle, binding);
-      auto MRT_index =
-          sequence_decoder->getBindingSlotMrtIndex(handle, binding);
-      auto MRT_type = resource_decoder->getDescriptorType(MRT_index).value();
-
-      const VkDescriptorSetLayoutBinding layout_binding{
-          .binding = binding_index,
-          .descriptorType = vgflib::ToVkDescriptorType(MRT_type),
-          .descriptorCount = 1,
-          .stageFlags = VK_SHADER_STAGE_ALL,
-          .pImmutableSamplers = nullptr,
-      };
-      layout_bindings.push_back(layout_binding);
-
-      const VkDataGraphPipelineResourceInfoARM resource{
-          .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_RESOURCE_INFO_ARM,
-          // Note: we populate the resource_descriptors 1:1 with the MRT table,
-          // so can directly use that index into the resource_descriptors
-          .pNext = &descriptors[MRT_index],
-          .descriptorSet = d_idx,
-          .binding = binding_index,
-          .arrayElement = 0,
-      };
-      data_graph_resources.push_back(resource);
+  uint32_t set_count = 0;
+
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_DESCRIPTOR_METADATA");
+
+    set_count = sequence_decoder->getSegmentDescriptorSetInfosSize(segment_id);
+    for (uint32_t d_idx = 0; d_idx < set_count; d_idx++) {
+      auto handle =
+          sequence_decoder->getDescriptorBindingSlotsHandle(segment_id, d_idx);
+      auto binding_count = sequence_decoder->getBindingsSize(handle);
+      for (int binding = 0; binding < binding_count; binding++) {
+        auto binding_index =
+            sequence_decoder->getBindingSlotBinding(handle, binding);
+        auto MRT_index =
+            sequence_decoder->getBindingSlotMrtIndex(handle, binding);
+        auto MRT_type = resource_decoder->getDescriptorType(MRT_index).value();
+
+        const VkDescriptorSetLayoutBinding layout_binding{
+            .binding = binding_index,
+            .descriptorType = vgflib::ToVkDescriptorType(MRT_type),
+            .descriptorCount = 1,
+            .stageFlags = VK_SHADER_STAGE_ALL,
+            .pImmutableSamplers = nullptr,
+        };
+        layout_bindings.push_back(layout_binding);
+
+        const VkDataGraphPipelineResourceInfoARM resource{
+            .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_RESOURCE_INFO_ARM,
+            // Note: we populate the resource_descriptors 1:1 with the MRT
+            // table, so can directly use that index into the
+            // resource_descriptors
+            .pNext = &descriptors[MRT_index],
+            .descriptorSet = d_idx,
+            .binding = binding_index,
+            .arrayElement = 0,
+        };
+        data_graph_resources.push_back(resource);
+      }
     }
   }
 
-  // create fixed layout for this module
-  const VkDescriptorSetLayoutCreateInfo layout_info = {
-      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
-      .pNext = nullptr,
-      .flags = 0,
-      .bindingCount = static_cast<uint32_t>(layout_bindings.size()),
-      .pBindings = layout_bindings.data(),
-  };
-  result =
-      vkCreateDescriptorSetLayout(vk_device, &layout_info, nullptr, &vk_layout);
-  if (result != VK_SUCCESS) {
-    ET_LOG(Error, "Failed to create descriptor layout");
-    return false;
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_DESCRIPTOR_SET_LAYOUT");
+
+    // create fixed layout for this module
+    const VkDescriptorSetLayoutCreateInfo layout_info = {
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .bindingCount = static_cast<uint32_t>(layout_bindings.size()),
+        .pBindings = layout_bindings.data(),
+    };
+    result = vkCreateDescriptorSetLayout(
+        vk_device, &layout_info, nullptr, &vk_layout);
+    if (result != VK_SUCCESS) {
+      ET_LOG(Error, "Failed to create descriptor layout");
+      return false;
+    }
   }
 
-  std::vector<VkDescriptorPoolSize> poolSizes;
-  poolSizes.reserve(layout_bindings.size());
-  for (const auto& b : layout_bindings) {
-    bool found = false;
-    for (size_t idx = 0; idx < poolSizes.size(); ++idx) {
-      if (poolSizes[idx].type == b.descriptorType) {
-        poolSizes[idx].descriptorCount += b.descriptorCount;
-        found = true;
-        break;
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_DESCRIPTOR_POOL");
+
+    std::vector<VkDescriptorPoolSize> poolSizes;
+    poolSizes.reserve(layout_bindings.size());
+    for (const auto& b : layout_bindings) {
+      bool found = false;
+      for (size_t idx = 0; idx < poolSizes.size(); ++idx) {
+        if (poolSizes[idx].type == b.descriptorType) {
+          poolSizes[idx].descriptorCount += b.descriptorCount;
+          found = true;
+          break;
+        }
+      }
+      if (!found) {
+        poolSizes.push_back({b.descriptorType, b.descriptorCount});
       }
     }
-    if (!found) {
-      poolSizes.push_back({b.descriptorType, b.descriptorCount});
+
+    // Create descriptor pool and descriptors for pipeline
+    const VkDescriptorPoolCreateInfo descriptor_pool_info = {
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .maxSets = static_cast<uint32_t>(set_count),
+        .poolSizeCount = static_cast<uint32_t>(poolSizes.size()),
+        .pPoolSizes = poolSizes.data(),
+    };
+    result = vkCreateDescriptorPool(
+        vk_device, &descriptor_pool_info, nullptr, &vk_descriptor_pool);
+    if (result != VK_SUCCESS) {
+      ET_LOG(Error, "Failed to create descriptor pool");
+      return false;
     }
   }
 
-  // Create descriptor pool and descriptors for pipeline
-  const VkDescriptorPoolCreateInfo descriptor_pool_info = {
-      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
-      .pNext = nullptr,
-      .flags = 0,
-      .maxSets = static_cast<uint32_t>(set_count),
-      .poolSizeCount = static_cast<uint32_t>(poolSizes.size()),
-      .pPoolSizes = poolSizes.data(),
-  };
-  result = vkCreateDescriptorPool(
-      vk_device, &descriptor_pool_info, nullptr, &vk_descriptor_pool);
-  if (result != VK_SUCCESS) {
-    ET_LOG(Error, "Failed to create descriptor pool");
-    return false;
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_ALLOCATE_DESCRIPTOR_SETS");
+
+    const VkDescriptorSetAllocateInfo descriptor_set_info = {
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+        .pNext = nullptr,
+        .descriptorPool = vk_descriptor_pool,
+        .descriptorSetCount = static_cast<uint32_t>(set_count),
+        .pSetLayouts = &vk_layout,
+    };
+
+    // Allocate descriptor sets.
+    // Currently, because we require modelSequenceTableSize to equal 1,
+    // we can only get one descriptor set.
+    descriptor_sets.resize(layout_bindings.size());
+    result = vkAllocateDescriptorSets(
+        vk_device, &descriptor_set_info, descriptor_sets.data());
+    if (result != VK_SUCCESS) {
+      ET_LOG(Error, "Failed to allocate descriptor sets");
+      return false;
+    }
   }
 
-  const VkDescriptorSetAllocateInfo descriptor_set_info = {
-      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
-      .pNext = nullptr,
-      .descriptorPool = vk_descriptor_pool,
-      .descriptorSetCount = static_cast<uint32_t>(set_count),
-      .pSetLayouts = &vk_layout,
-  };
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_UPDATE_DESCRIPTOR_SETS");
 
-  // Alloc descriptor sets
-  // currently, as we require modelSequenceTableSize to == 1
-  // we can only get one descriptor set.
-  descriptor_sets.resize(layout_bindings.size());
-  result = vkAllocateDescriptorSets(
-      vk_device, &descriptor_set_info, descriptor_sets.data());
-  if (result != VK_SUCCESS) {
-    ET_LOG(Error, "Failed to allocate descriptor sets");
-    return false;
+    // write descriptor updates for every input
+    auto input_slots =
+        sequence_decoder->getSegmentInputBindingSlotsHandle(segment_id);
+    auto input_size = sequence_decoder->getBindingsSize(input_slots);
+    for (uint32_t i = 0; i < input_size; i++) {
+      auto binding = sequence_decoder->getBindingSlotBinding(input_slots, i);
+      auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(input_slots, i);
+
+      VkWriteDescriptorSetTensorARM write_desc = {
+          .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM,
+          .pNext = nullptr,
+          .tensorViewCount = 1,
+          .pTensorViews = &get<1>(resources[i]),
+      };
+      VkWriteDescriptorSet desc_set = {
+          .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+          .pNext = &write_desc,
+          .dstSet = descriptor_sets[0],
+          .dstBinding = binding,
+          .dstArrayElement = 0,
+          .descriptorCount = 1,
+          .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM,
+          .pImageInfo = nullptr,
+          .pBufferInfo = nullptr,
+          .pTexelBufferView = nullptr,
+      };
+      vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr);
+    }
+
+    // write descriptor updates for every output
+    auto output_slots =
+        sequence_decoder->getSegmentOutputBindingSlotsHandle(segment_id);
+    auto output_size = sequence_decoder->getBindingsSize(output_slots);
+    for (uint32_t i = 0; i < output_size; i++) {
+      auto binding = sequence_decoder->getBindingSlotBinding(output_slots, i);
+      auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(output_slots, i);
+
+      VkWriteDescriptorSetTensorARM write_desc = {
+          .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM,
+          .pNext = nullptr,
+          .tensorViewCount = 1,
+          .pTensorViews = &get<1>(resources[i + input_size]),
+      };
+      VkWriteDescriptorSet desc_set = {
+          .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+          .pNext = &write_desc,
+          .dstSet = descriptor_sets[0],
+          .dstBinding = binding,
+          .dstArrayElement = 0,
+          .descriptorCount = 1,
+          .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM,
+          .pImageInfo = nullptr,
+          .pBufferInfo = nullptr,
+          .pTexelBufferView = nullptr,
+      };
+      vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr);
+    }
   }
 
-  // write descriptor updates for every input
-  auto input_slots =
-      sequence_decoder->getSegmentInputBindingSlotsHandle(segment_id);
-  auto input_size = sequence_decoder->getBindingsSize(input_slots);
-  for (uint32_t i = 0; i < input_size; i++) {
-    auto binding = sequence_decoder->getBindingSlotBinding(input_slots, i);
-    auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(input_slots, i);
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_PIPELINE_LAYOUT");
 
-    VkWriteDescriptorSetTensorARM write_desc = {
-        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM,
+    // create our pipeline
+    VkPipelineLayoutCreateInfo pipeline_layout_info = {
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .pNext = nullptr,
-        .tensorViewCount = 1,
-        .pTensorViews = &get<1>(resources[i]),
+        .flags = 0,
+        .setLayoutCount = 1,
+        .pSetLayouts = &vk_layout,
+        .pushConstantRangeCount = 0,
+        .pPushConstantRanges = nullptr,
     };
-    VkWriteDescriptorSet desc_set = {
-        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
-        .pNext = &write_desc,
-        .dstSet = descriptor_sets[0],
-        .dstBinding = binding,
-        .dstArrayElement = 0,
-        .descriptorCount = 1,
-        .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM,
-        .pImageInfo = nullptr,
-        .pBufferInfo = nullptr,
-        .pTexelBufferView = nullptr,
-    };
-    vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr);
+    result = vkCreatePipelineLayout(
+        vk_device, &pipeline_layout_info, nullptr, &vk_pipeline_layout);
+    if (result != VK_SUCCESS) {
+      ET_LOG(Error, "Failed to create pipeline layout");
+      return false;
+    }
   }
 
-  // write descriptor updates for every output
-  auto output_slots =
-      sequence_decoder->getSegmentOutputBindingSlotsHandle(segment_id);
-  auto output_size = sequence_decoder->getBindingsSize(output_slots);
-  for (uint32_t i = 0; i < output_size; i++) {
-    auto binding = sequence_decoder->getBindingSlotBinding(output_slots, i);
-    auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(output_slots, i);
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_DATA_GRAPH_PIPELINE");
 
-    VkWriteDescriptorSetTensorARM write_desc = {
-        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM,
+    // Shader Module Create
+    VkDataGraphPipelineShaderModuleCreateInfoARM shader_info{
+        .sType =
+            VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SHADER_MODULE_CREATE_INFO_ARM,
         .pNext = nullptr,
-        .tensorViewCount = 1,
-        .pTensorViews = &get<1>(resources[i + input_size]),
-    };
-    VkWriteDescriptorSet desc_set = {
-        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
-        .pNext = &write_desc,
-        .dstSet = descriptor_sets[0],
-        .dstBinding = binding,
-        .dstArrayElement = 0,
-        .descriptorCount = 1,
-        .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM,
-        .pImageInfo = nullptr,
-        .pBufferInfo = nullptr,
-        .pTexelBufferView = nullptr,
+        .module = get<0>(shader_modules[0]),
+        .pName = get<1>(shader_modules[0]).c_str(),
+        .pSpecializationInfo = nullptr,
+        .constantCount = static_cast<uint32_t>(constants.size()),
+        .pConstants = constants.data(),
     };
-    vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr);
-  }
 
-  // create our pipeline
-  VkPipelineLayoutCreateInfo pipeline_layout_info = {
-      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
-      .pNext = nullptr,
-      .flags = 0,
-      .setLayoutCount = 1,
-      .pSetLayouts = &vk_layout,
-      .pushConstantRangeCount = 0,
-      .pPushConstantRanges = nullptr,
-  };
-  result = vkCreatePipelineLayout(
-      vk_device, &pipeline_layout_info, nullptr, &vk_pipeline_layout);
-  if (result != VK_SUCCESS) {
-    ET_LOG(Error, "Failed to create pipeline layout");
-    return false;
-  }
-
-  // Shader Module Create
-  VkDataGraphPipelineShaderModuleCreateInfoARM shader_info{
-      .sType =
-          VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SHADER_MODULE_CREATE_INFO_ARM,
-      .pNext = nullptr,
-      .module = get<0>(shader_modules[0]),
-      .pName = get<1>(shader_modules[0]).c_str(),
-      .pSpecializationInfo = nullptr,
-      .constantCount = static_cast<uint32_t>(constants.size()),
-      .pConstants = constants.data(),
-  };
+    // Prepare Graph Pipeline
+    VkDataGraphPipelineCreateInfoARM graph_pipeline_info{
+        .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CREATE_INFO_ARM,
+        .pNext = &shader_info,
+        .flags = VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR,
+        .layout = vk_pipeline_layout,
+        .resourceInfoCount = static_cast<uint32_t>(data_graph_resources.size()),
+        .pResourceInfos = data_graph_resources.data(),
+    };
 
-  // Prepare Graph Pipeline
-  VkDataGraphPipelineCreateInfoARM graph_pipeline_info{
-      .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CREATE_INFO_ARM,
-      .pNext = &shader_info,
-      .flags = VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR,
-      .layout = vk_pipeline_layout,
-      .resourceInfoCount = static_cast<uint32_t>(data_graph_resources.size()),
-      .pResourceInfos = data_graph_resources.data(),
-  };
+    result = vkCreateDataGraphPipelinesARM(
+        vk_device, // device
+        VK_NULL_HANDLE, // deferredOperation
+        VK_NULL_HANDLE, // VkPipelineCache
+        1, // createInfoCount
+        &graph_pipeline_info, // pCreateInfos
+        nullptr, // pAllocator
+        &vk_pipeline // pPipelines (VkPipeline*)
+    );
 
-  result = vkCreateDataGraphPipelinesARM(
-      vk_device, // device
-      VK_NULL_HANDLE, // deferredOperation
-      VK_NULL_HANDLE, // VkPipelineCache
-      1, // createInfoCount
-      &graph_pipeline_info, // pCreateInfos
-      nullptr, // pAllocator
-      &vk_pipeline // pPipelines (VkPipeline*)
-  );
-  if (result != VK_SUCCESS) {
-    ET_LOG(Error, "Failed to create DataGraphPipeline");
-    return false;
+    if (result != VK_SUCCESS) {
+      ET_LOG(Error, "Failed to create DataGraphPipeline");
+      return false;
+    }
   }
 
   // prepare the graph pipeline session
-  VkDataGraphPipelineSessionCreateInfoARM pipeline_session_info{
-      .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_CREATE_INFO_ARM,
-      .pNext = nullptr,
-      .flags = 0,
-      .dataGraphPipeline = vk_pipeline,
-  };
-  result = vkCreateDataGraphPipelineSessionARM(
-      vk_device, &pipeline_session_info, nullptr, &vk_session);
-  if (result != VK_SUCCESS) {
-    ET_LOG(Error, "Failed to create DataGraphPipelineSession");
-    return false;
-  }
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_PIPELINE_SESSION");
 
-  // Allocate command buffer
-  VkCommandBufferAllocateInfo buffer_allocate_info{
-      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
-      .pNext = nullptr,
-      .commandPool = vk_command_pool,
-      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
-      .commandBufferCount = 1};
-  result = vkAllocateCommandBuffers(
-      vk_device, &buffer_allocate_info, &vk_execute_cmd);
-  if (result != VK_SUCCESS) {
-    ET_LOG(Error, "Failed to allocate command buffers");
-    return false;
+    VkDataGraphPipelineSessionCreateInfoARM pipeline_session_info{
+        .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_CREATE_INFO_ARM,
+        .pNext = nullptr,
+        .flags = 0,
+        .dataGraphPipeline = vk_pipeline,
+    };
+    result = vkCreateDataGraphPipelineSessionARM(
+        vk_device, &pipeline_session_info, nullptr, &vk_session);
+    if (result != VK_SUCCESS) {
+      ET_LOG(Error, "Failed to create DataGraphPipelineSession");
+      return false;
+    }
   }
 
-  // Allocate intermediates memory based on the pipeline requirements provided
-  // by the driver
-  VkDataGraphPipelineSessionBindPointRequirementsInfoARM
-      bind_point_requirements_info = {
-          .sType =
-              VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_REQUIREMENTS_INFO_ARM,
-          .pNext = nullptr,
-          .session = vk_session,
-      };
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_ALLOCATE_COMMAND_BUFFER");
 
-  uint32_t bind_point_count = 0;
-  result = vkGetDataGraphPipelineSessionBindPointRequirementsARM(
-      vk_device, &bind_point_requirements_info, &bind_point_count, nullptr);
-  if (result != VK_SUCCESS) {
-    ET_LOG(Error, "Failed to get session bind point count");
-    return false;
+    // Allocate command buffer
+    VkCommandBufferAllocateInfo buffer_allocate_info{
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+        .pNext = nullptr,
+        .commandPool = vk_command_pool,
+        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+        .commandBufferCount = 1};
+    result = vkAllocateCommandBuffers(
+        vk_device, &buffer_allocate_info, &vk_execute_cmd);
+    if (result != VK_SUCCESS) {
+      ET_LOG(Error, "Failed to allocate command buffers");
+      return false;
+    }
   }
 
-  vector<VkDataGraphPipelineSessionBindPointRequirementARM>
-      bind_point_requirements;
-  bind_point_requirements.resize(bind_point_count);
-  result = vkGetDataGraphPipelineSessionBindPointRequirementsARM(
-      vk_device,
-      &bind_point_requirements_info,
-      &bind_point_count,
-      bind_point_requirements.data());
-  if (result != VK_SUCCESS) {
-    ET_LOG(Error, "Failed to get session bind point requirements");
-    return false;
-  }
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_ALLOCATE_TRANSIENT_MEMORY");
 
-  // Given the bind points, just make individual allocations and bind them
-  for (const auto& bind_point_requirement : bind_point_requirements) {
-    // These are the only allowed type and bindpoint with the current spec
-    if (bind_point_requirement.bindPointType !=
-        VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TYPE_MEMORY_ARM) {
-      ET_LOG(
-          Error,
-          "Expected VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TYPE_MEMORY_ARM");
-      return false;
-    }
-    if (bind_point_requirement.bindPoint !=
-        VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TRANSIENT_ARM) {
-      ET_LOG(
-          Error,
-          "Expected VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TRANSIENT_ARM");
+    // Allocate intermediates memory based on the pipeline requirements provided
+    // by the driver
+    VkDataGraphPipelineSessionBindPointRequirementsInfoARM
+        bind_point_requirements_info = {
+            .sType =
+                VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_REQUIREMENTS_INFO_ARM,
+            .pNext = nullptr,
+            .session = vk_session,
+        };
+
+    uint32_t bind_point_count = 0;
+    result = vkGetDataGraphPipelineSessionBindPointRequirementsARM(
+        vk_device, &bind_point_requirements_info, &bind_point_count, nullptr);
+    if (result != VK_SUCCESS) {
+      ET_LOG(Error, "Failed to get session bind point count");
       return false;
     }
-    if (bind_point_requirement.numObjects != 1) {
-      ET_LOG(Error, "Expected only one object for the bindpoint");
+
+    vector<VkDataGraphPipelineSessionBindPointRequirementARM>
+        bind_point_requirements;
+    bind_point_requirements.resize(bind_point_count);
+    result = vkGetDataGraphPipelineSessionBindPointRequirementsARM(
+        vk_device,
+        &bind_point_requirements_info,
+        &bind_point_count,
+        bind_point_requirements.data());
+    if (result != VK_SUCCESS) {
+      ET_LOG(Error, "Failed to get session bind point requirements");
       return false;
     }
 
-    VkDataGraphPipelineSessionMemoryRequirementsInfoARM memory_requirements_info = {
-        .sType =
-            VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_MEMORY_REQUIREMENTS_INFO_ARM,
-        .pNext = nullptr,
-        .session = vk_session,
-        .bindPoint = bind_point_requirement.bindPoint,
-        .objectIndex = 0, // NOTE: tied to numObjects assert above
-    };
-    VkMemoryRequirements2 memory_requirements = {
-        .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
-        .pNext = nullptr,
-    };
-    vkGetDataGraphPipelineSessionMemoryRequirementsARM(
-        vk_device, &memory_requirements_info, &memory_requirements);
+    // Given the bind points, just make individual allocations and bind them
+    for (const auto& bind_point_requirement : bind_point_requirements) {
+      // These are the only allowed type and bindpoint with the current spec
+      if (bind_point_requirement.bindPointType !=
+          VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TYPE_MEMORY_ARM) {
+        ET_LOG(
+            Error,
+            "Expected VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TYPE_MEMORY_ARM");
+        return false;
+      }
+      if (bind_point_requirement.bindPoint !=
+          VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TRANSIENT_ARM) {
+        ET_LOG(
+            Error,
+            "Expected VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TRANSIENT_ARM");
+        return false;
+      }
+      if (bind_point_requirement.numObjects != 1) {
+        ET_LOG(Error, "Expected only one object for the bindpoint");
+        return false;
+      }
 
-    VkMemoryPropertyFlags aims = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
-        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-        VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
-    uint32_t memory_index =
-        get_memory_index(vk_physical, memory_requirements, aims);
+      VkDataGraphPipelineSessionMemoryRequirementsInfoARM
+          memory_requirements_info = {
+              .sType =
+                  VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_MEMORY_REQUIREMENTS_INFO_ARM,
+              .pNext = nullptr,
+              .session = vk_session,
+              .bindPoint = bind_point_requirement.bindPoint,
+              .objectIndex = 0, // NOTE: tied to numObjects assert above
+          };
+      VkMemoryRequirements2 memory_requirements = {
+          .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
+          .pNext = nullptr,
+      };
+      vkGetDataGraphPipelineSessionMemoryRequirementsARM(
+          vk_device, &memory_requirements_info, &memory_requirements);
 
-    VkMemoryAllocateInfo memory_allocate_info = {
-        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
-        .pNext = nullptr,
-        .allocationSize = memory_requirements.memoryRequirements.size,
-        .memoryTypeIndex = memory_index,
-    };
+      VkMemoryPropertyFlags aims = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+          VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+      uint32_t memory_index =
+          get_memory_index(vk_physical, memory_requirements, aims);
 
-    VkDeviceMemory memory;
-    result =
-        vkAllocateMemory(vk_device, &memory_allocate_info, nullptr, &memory);
-    if (result != VK_SUCCESS) {
-      ET_LOG(Error, "Failed to allocate memory for intermediates");
-      return false;
+      VkMemoryAllocateInfo memory_allocate_info = {
+          .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+          .pNext = nullptr,
+          .allocationSize = memory_requirements.memoryRequirements.size,
+          .memoryTypeIndex = memory_index,
+      };
+
+      VkDeviceMemory memory;
+      result =
+          vkAllocateMemory(vk_device, &memory_allocate_info, nullptr, &memory);
+      if (result != VK_SUCCESS) {
+        ET_LOG(Error, "Failed to allocate memory for intermediates");
+        return false;
+      }
+      // so we can free this object in destructor
+      intermediates.push_back(memory);
+
+      VkBindDataGraphPipelineSessionMemoryInfoARM bind_info = {
+          .sType =
+              VK_STRUCTURE_TYPE_BIND_DATA_GRAPH_PIPELINE_SESSION_MEMORY_INFO_ARM,
+          .pNext = nullptr,
+          .session = vk_session,
+          .bindPoint = bind_point_requirement.bindPoint,
+          .objectIndex = 0, // NOTE: tied to numObjects assert above
+          .memory = memory,
+          .memoryOffset = 0,
+      };
+      result =
+          vkBindDataGraphPipelineSessionMemoryARM(vk_device, 1, &bind_info);
+      if (result != VK_SUCCESS) {
+        ET_LOG(Error, "Failed to bind intermediates memory");
+        return false;
+      }
     }
-    // so we can free this object in destructor
-    intermediates.push_back(memory);
+  }
 
-    VkBindDataGraphPipelineSessionMemoryInfoARM bind_info = {
-        .sType =
-            VK_STRUCTURE_TYPE_BIND_DATA_GRAPH_PIPELINE_SESSION_MEMORY_INFO_ARM,
-        .pNext = nullptr,
-        .session = vk_session,
-        .bindPoint = bind_point_requirement.bindPoint,
-        .objectIndex = 0, // NOTE: tied to numObjects assert above
-        .memory = memory,
-        .memoryOffset = 0,
-    };
-    result = vkBindDataGraphPipelineSessionMemoryARM(vk_device, 1, &bind_info);
-    if (result != VK_SUCCESS) {
-      ET_LOG(Error, "Failed to bind intermediates memory");
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_TIMESTAMP_QUERIES");
+
+    if (!init_timestamp_queries()) {
+      ET_LOG(Error, "Failed to initialize VGF timestamp queries");
       return false;
     }
   }
 
-  // Populate command once with our dispatch information
-  VkCommandBufferBeginInfo beginInfo{
-      VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO};
-  vkBeginCommandBuffer(vk_execute_cmd, &beginInfo);
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_RECORD_COMMAND_BUFFER");
+
+    // Populate command once with our dispatch information
+    VkCommandBufferBeginInfo beginInfo{
+        VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO};
+    vkBeginCommandBuffer(vk_execute_cmd, &beginInfo);
+
+    // Sync what will be the data coming in from host
+    VkMemoryBarrier2 barrier = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
+        .srcStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
+        .srcAccessMask = VK_ACCESS_2_HOST_WRITE_BIT,
+        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+        .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
+    };
+    VkDependencyInfo dependency_info = {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .memoryBarrierCount = 1,
+        .pMemoryBarriers = &barrier,
+    };
+    vkCmdPipelineBarrier2(vk_execute_cmd, &dependency_info);
+
+    // bind pipeline + descriptor set
+    vkCmdBindPipeline(
+        vk_execute_cmd, VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, vk_pipeline);
+
+    vkCmdBindDescriptorSets(
+        vk_execute_cmd,
+        VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM,
+        vk_pipeline_layout,
+        0, // first set
+        1,
+        descriptor_sets.data(), // descriptor set count + pointer
+        0,
+        nullptr // no dynamic offsets
+    );
+
+    // Dispatch the graph command
+    if (timestamp_queries_enabled &&
+        vk_timestamp_query_pool != VK_NULL_HANDLE) {
+      vkCmdResetQueryPool(vk_execute_cmd, vk_timestamp_query_pool, 0, 2);
+
+      if (vkCmdWriteTimestamp2) {
+        vkCmdWriteTimestamp2(
+            vk_execute_cmd,
+            VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+            vk_timestamp_query_pool,
+            0);
+      } else {
+        vkCmdWriteTimestamp(
+            vk_execute_cmd,
+            VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+            vk_timestamp_query_pool,
+            0);
+      }
+    }
 
-  // Sync what will be the data coming in from host
-  VkMemoryBarrier2 barrier = {
-      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
-      .srcStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
-      .srcAccessMask = VK_ACCESS_2_HOST_WRITE_BIT,
-      .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-      .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
-  };
-  VkDependencyInfo dependency_info = {
-      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
-      .memoryBarrierCount = 1,
-      .pMemoryBarriers = &barrier,
-  };
-  vkCmdPipelineBarrier2(vk_execute_cmd, &dependency_info);
-
-  // bind pipeline + descriptor set
-  vkCmdBindPipeline(
-      vk_execute_cmd, VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, vk_pipeline);
-
-  vkCmdBindDescriptorSets(
-      vk_execute_cmd,
-      VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM,
-      vk_pipeline_layout,
-      0, // first set
-      1,
-      descriptor_sets.data(), // descriptor set count + pointer
-      0,
-      nullptr // no dynamic offsets
-  );
-
-  // Dispatch the graph command
-  vkCmdDispatchDataGraphARM(vk_execute_cmd, vk_session, nullptr);
-
-  // Sync data back
-  VkMemoryBarrier2 barrier_2 = {
-      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
-      .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
-      .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT,
-      .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
-      .dstAccessMask = VK_ACCESS_2_HOST_READ_BIT,
-  };
-  VkDependencyInfo dependency_info_2 = {
-      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
-      .memoryBarrierCount = 1,
-      .pMemoryBarriers = &barrier_2,
-  };
-  vkCmdPipelineBarrier2(vk_execute_cmd, &dependency_info_2);
+    // Dispatch the graph command
+    vkCmdDispatchDataGraphARM(vk_execute_cmd, vk_session, nullptr);
+
+    if (timestamp_queries_enabled &&
+        vk_timestamp_query_pool != VK_NULL_HANDLE) {
+      if (vkCmdWriteTimestamp2) {
+        vkCmdWriteTimestamp2(
+            vk_execute_cmd,
+            VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+            vk_timestamp_query_pool,
+            1);
+      } else {
+        vkCmdWriteTimestamp(
+            vk_execute_cmd,
+            VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+            vk_timestamp_query_pool,
+            1);
+      }
+    }
 
-  // end the command buffer
-  vkEndCommandBuffer(vk_execute_cmd);
+    // Sync data back
+    VkMemoryBarrier2 barrier_2 = {
+        .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
+        .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+        .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT,
+        .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT,
+        .dstAccessMask = VK_ACCESS_2_HOST_READ_BIT,
+    };
+    VkDependencyInfo dependency_info_2 = {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .memoryBarrierCount = 1,
+        .pMemoryBarriers = &barrier_2,
+    };
+    vkCmdPipelineBarrier2(vk_execute_cmd, &dependency_info_2);
+
+    // end the command buffer
+    vkEndCommandBuffer(vk_execute_cmd);
+  }
 
   return true;
 }
 
-bool VgfRepr::execute_vgf() {
+bool VgfRepr::execute_vgf(executorch::runtime::EventTracer* event_tracer) {
   ET_LOG(Info, "Executing vgf");
 
-  // Submit & wait for idle
   VkSubmitInfo submit{VK_STRUCTURE_TYPE_SUBMIT_INFO};
   submit.commandBufferCount = 1;
   submit.pCommandBuffers = &vk_execute_cmd;
-  VkResult result = vkQueueSubmit(vk_queue, 1, &submit, VK_NULL_HANDLE);
+  // Submit and wait are profiled separately; result is checked after each.
+  VkResult result;
+
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_QUEUE_SUBMIT");
+    // NOTE(review): braces presumably end this range after the submit; confirm.
+    result = vkQueueSubmit(vk_queue, 1, &submit, VK_NULL_HANDLE);
+  }
+
   if (result != VK_SUCCESS) {
     ET_LOG(Error, "VGF/VkCommandBuffer command submission failed");
     return false;
   }
-  vkQueueWaitIdle(vk_queue);
+
+  {
+    VGF_PROFILE_SCOPE(event_tracer, "VGF_QUEUE_WAIT_IDLE");
+    // The wait result is now captured and checked instead of being ignored.
+    result = vkQueueWaitIdle(vk_queue);
+  }
+
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "VGF/VkQueue wait idle failed");
+    return false;
+  }
+  // NOTE(review): presumably a no-op when timestamp queries are off; confirm.
+  read_timestamp_queries(event_tracer);
 
   return true;
 }
 
 void VgfRepr::free_vgf() {
+  if (vk_timestamp_query_pool != VK_NULL_HANDLE) {
+    vkDestroyQueryPool(vk_device, vk_timestamp_query_pool, nullptr);
+    vk_timestamp_query_pool = VK_NULL_HANDLE;
+  }
+
   vkFreeCommandBuffers(vk_device, vk_command_pool, 1, &vk_execute_cmd);
   vkDestroyDataGraphPipelineSessionARM(vk_device, vk_session, nullptr);
   vkDestroyPipeline(vk_device, vk_pipeline, nullptr);
diff --git a/backends/arm/runtime/VGFSetup.h b/backends/arm/runtime/VGFSetup.h
index 8e07b36e303..a8b1173ec16 100644
--- a/backends/arm/runtime/VGFSetup.h
+++ b/backends/arm/runtime/VGFSetup.h
@@ -11,6 +11,7 @@
 using namespace std;
 
 #include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/event_tracer.h>
 
 using executorch::runtime::ArrayRef;
 using executorch::runtime::CompileSpec;
@@ -48,12 +49,14 @@ class VgfRepr {
       VkPhysicalDevice phys,
       VkDevice dev,
       VkQueue queue,
-      VkCommandPool pool)
+      VkCommandPool pool,
+      uint32_t queue_family_index = UINT32_MAX)
       : vk_instance(inst),
         vk_physical(phys),
         vk_device(dev),
         vk_queue(queue),
-        vk_command_pool(pool) {}
+        vk_command_pool(pool),
+        vk_queue_family_index(queue_family_index) {}
 
   /*
    * Process a VGF ready for execution, allocate necessary Vulkan objects.
@@ -61,13 +64,16 @@ class VgfRepr {
   bool process_vgf(
       const char* vgf_data,
       size_t vgf_size,
-      ArrayRef<CompileSpec> specs);
+      ArrayRef<CompileSpec> specs,
+      executorch::runtime::EventTracer* event_tracer = nullptr);
 
   /*
    * Execute the VGF we've previously processed.
+   * event_tracer defaults to nullptr; it is handed to the profiling scopes
+   * around queue submit / wait-idle and to the timestamp-query readback.
    */
-  bool execute_vgf();
+  bool execute_vgf(executorch::runtime::EventTracer* event_tracer = nullptr);
 
   /*
    * Free any allocations made in process_vgf.
    */
@@ -105,6 +108,19 @@ class VgfRepr {
   VkDevice vk_device;
   VkQueue vk_queue;
   VkCommandPool vk_command_pool;
+  // Queue family index passed to the constructor; stays UINT32_MAX when the
+  // caller does not supply one.
+  uint32_t vk_queue_family_index = UINT32_MAX;
+
+  // Timestamp-query profiling state. process_vgf only records timestamp
+  // writes around the data-graph dispatch when the flag is set and the pool
+  // is non-null; free_vgf destroys the pool.
+  bool timestamp_queries_enabled = false;
+  // NOTE(review): presumably cached device timestamp properties used to turn
+  // raw query values into nanoseconds; confirm in init_timestamp_queries.
+  uint32_t timestamp_valid_bits = 0;
+  double timestamp_period_ns = 0.0;
+  VkQueryPool vk_timestamp_query_pool = VK_NULL_HANDLE;
 
   // per-VgfRepr-instance objects allocated in process_vgf, used (can be more
   // than once) in execute_vgf
@@ -117,6 +126,12 @@ class VgfRepr {
   VkShaderModule vk_shader;
   // Note: the vector of tensor memory is stored in IOs above
   vector<VkDescriptorSet> descriptor_sets;
+
+  // Called from process_vgf before the command buffer is recorded; a false
+  // return makes process_vgf fail.
+  bool init_timestamp_queries();
+  // Called at the end of execute_vgf, after the queue wait has succeeded.
+  void read_timestamp_queries(executorch::runtime::EventTracer* event_tracer);
 };
 
 } // namespace vgf
diff --git a/backends/arm/scripts/etdump_to_chrome_trace.py b/backends/arm/scripts/etdump_to_chrome_trace.py
new file mode 100755
index 00000000000..252f26cc71f
--- /dev/null
+++ b/backends/arm/scripts/etdump_to_chrome_trace.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# The script reads profiling events from an ETDump file using the ExecuTorch
+# Inspector API, optionally enriches them with ETRecord metadata, and writes a
+# JSON trace that can be loaded in chrome://tracing or Perfetto. Each ExecuTorch
+# event block is represented as a Chrome trace thread, and each profiling sample
+# is emitted as a complete-duration event with timestamps and durations in
+# microseconds.
+#
+# Example:
+#   python backends/arm/scripts/etdump_to_chrome_trace.py \
+#     --etdump_path ./etdumps/vgf_timestamps.etdp \
+#     --output ./traces/vgf_timestamps_trace.json
+
+import argparse
+import json
+import os
+
+from executorch.devtools import Inspector
+from executorch.devtools.inspector import TimeScale
+
+
+def main() -> None:
+    """Convert the profiling events of an ETDump into Chrome trace JSON.
+
+    Command-line arguments select the ETDump to read, an optional ETRecord,
+    the output path and the time unit of the timestamps stored in the ETDump.
+    The parent directory of the output path is created if it is missing.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--etdump_path", required=True)
+    parser.add_argument("--etrecord_path", required=False, default=None)
+    parser.add_argument("--output", required=True)
+    parser.add_argument(
+        "--source_time_scale",
+        default="ns",
+        choices=[ts.value for ts in TimeScale],
+    )
+    args = parser.parse_args()
+
+    # Ask the Inspector for microseconds, the unit Chrome trace uses.
+    inspector = Inspector(
+        etdump_path=args.etdump_path,
+        etrecord=args.etrecord_path,
+        source_time_scale=TimeScale(args.source_time_scale),
+        target_time_scale=TimeScale.US,
+    )
+
+    trace_events = []
+
+    # Chrome trace uses microseconds for "ts" and "dur".
+    # This factor is applied to start times only; "cycles" is passed through
+    # unscaled. NOTE(review): assumes start_time is in source units - confirm.
+    source_to_us = {
+        "ns": 1.0 / 1000.0,
+        "us": 1.0,
+        "ms": 1000.0,
+        "s": 1000_000.0,
+        "cycles": 1.0,
+    }[args.source_time_scale]
+
+    for block_idx, event_block in enumerate(inspector.event_blocks):
+        tid_name = event_block.name
+
+        # Metadata event that labels this block's thread row in the viewer.
+        trace_events.append(
+            {
+                "name": "thread_name",
+                "ph": "M",
+                "pid": 1,
+                "tid": block_idx,
+                "args": {"name": tid_name},
+            }
+        )
+
+        for event in event_block.events:
+            # Events without timing information cannot be placed on a timeline.
+            if event.perf_data is None or event.start_time is None:
+                continue
+
+            durations_us = event.perf_data.raw
+            start_times = event.start_time
+
+            # One complete-duration ("X") event per recorded run of the event.
+            for iter_idx, (start_time, duration_us) in enumerate(
+                zip(start_times, durations_us)
+            ):
+                trace_events.append(
+                    {
+                        "name": event.name,
+                        "cat": event_block.name,
+                        "ph": "X",
+                        "ts": float(start_time) * source_to_us,
+                        "dur": float(duration_us),
+                        "pid": 1,
+                        "tid": block_idx,
+                        "args": {
+                            "event_block": event_block.name,
+                            "iteration": iter_idx,
+                            "is_delegated_op": event.is_delegated_op,
+                            "delegate_backend_name": event.delegate_backend_name,
+                            "op_types": event.op_types,
+                        },
+                    }
+                )
+
+    # The usage example writes to ./traces/, which may not exist yet; create
+    # the parent directory so open() does not raise FileNotFoundError.
+    output_dir = os.path.dirname(args.output)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    with open(args.output, "w") as f:
+        json.dump({"traceEvents": trace_events}, f)
+
+    print(f"Wrote Chrome trace JSON: {args.output}")
+    print(f"Events: {len(trace_events)}")
+
+
+if __name__ == "__main__":
+    main()