Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions backends/arm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,39 @@ List of model specific and optional passes:
- `graph_module = ToDevicePass("cpu")(graph_module).graph_module`
- backends/arm/test/misc/test_post_quant_device_switch.py

## Profiling of VGF Backend

VGF profiling now emits both host-side ExecuTorch event tracer ranges and Vulkan timestamp-query measurements. The host ranges split init into `VGF_INIT_*` phases, including `VGF_INIT_CREATE_DATA_GRAPH_PIPELINE`, and split execute into `VGF_COPY_INPUTS`, `VGF_QUEUE_SUBMIT`, `VGF_QUEUE_WAIT_IDLE`, `VGF_TIMESTAMP_QUERY_READBACK`, `VGF_DISPATCH_AND_WAIT`, and `VGF_COPY_OUTPUTS`. Vulkan timestamp queries are inserted into the recorded VGF command buffer around `vkCmdDispatchDataGraphARM()`, producing `VGF_DATA_GRAPH_DEVICE_TIME`, which measures device-side elapsed time for the submitted data-graph command buffer region. To collect a profile, build the VGF runner with event tracing enabled, run the model with an ETDump path, then convert the ETDump to Chrome trace JSON:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you planning to expand this to hardware performance counters like occupancy going beyond time measurements, and chrometrace.


```bash
mkdir -p etdumps traces

./cmake-out-vgf/executor_runner \
--model_path vgf_mobilenetv2_out/mobilenet_v2_vgf_int8.pte \
--num_executions 10 \
--etdump_path ./etdumps/vgf_timestamps.etdp \
--print_output none

python ./backends/arm/scripts/etdump_to_chrome_trace.py \
--etdump_path ./etdumps/vgf_timestamps.etdp \
--output ./etdumps/vgf_timestamps_trace.json
```

Open the result in Chrome by navigating to `chrome://tracing`, selecting **Load**, and choosing `./traces/vgf_timestamps_trace.json`. The key fields to inspect are `VGF_INIT_CREATE_DATA_GRAPH_PIPELINE` for pipeline creation/init cost, `VGF_QUEUE_SUBMIT` and `VGF_QUEUE_WAIT_IDLE` for host-side submission/wait overhead, and `VGF_DATA_GRAPH_DEVICE_TIME` for device-side data-graph execution time.

VGF profiling can emit optional Vulkan timestamp-query measurements. Vulkan timestamp queries are controlled by the `EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES` environment variable. Set it to `1` to insert timestamp queries into the recorded VGF command buffer around `vkCmdDispatchDataGraphARM()`. When enabled, the backend emits `VGF_DATA_GRAPH_DEVICE_TIME`, which measures device-side elapsed time for the submitted data-graph command buffer region. If `EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES` is unset or set to `0`, only host-side ExecuTorch event tracer ranges are collected and no Vulkan timestamp-query readback is performed. Note that the timestamp-query measurements will be printed out and not included into `.etdp`.

So, in this case the command is:

```bash
EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES=1 \
./cmake-out-vgf/executor_runner \
--model_path vgf_mobilenetv2_out/mobilenet_v2_vgf_int8.pte \
--num_executions 10 \
--etdump_path ./etdumps/vgf_timestamps.etdp \
--print_output none
```

## Help & Improvements

If you have problems or questions, or have suggestions for ways to improve the Arm backend, please reach out
Expand Down
163 changes: 156 additions & 7 deletions backends/arm/runtime/VGFBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,20 @@
*/

#include <cinttypes>
#include <list>
#include <numeric>

using namespace std;

#include <c10/util/safe_numerics.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>

#ifdef ET_EVENT_TRACER_ENABLED
#include <executorch/runtime/core/event_tracer_hooks_delegate.h>
#endif

using executorch::aten::Tensor;
using executorch::runtime::ArrayRef;
using executorch::runtime::Backend;
Expand All @@ -27,6 +34,13 @@ using executorch::runtime::MemoryAllocator;
using executorch::runtime::Result;
using executorch::runtime::Span;

#ifdef ET_EVENT_TRACER_ENABLED
using executorch::runtime::event_tracer_end_profiling_delegate;
using executorch::runtime::event_tracer_start_profiling_delegate;
using executorch::runtime::EventTracer;
using executorch::runtime::EventTracerEntry;
#endif

// We use the platform and runtime environment provided by the Vulkan delegate
#include <executorch/backends/vulkan/runtime/vk_api/vk_api.h>

Expand Down Expand Up @@ -69,7 +83,8 @@ VkResult vkml_allocate_basics(
VkPhysicalDevice* physical_device,
VkDevice* device,
VkQueue* queue,
VkCommandPool* command_pool);
VkCommandPool* command_pool,
uint32_t* queue_family_index);

void vkml_free_basics(
VkInstance* instance,
Expand Down Expand Up @@ -104,7 +119,8 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
&vk_physical_device,
&vk_device,
&vk_queue,
&vk_command_pool);
&vk_command_pool,
&vk_queue_family_index);
if (result != VK_SUCCESS) {
ET_LOG(
Error, "Failed to initialize the Vulkan device error 0x%08X", result);
Expand Down Expand Up @@ -142,8 +158,31 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
ArrayRef<CompileSpec> compile_specs) const override {
ET_LOG(Info, "Entered VGF init");

#ifdef ET_EVENT_TRACER_ENABLED
EventTracer* event_tracer = context.event_tracer();

EventTracerEntry init_total_event = event_tracer_start_profiling_delegate(
event_tracer,
"VGF_INIT_TOTAL",
/*delegate_debug_id=*/-1);

EventTracerEntry ensure_initialized_event =
event_tracer_start_profiling_delegate(
event_tracer,
"VGF_INIT_ENSURE_INITIALIZED",
/*delegate_debug_id=*/-1);
#endif

const_cast<VGFBackend*>(this)->ensure_initialized();

#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, ensure_initialized_event);
#endif

if (!is_initialized_) {
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, init_total_event);
#endif
ET_LOG(
Error,
"VGF backend is unavailable because Vulkan initialization failed");
Expand All @@ -152,30 +191,89 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {

const char* vgf_data = reinterpret_cast<const char*>(processed->data());

#ifdef ET_EVENT_TRACER_ENABLED
EventTracerEntry allocate_repr_event =
event_tracer_start_profiling_delegate(
event_tracer,
"VGF_INIT_ALLOCATE_REPR",
/*delegate_debug_id=*/-1);
#endif

MemoryAllocator* allocator = context.get_runtime_allocator();
VgfRepr* repr = allocator->allocateInstance<VgfRepr>();
new (repr) VgfRepr(
vk_instance, vk_physical_device, vk_device, vk_queue, vk_command_pool);
vk_instance,
vk_physical_device,
vk_device,
vk_queue,
vk_command_pool,
vk_queue_family_index);

#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, allocate_repr_event);

EventTracerEntry process_vgf_event = event_tracer_start_profiling_delegate(
event_tracer,
"VGF_INIT_PROCESS_VGF_BACKEND",
/*delegate_debug_id=*/-1);
#endif

#ifdef ET_EVENT_TRACER_ENABLED
auto valid_vgf = repr->process_vgf(
vgf_data, processed->size(), compile_specs, event_tracer);
#else
auto valid_vgf =
repr->process_vgf(vgf_data, processed->size(), compile_specs);
#endif

#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, process_vgf_event);
#endif

if (!valid_vgf) {
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, init_total_event);
Copy link
Copy Markdown
Contributor

@digantdesai digantdesai May 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you see nested profiling events leading to some noise? I.e. the time spent in recording the inner profiling event measured in the outer one? I understand this is needed for chrometrace etc. Just want to make sure this record overhead isn't too much.

#endif
ET_LOG(Error, "Failed to process VGF blob.");
return Error::Internal;
}

#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, init_total_event);
#endif

return repr;
}

Error execute(
ET_UNUSED BackendExecutionContext& context,
BackendExecutionContext& context,
DelegateHandle* handle,
Span<EValue*> args) const override {
VgfRepr* repr = static_cast<VgfRepr*>(handle);

#ifdef ET_EVENT_TRACER_ENABLED
EventTracer* event_tracer = context.event_tracer();

EventTracerEntry vgf_execute_event = event_tracer_start_profiling_delegate(
event_tracer,
"VGF_EXECUTE",
/*delegate_debug_id=*/-1);

EventTracerEntry copy_inputs_event = event_tracer_start_profiling_delegate(
event_tracer,
"VGF_COPY_INPUTS",
/*delegate_debug_id=*/-1);
#else
(void)context;
#endif

// Copy all inputs from EValue to VkDeviceMemory
for (int i = 0; i < repr->IOs.size(); i++) {
if (!args[i]->isTensor()) {
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event);
event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
#endif
ET_LOG(
Error,
"Expected EValue %d to be tensor, got %d",
Expand Down Expand Up @@ -206,22 +304,59 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {

void* data;
if (!repr->map_io(io, &data)) {
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event);
event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
#endif
ET_LOG(Error, "Failed to map Vulkan IO memory");
return Error::Internal;
}
memcpy(data, tensor->mutable_data_ptr(), io_size);
repr->unmap_io(io);
}

#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event);

EventTracerEntry dispatch_event = event_tracer_start_profiling_delegate(
event_tracer,
"VGF_DISPATCH_AND_WAIT",
/*delegate_debug_id=*/-1);
#endif

// Execute the workload
if (!repr->execute_vgf()) {
bool execute_ok = false;
#ifdef ET_EVENT_TRACER_ENABLED
execute_ok = repr->execute_vgf(event_tracer);
#else
execute_ok = repr->execute_vgf();
#endif

if (!execute_ok) {
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, dispatch_event);
event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
#endif
ET_LOG(Error, "Failed to execute the VGF representation");
return Error::Internal;
}

#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, dispatch_event);

EventTracerEntry copy_outputs_event = event_tracer_start_profiling_delegate(
event_tracer,
"VGF_COPY_OUTPUTS",
/*delegate_debug_id=*/-1);
#endif

// Copy all outputs from VKDeviceMemory to EValue
for (int i = 0; i < repr->IOs.size(); i++) {
if (!args[i]->isTensor()) {
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event);
event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
#endif
ET_LOG(
Error,
"Expected EValue %d to be tensor, got %d",
Expand Down Expand Up @@ -251,13 +386,22 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {

void* data;
if (!repr->map_io(io, &data)) {
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event);
event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
#endif
ET_LOG(Error, "Failed to map Vulkan IO memory");
return Error::Internal;
}
memcpy(tensor->mutable_data_ptr(), data, io_size);
repr->unmap_io(io);
}

#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event);
event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event);
#endif

return Error::Ok;
}

Expand All @@ -272,6 +416,7 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface {
VkDevice vk_device = VK_NULL_HANDLE;
VkQueue vk_queue = VK_NULL_HANDLE;
VkCommandPool vk_command_pool = VK_NULL_HANDLE;
uint32_t vk_queue_family_index = UINT32_MAX;
bool is_initialized_ = false;
};

Expand All @@ -286,7 +431,8 @@ VkResult vkml_allocate_basics(
VkPhysicalDevice* physical_device,
VkDevice* device,
VkQueue* queue,
VkCommandPool* command_pool) {
VkCommandPool* command_pool,
uint32_t* queue_family_index) {
VkResult result;

if (VK_SUCCESS != volkInitialize()) {
Expand Down Expand Up @@ -408,6 +554,9 @@ VkResult vkml_allocate_basics(
ET_LOG(Error, "Failed to find suitable queue");
return VK_ERROR_UNKNOWN;
}
if (queue_family_index != nullptr) {
*queue_family_index = qf;
}

// Device with ML tensor extension
float qp = 1.0f;
Expand Down Expand Up @@ -544,4 +693,4 @@ VkResult vkml_allocate_basics(

} // namespace vgf
} // namespace backends
} // namespace executorch
} // namespace executorch
Loading
Loading