diff --git a/backends/arm/README.md b/backends/arm/README.md index e9afa5a928d..965566d1458 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -375,6 +375,39 @@ List of model specific and optional passes: - `graph_module = ToDevicePass("cpu")(graph_module).graph_module` - backends/arm/test/misc/test_post_quant_device_switch.py +## Profiling of VGF Backend + +VGF profiling emits host-side ExecuTorch event tracer ranges and, optionally, Vulkan timestamp-query measurements. The host ranges split init into `VGF_INIT_*` phases, including `VGF_INIT_CREATE_DATA_GRAPH_PIPELINE`, and split execute into `VGF_COPY_INPUTS`, `VGF_QUEUE_SUBMIT`, `VGF_QUEUE_WAIT_IDLE`, `VGF_TIMESTAMP_QUERY_READBACK`, `VGF_DISPATCH_AND_WAIT`, and `VGF_COPY_OUTPUTS`. When enabled, Vulkan timestamp queries are inserted into the recorded VGF command buffer around `vkCmdDispatchDataGraphARM()`, producing `VGF_DATA_GRAPH_DEVICE_TIME`, which measures device-side elapsed time for the submitted data-graph command buffer region. To collect a profile, build the VGF runner with event tracing enabled, run the model with an ETDump path, then convert the ETDump to Chrome trace JSON: + +```bash +mkdir -p etdumps traces + +./cmake-out-vgf/executor_runner \ + --model_path vgf_mobilenetv2_out/mobilenet_v2_vgf_int8.pte \ + --num_executions 10 \ + --etdump_path ./etdumps/vgf_timestamps.etdp \ + --print_output none + +python ./backends/arm/scripts/etdump_to_chrome_trace.py \ + --etdump_path ./etdumps/vgf_timestamps.etdp \ + --output ./traces/vgf_timestamps_trace.json +``` + +Open the result in Chrome by navigating to `chrome://tracing`, selecting **Load**, and choosing `./traces/vgf_timestamps_trace.json`. The key events to inspect in the trace are `VGF_INIT_CREATE_DATA_GRAPH_PIPELINE` for pipeline creation/init cost, and `VGF_QUEUE_SUBMIT` and `VGF_QUEUE_WAIT_IDLE` for host-side submission/wait overhead. Device-side data-graph execution time is reported as `VGF_DATA_GRAPH_DEVICE_TIME` in the runner's log output rather than in the trace. 
+ +VGF profiling can optionally emit Vulkan timestamp-query measurements. Vulkan timestamp queries are controlled by the `EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES` environment variable. Set it to `1` to insert timestamp queries into the recorded VGF command buffer around `vkCmdDispatchDataGraphARM()`. When enabled, the backend emits `VGF_DATA_GRAPH_DEVICE_TIME`, which measures device-side elapsed time for the submitted data-graph command buffer region. If `EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES` is unset or set to `0`, only host-side ExecuTorch event tracer ranges are collected and no Vulkan timestamp-query readback is performed. Note that the timestamp-query measurements are printed to the log output and are not included in the `.etdp` file. + +To collect a profile with timestamp queries enabled, run: + +```bash +EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES=1 \ +./cmake-out-vgf/executor_runner \ + --model_path vgf_mobilenetv2_out/mobilenet_v2_vgf_int8.pte \ + --num_executions 10 \ + --etdump_path ./etdumps/vgf_timestamps.etdp \ + --print_output none +``` + ## Help & Improvements If you have problems or questions, or have suggestions for ways to improve the Arm backend, please reach out diff --git a/backends/arm/runtime/VGFBackend.cpp b/backends/arm/runtime/VGFBackend.cpp index 8ac804f7744..f6a7002d862 100644 --- a/backends/arm/runtime/VGFBackend.cpp +++ b/backends/arm/runtime/VGFBackend.cpp @@ -6,6 +6,9 @@ */ #include +#include +#include + using namespace std; #include @@ -13,6 +16,10 @@ using namespace std; #include #include +#ifdef ET_EVENT_TRACER_ENABLED +#include +#endif + using executorch::aten::Tensor; using executorch::runtime::ArrayRef; using executorch::runtime::Backend; @@ -27,6 +34,13 @@ using executorch::runtime::MemoryAllocator; using executorch::runtime::Result; using executorch::runtime::Span; +#ifdef ET_EVENT_TRACER_ENABLED +using executorch::runtime::event_tracer_end_profiling_delegate; +using executorch::runtime::event_tracer_start_profiling_delegate; +using executorch::runtime::EventTracer; +using 
executorch::runtime::EventTracerEntry; +#endif + // We use the platform and runtime environment provided by the Vulkan delegate #include @@ -69,7 +83,8 @@ VkResult vkml_allocate_basics( VkPhysicalDevice* physical_device, VkDevice* device, VkQueue* queue, - VkCommandPool* command_pool); + VkCommandPool* command_pool, + uint32_t* queue_family_index); void vkml_free_basics( VkInstance* instance, @@ -104,7 +119,8 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { &vk_physical_device, &vk_device, &vk_queue, - &vk_command_pool); + &vk_command_pool, + &vk_queue_family_index); if (result != VK_SUCCESS) { ET_LOG( Error, "Failed to initialize the Vulkan device error 0x%08X", result); @@ -142,8 +158,31 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { ArrayRef compile_specs) const override { ET_LOG(Info, "Entered VGF init"); +#ifdef ET_EVENT_TRACER_ENABLED + EventTracer* event_tracer = context.event_tracer(); + + EventTracerEntry init_total_event = event_tracer_start_profiling_delegate( + event_tracer, + "VGF_INIT_TOTAL", + /*delegate_debug_id=*/-1); + + EventTracerEntry ensure_initialized_event = + event_tracer_start_profiling_delegate( + event_tracer, + "VGF_INIT_ENSURE_INITIALIZED", + /*delegate_debug_id=*/-1); +#endif + const_cast(this)->ensure_initialized(); + +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, ensure_initialized_event); +#endif + if (!is_initialized_) { +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, init_total_event); +#endif ET_LOG( Error, "VGF backend is unavailable because Vulkan initialization failed"); @@ -152,30 +191,89 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { const char* vgf_data = reinterpret_cast(processed->data()); +#ifdef ET_EVENT_TRACER_ENABLED + EventTracerEntry allocate_repr_event = + event_tracer_start_profiling_delegate( + event_tracer, + "VGF_INIT_ALLOCATE_REPR", + 
/*delegate_debug_id=*/-1); +#endif + MemoryAllocator* allocator = context.get_runtime_allocator(); VgfRepr* repr = allocator->allocateInstance(); new (repr) VgfRepr( - vk_instance, vk_physical_device, vk_device, vk_queue, vk_command_pool); + vk_instance, + vk_physical_device, + vk_device, + vk_queue, + vk_command_pool, + vk_queue_family_index); + +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, allocate_repr_event); + + EventTracerEntry process_vgf_event = event_tracer_start_profiling_delegate( + event_tracer, + "VGF_INIT_PROCESS_VGF_BACKEND", + /*delegate_debug_id=*/-1); +#endif +#ifdef ET_EVENT_TRACER_ENABLED + auto valid_vgf = repr->process_vgf( + vgf_data, processed->size(), compile_specs, event_tracer); +#else auto valid_vgf = repr->process_vgf(vgf_data, processed->size(), compile_specs); +#endif + +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, process_vgf_event); +#endif + if (!valid_vgf) { +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, init_total_event); +#endif ET_LOG(Error, "Failed to process VGF blob."); return Error::Internal; } +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, init_total_event); +#endif + return repr; } Error execute( - ET_UNUSED BackendExecutionContext& context, + BackendExecutionContext& context, DelegateHandle* handle, Span args) const override { VgfRepr* repr = static_cast(handle); +#ifdef ET_EVENT_TRACER_ENABLED + EventTracer* event_tracer = context.event_tracer(); + + EventTracerEntry vgf_execute_event = event_tracer_start_profiling_delegate( + event_tracer, + "VGF_EXECUTE", + /*delegate_debug_id=*/-1); + + EventTracerEntry copy_inputs_event = event_tracer_start_profiling_delegate( + event_tracer, + "VGF_COPY_INPUTS", + /*delegate_debug_id=*/-1); +#else + (void)context; +#endif + // Copy all inputs from EValue to VkDeviceMemory for (int i = 0; i < repr->IOs.size(); i++) { if 
(!args[i]->isTensor()) { +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event); + event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); +#endif ET_LOG( Error, "Expected EValue %d to be tensor, got %d", @@ -206,6 +304,10 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { void* data; if (!repr->map_io(io, &data)) { +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event); + event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); +#endif ET_LOG(Error, "Failed to map Vulkan IO memory"); return Error::Internal; } @@ -213,15 +315,48 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { repr->unmap_io(io); } +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event); + + EventTracerEntry dispatch_event = event_tracer_start_profiling_delegate( + event_tracer, + "VGF_DISPATCH_AND_WAIT", + /*delegate_debug_id=*/-1); +#endif + // Execute the workload - if (!repr->execute_vgf()) { + bool execute_ok = false; +#ifdef ET_EVENT_TRACER_ENABLED + execute_ok = repr->execute_vgf(event_tracer); +#else + execute_ok = repr->execute_vgf(); +#endif + + if (!execute_ok) { +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, dispatch_event); + event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); +#endif ET_LOG(Error, "Failed to execute the VGF representation"); return Error::Internal; } +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, dispatch_event); + + EventTracerEntry copy_outputs_event = event_tracer_start_profiling_delegate( + event_tracer, + "VGF_COPY_OUTPUTS", + /*delegate_debug_id=*/-1); +#endif + // Copy all outputs from VKDeviceMemory to EValue for (int i = 0; i < repr->IOs.size(); i++) { if (!args[i]->isTensor()) { +#ifdef ET_EVENT_TRACER_ENABLED + 
event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event); + event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); +#endif ET_LOG( Error, "Expected EValue %d to be tensor, got %d", @@ -251,6 +386,10 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { void* data; if (!repr->map_io(io, &data)) { +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event); + event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); +#endif ET_LOG(Error, "Failed to map Vulkan IO memory"); return Error::Internal; } @@ -258,6 +397,11 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { repr->unmap_io(io); } +#ifdef ET_EVENT_TRACER_ENABLED + event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event); + event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); +#endif + return Error::Ok; } @@ -272,6 +416,7 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { VkDevice vk_device = VK_NULL_HANDLE; VkQueue vk_queue = VK_NULL_HANDLE; VkCommandPool vk_command_pool = VK_NULL_HANDLE; + uint32_t vk_queue_family_index = UINT32_MAX; bool is_initialized_ = false; }; @@ -286,7 +431,8 @@ VkResult vkml_allocate_basics( VkPhysicalDevice* physical_device, VkDevice* device, VkQueue* queue, - VkCommandPool* command_pool) { + VkCommandPool* command_pool, + uint32_t* queue_family_index) { VkResult result; if (VK_SUCCESS != volkInitialize()) { @@ -408,6 +554,9 @@ VkResult vkml_allocate_basics( ET_LOG(Error, "Failed to find suitable queue"); return VK_ERROR_UNKNOWN; } + if (queue_family_index != nullptr) { + *queue_family_index = qf; + } // Device with ML tensor extension float qp = 1.0f; diff --git a/backends/arm/runtime/VGFSetup.cpp 
b/backends/arm/runtime/VGFSetup.cpp index b62a6b2ec23..3e62ce1735f 100644 --- a/backends/arm/runtime/VGFSetup.cpp +++ b/backends/arm/runtime/VGFSetup.cpp @@ -12,6 +12,13 @@ #include +#include +#include + +#ifdef ET_EVENT_TRACER_ENABLED +#include +#endif + #include #include @@ -31,8 +38,41 @@ namespace { constexpr int64_t kScalarSentinelDimension = 1; } -#if defined(ET_ARM_VGF_DEBUG) +#ifdef ET_EVENT_TRACER_ENABLED +namespace { +class ScopedVgfProfileEvent { + public: + ScopedVgfProfileEvent( + executorch::runtime::EventTracer* event_tracer, + const char* name) + : event_tracer_(event_tracer), + entry_(executorch::runtime::event_tracer_start_profiling_delegate( + event_tracer_, + name, + /*delegate_debug_id=*/-1)) {} + + ~ScopedVgfProfileEvent() { + executorch::runtime::event_tracer_end_profiling_delegate( + event_tracer_, entry_); + } + + private: + executorch::runtime::EventTracer* event_tracer_; + executorch::runtime::EventTracerEntry entry_; +}; +} // namespace + +#define VGF_CONCAT_INNER(a, b) a##b +#define VGF_CONCAT(a, b) VGF_CONCAT_INNER(a, b) +#define VGF_PROFILE_SCOPE(event_tracer, name) \ + ScopedVgfProfileEvent VGF_CONCAT(_vgf_profile_scope_, __LINE__)( \ + event_tracer, name) +#else +#define VGF_PROFILE_SCOPE(event_tracer, name) (void)(event_tracer) +#endif + // Debug function to inspect memory properties +#if defined(ET_ARM_VGF_DEBUG) static string memory_flags_to_string(VkMemoryPropertyFlags flags) { if (flags == 0) return "0"; @@ -102,6 +142,153 @@ uint32_t get_memory_index( return memory_type; } +bool VgfRepr::init_timestamp_queries() { + const char* enable = std::getenv("EXECUTORCH_VGF_ENABLE_TIMESTAMP_QUERIES"); + if (enable == nullptr || enable[0] == '\0' || (enable[0] == '0' && enable[1] == '\0')) { /* unset, empty, or "0" all mean disabled, matching the README */ + ET_LOG(Info, "VGF timestamp queries disabled"); + return true; + } + + if (timestamp_queries_enabled || vk_timestamp_query_pool != VK_NULL_HANDLE) { + return true; + } + + if (vk_queue_family_index == UINT32_MAX) { + ET_LOG(Info, "VGF timestamp queries disabled: unknown queue family 
index"); + return true; + } + + uint32_t queue_family_count = 0; + vkGetPhysicalDeviceQueueFamilyProperties( + vk_physical, &queue_family_count, nullptr); + + if (vk_queue_family_index >= queue_family_count) { + ET_LOG( + Info, + "VGF timestamp queries disabled: queue family index %u is out of range", + vk_queue_family_index); + return true; + } + + vector queue_family_properties(queue_family_count); + vkGetPhysicalDeviceQueueFamilyProperties( + vk_physical, &queue_family_count, queue_family_properties.data()); + + timestamp_valid_bits = + queue_family_properties[vk_queue_family_index].timestampValidBits; + + if (timestamp_valid_bits == 0) { + ET_LOG( + Info, + "VGF timestamp queries disabled: queue family %u does not support timestamps", + vk_queue_family_index); + return true; + } + + VkPhysicalDeviceProperties physical_device_properties; + vkGetPhysicalDeviceProperties(vk_physical, &physical_device_properties); + + timestamp_period_ns = + static_cast(physical_device_properties.limits.timestampPeriod); + + if (timestamp_period_ns <= 0.0) { + ET_LOG( + Info, + "VGF timestampPeriod is %.6f; using fallback 52.0 ns/tick", + timestamp_period_ns); + timestamp_period_ns = 52.0; + } + + VkQueryPoolCreateInfo query_pool_info{ + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = 2, + .pipelineStatistics = 0, + }; + + VkResult result = vkCreateQueryPool( + vk_device, &query_pool_info, nullptr, &vk_timestamp_query_pool); + + if (result != VK_SUCCESS) { + ET_LOG( + Info, + "VGF timestamp queries disabled: vkCreateQueryPool failed with %d", + result); + vk_timestamp_query_pool = VK_NULL_HANDLE; + return true; + } + + timestamp_queries_enabled = true; + + ET_LOG( + Info, + "VGF timestamp queries enabled: queue_family=%u valid_bits=%u period_ns=%.6f", + vk_queue_family_index, + timestamp_valid_bits, + timestamp_period_ns); + + return true; +} + +void VgfRepr::read_timestamp_queries( + 
executorch::runtime::EventTracer* event_tracer) { + if (!timestamp_queries_enabled || vk_timestamp_query_pool == VK_NULL_HANDLE) { + return; + } + + uint64_t timestamps[2] = {0, 0}; + VkResult result; + + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_TIMESTAMP_QUERY_READBACK"); + + result = vkGetQueryPoolResults( + vk_device, + vk_timestamp_query_pool, + 0, + 2, + sizeof(timestamps), + timestamps, + sizeof(uint64_t), + VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + } + + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to read VGF timestamp query results: %d", result); + return; + } + + uint64_t start = timestamps[0]; + uint64_t end = timestamps[1]; + + uint64_t mask = std::numeric_limits::max(); + if (timestamp_valid_bits < 64) { + mask = (1ULL << timestamp_valid_bits) - 1ULL; + start &= mask; + end &= mask; + } + + uint64_t delta_ticks; + if (end >= start) { + delta_ticks = end - start; + } else { + delta_ticks = (mask - start) + end + 1ULL; + } + + const double duration_ns = + static_cast(delta_ticks) * timestamp_period_ns; + const double duration_ms = duration_ns / 1000000.0; + + ET_LOG( + Info, + "VGF_DATA_GRAPH_DEVICE_TIME ticks=%llu duration_ns=%.3f duration_ms=%.6f", + static_cast(delta_ticks), + duration_ns, + duration_ms); +} + /** * Tensor allocation helper function */ @@ -339,41 +526,51 @@ static void debug_print_modules( bool VgfRepr::process_vgf( const char* vgf_data, size_t vgf_size, - ArrayRef specs) { + ArrayRef specs, + executorch::runtime::EventTracer* event_tracer) { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_PROCESS_VGF"); + (void)specs; + ET_LOG(Info, "Preparing VGF as Vulkan objects"); VkResult result; - // Prepare temporary decoders - unique_ptr header_decoder = - vgflib::CreateHeaderDecoder(vgf_data, vgflib::HeaderSize(), vgf_size); - if (!header_decoder) { - ET_LOG(Error, "Failed to create VGF header decoder"); - return false; - } + unique_ptr header_decoder; + unique_ptr sequence_decoder; + unique_ptr module_decoder; + unique_ptr 
resource_decoder; + unique_ptr constant_decoder; - unique_ptr sequence_decoder = - vgflib::CreateModelSequenceTableDecoder( - vgf_data + header_decoder->GetModelSequenceTableOffset(), - header_decoder->GetModelSequenceTableSize()); - unique_ptr module_decoder = - vgflib::CreateModuleTableDecoder( - vgf_data + header_decoder->GetModuleTableOffset(), - header_decoder->GetModuleTableSize()); - unique_ptr resource_decoder = - vgflib::CreateModelResourceTableDecoder( - vgf_data + header_decoder->GetModelResourceTableOffset(), - header_decoder->GetModelResourceTableSize()); - unique_ptr constant_decoder = - vgflib::CreateConstantDecoder( - vgf_data + header_decoder->GetConstantsOffset(), - header_decoder->GetConstantsSize()); - // Check the VGF decoders - if (not(header_decoder && module_decoder && sequence_decoder && - resource_decoder && constant_decoder && header_decoder->IsValid() && - header_decoder->CheckVersion())) { - ET_LOG(Error, "Failed to process VGF file internalsr"); - return false; + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_DECODE_TABLES"); + + // Prepare temporary decoders + header_decoder = + vgflib::CreateHeaderDecoder(vgf_data, vgflib::HeaderSize(), vgf_size); + if (!header_decoder) { + ET_LOG(Error, "Failed to create VGF header decoder"); + return false; + } + + sequence_decoder = vgflib::CreateModelSequenceTableDecoder( + vgf_data + header_decoder->GetModelSequenceTableOffset(), + header_decoder->GetModelSequenceTableSize()); + module_decoder = vgflib::CreateModuleTableDecoder( + vgf_data + header_decoder->GetModuleTableOffset(), + header_decoder->GetModuleTableSize()); + resource_decoder = vgflib::CreateModelResourceTableDecoder( + vgf_data + header_decoder->GetModelResourceTableOffset(), + header_decoder->GetModelResourceTableSize()); + constant_decoder = vgflib::CreateConstantDecoder( + vgf_data + header_decoder->GetConstantsOffset(), + header_decoder->GetConstantsSize()); + // Check the VGF decoders + if (not(header_decoder && module_decoder 
&& sequence_decoder && + resource_decoder && constant_decoder && header_decoder->IsValid() && + header_decoder->CheckVersion())) { + ET_LOG(Error, "Failed to process VGF file internalsr"); + return false; + } } // Parse the sequences in the VGF (while there can be multiple sequences of @@ -381,22 +578,27 @@ bool VgfRepr::process_vgf( // GRAPH segment to be present. const int segment_id = 0; - debug_print_sequence(sequence_decoder); + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_PARSE_SEQUENCE_AND_MODULE"); + + debug_print_sequence(sequence_decoder); #if defined(ET_ARM_VGF_DEBUG) - debug_print_resources(resource_decoder); + debug_print_resources(resource_decoder); #endif - if (sequence_decoder->modelSequenceTableSize() != 1) { - ET_LOG(Error, "Expected sequence length 1"); - return false; - } - if (sequence_decoder->getSegmentType(segment_id) != - vgflib::ModuleType::GRAPH) { - ET_LOG(Error, "Expected segment to be of type GRAPH"); - return false; + if (sequence_decoder->modelSequenceTableSize() != 1) { + ET_LOG(Error, "Expected sequence length 1"); + return false; + } + if (sequence_decoder->getSegmentType(segment_id) != + vgflib::ModuleType::GRAPH) { + ET_LOG(Error, "Expected segment to be of type GRAPH"); + return false; + } + + // Extract first segment and its associated module + debug_print_modules(module_decoder); } - // Extract first segment and it's associated module - debug_print_modules(module_decoder); auto segment_name = string(sequence_decoder->getSegmentName(segment_id)); auto segment_module = sequence_decoder->getSegmentModuleIndex(segment_id); @@ -405,18 +607,22 @@ bool VgfRepr::process_vgf( string(module_decoder->getModuleEntryPoint(segment_module)); auto segment_m_spirv = module_decoder->getModuleCode(segment_module); - // Build a shader from the module - VkShaderModuleCreateInfo smci{ - .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .codeSize = segment_m_spirv.size() * sizeof(uint32_t), - .pCode = 
segment_m_spirv.begin(), - }; - result = vkCreateShaderModule(vk_device, &smci, nullptr, &vk_shader); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to load shader from segment %d", segment_module); - return false; + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_SHADER_MODULE"); + + // Build a shader from the module + VkShaderModuleCreateInfo smci{ + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .codeSize = segment_m_spirv.size() * sizeof(uint32_t), + .pCode = segment_m_spirv.begin(), + }; + result = vkCreateShaderModule(vk_device, &smci, nullptr, &vk_shader); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to load shader from segment %d", segment_module); + return false; + } } // Record our shader and entrypoint string @@ -428,538 +634,674 @@ bool VgfRepr::process_vgf( vector> resources; vector constants; - int IO_count = resource_decoder->size(); - for (int i = 0; i < IO_count; i++) { - auto resource_type = resource_decoder->getDescriptorType(i).value_or(0); - auto resource_format = vgflib::ToVkFormat(resource_decoder->getVkFormat(i)); - - // Get tensor shape and strides - auto shape = resource_decoder->getTensorShape(i); - auto stride = resource_decoder->getTensorStride(i); - const auto shape_size = shape.size(); - - switch (resource_decoder->getCategory(i)) { - case vgflib::ResourceCategory::INPUT: - case vgflib::ResourceCategory::OUTPUT: { - // Expect IO to be a tensor type - if (resource_type != VK_DESCRIPTOR_TYPE_TENSOR_ARM) { - ET_LOG( - Error, - "Expected tensor type descriptor %u got %u", - VK_DESCRIPTOR_TYPE_TENSOR_ARM, - resource_type); - return false; + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_RESOURCE_TABLE"); + + int IO_count = resource_decoder->size(); + for (int i = 0; i < IO_count; i++) { + auto resource_type = resource_decoder->getDescriptorType(i).value_or(0); + auto resource_format = + vgflib::ToVkFormat(resource_decoder->getVkFormat(i)); + + // Get tensor shape and strides + 
auto shape = resource_decoder->getTensorShape(i); + auto stride = resource_decoder->getTensorStride(i); + const auto shape_size = shape.size(); + + switch (resource_decoder->getCategory(i)) { + case vgflib::ResourceCategory::INPUT: + case vgflib::ResourceCategory::OUTPUT: { + // Expect IO to be a tensor type + if (resource_type != VK_DESCRIPTOR_TYPE_TENSOR_ARM) { + ET_LOG( + Error, + "Expected tensor type descriptor %u got %u", + VK_DESCRIPTOR_TYPE_TENSOR_ARM, + resource_type); + return false; + } + + // Allocate a tensor with backing memory + VkTensorARM tensor; + VkTensorViewARM tensor_view; + VkDeviceMemory tensor_memory; + VkTensorDescriptionARM tensor_description; + + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_ALLOCATE_IO_TENSOR"); + + result = allocate_tensor( + vk_physical, + vk_device, + resource_format, + shape_size == 0 ? 1 : static_cast(shape_size), + shape_size == 0 ? &kScalarSentinelDimension : shape.begin(), + static_cast(stride.size()), + stride.begin(), + &tensor_description, + &tensor_view, + &tensor, + &tensor_memory); + } + + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to allocate tensor for VGF resource %d", i); + return false; + } + size_t e_size = get_format_size(resource_format); + if (0 == e_size) { + ET_LOG(Error, "failed to get element size of VkFormat"); + return false; + } + + bool is_in = resource_decoder->getCategory(i) == + vgflib::ResourceCategory::INPUT; + IOs.push_back( + IO{vector(shape.begin(), shape.end()), + vector(stride.begin(), stride.end()), + e_size, + tensor, + tensor_view, + tensor_memory, + is_in}); + resources.push_back({tensor, tensor_view}); + descriptors.push_back(tensor_description); + break; } - - // Allocate a tensor with backing memory - VkTensorARM tensor; - VkTensorViewARM tensor_view; - VkDeviceMemory tensor_memory; - VkTensorDescriptionARM tensor_description; - result = allocate_tensor( - vk_physical, - vk_device, - resource_format, - shape_size == 0 ? 
1 : static_cast(shape_size), - shape_size == 0 ? &kScalarSentinelDimension : shape.begin(), - static_cast(stride.size()), - stride.begin(), - &tensor_description, - &tensor_view, - &tensor, - &tensor_memory); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to allocate tensor for VGF resource %d", i); + case vgflib::ResourceCategory::CONSTANT: + // Constants just need a descriptor + descriptors.push_back(VkTensorDescriptionARM{ + .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM, + .pNext = nullptr, + .tiling = VK_TENSOR_TILING_LINEAR_ARM, + .format = resource_format, + .dimensionCount = + shape_size == 0 ? 1 : static_cast(shape_size), + .pDimensions = + shape_size == 0 ? &kScalarSentinelDimension : shape.begin(), + // Note: stride_data of 0's causes size==0, null means + // stride==size + .pStrides = (0 == stride.size() ? nullptr : stride.begin()), + .usage = VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM, + }); + break; + case vgflib::ResourceCategory::INTERMEDIATE: + ET_LOG(Error, "Unsupported resource category INTERMEDIATE"); return false; - } - size_t e_size = get_format_size(resource_format); - if (0 == e_size) { - ET_LOG(Error, "failed to get element size of VkFormat"); + default: + ET_LOG(Info, "Unsupported resource category UNKNOWN"); return false; - } - - bool is_in = - resource_decoder->getCategory(i) == vgflib::ResourceCategory::INPUT; - IOs.push_back( - IO{vector(shape.begin(), shape.end()), - vector(stride.begin(), stride.end()), - e_size, - tensor, - tensor_view, - tensor_memory, - is_in}); - resources.push_back({tensor, tensor_view}); - descriptors.push_back(tensor_description); - break; } - case vgflib::ResourceCategory::CONSTANT: - // Constants just need a descriptor - descriptors.push_back(VkTensorDescriptionARM{ - .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM, - .pNext = nullptr, - .tiling = VK_TENSOR_TILING_LINEAR_ARM, - .format = resource_format, - .dimensionCount = - shape_size == 0 ? 
1 : static_cast(shape_size), - .pDimensions = - shape_size == 0 ? &kScalarSentinelDimension : shape.begin(), - // Note: stride_data of 0's causes size==0, null means stride==size - .pStrides = (0 == stride.size() ? nullptr : stride.begin()), - .usage = VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM, - }); - break; - case vgflib::ResourceCategory::INTERMEDIATE: - ET_LOG(Error, "Unsupported resource category INTERMEDIATE"); - return false; - default: - ET_LOG(Info, "Unsupported resource category UNKNOWN"); - return false; } } - // Constants table - mapping of shader bindings to MRT's and their descriptors - auto constant_indexes = - sequence_decoder->getSegmentConstantIndexes(segment_id); - for (uint32_t i : constant_indexes) { - auto mrt_i = constant_decoder->getConstantMrtIndex(i); - auto constant_data = constant_decoder->getConstant(i); - constants.push_back(VkDataGraphPipelineConstantARM{ - .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CONSTANT_ARM, - .pNext = &descriptors[mrt_i], - .id = i, - .pConstantData = constant_data.begin(), - }); + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CONSTANT_TABLE"); + + // Constants table - mapping of shader bindings to MRT's and their + // descriptors + auto constant_indexes = + sequence_decoder->getSegmentConstantIndexes(segment_id); + for (uint32_t i : constant_indexes) { + auto mrt_i = constant_decoder->getConstantMrtIndex(i); + auto constant_data = constant_decoder->getConstant(i); + constants.push_back(VkDataGraphPipelineConstantARM{ + .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CONSTANT_ARM, + .pNext = &descriptors[mrt_i], + .id = i, + .pConstantData = constant_data.begin(), + }); + } } // Prepare our layout bindings from the segment's information vector layout_bindings; vector data_graph_resources; - auto set_count = - sequence_decoder->getSegmentDescriptorSetInfosSize(segment_id); - for (uint32_t d_idx = 0; d_idx < set_count; d_idx++) { - auto handle = - sequence_decoder->getDescriptorBindingSlotsHandle(segment_id, d_idx); - 
auto binding_count = sequence_decoder->getBindingsSize(handle); - for (int binding = 0; binding < binding_count; binding++) { - auto binding_index = - sequence_decoder->getBindingSlotBinding(handle, binding); - auto MRT_index = - sequence_decoder->getBindingSlotMrtIndex(handle, binding); - auto MRT_type = resource_decoder->getDescriptorType(MRT_index).value(); - - const VkDescriptorSetLayoutBinding layout_binding{ - .binding = binding_index, - .descriptorType = vgflib::ToVkDescriptorType(MRT_type), - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_ALL, - .pImmutableSamplers = nullptr, - }; - layout_bindings.push_back(layout_binding); - - const VkDataGraphPipelineResourceInfoARM resource{ - .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_RESOURCE_INFO_ARM, - // Note: we populate the resource_descriptors 1:1 with the MRT table, - // so can directly use that index into the resource_descriptors - .pNext = &descriptors[MRT_index], - .descriptorSet = d_idx, - .binding = binding_index, - .arrayElement = 0, - }; - data_graph_resources.push_back(resource); + uint32_t set_count = 0; + + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_DESCRIPTOR_METADATA"); + + set_count = sequence_decoder->getSegmentDescriptorSetInfosSize(segment_id); + for (uint32_t d_idx = 0; d_idx < set_count; d_idx++) { + auto handle = + sequence_decoder->getDescriptorBindingSlotsHandle(segment_id, d_idx); + auto binding_count = sequence_decoder->getBindingsSize(handle); + for (int binding = 0; binding < binding_count; binding++) { + auto binding_index = + sequence_decoder->getBindingSlotBinding(handle, binding); + auto MRT_index = + sequence_decoder->getBindingSlotMrtIndex(handle, binding); + auto MRT_type = resource_decoder->getDescriptorType(MRT_index).value(); + + const VkDescriptorSetLayoutBinding layout_binding{ + .binding = binding_index, + .descriptorType = vgflib::ToVkDescriptorType(MRT_type), + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_ALL, + .pImmutableSamplers = nullptr, + }; + 
layout_bindings.push_back(layout_binding); + + const VkDataGraphPipelineResourceInfoARM resource{ + .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_RESOURCE_INFO_ARM, + // Note: we populate the resource_descriptors 1:1 with the MRT + // table, so can directly use that index into the + // resource_descriptors + .pNext = &descriptors[MRT_index], + .descriptorSet = d_idx, + .binding = binding_index, + .arrayElement = 0, + }; + data_graph_resources.push_back(resource); + } } } - // create fixed layout for this module - const VkDescriptorSetLayoutCreateInfo layout_info = { - .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .bindingCount = static_cast(layout_bindings.size()), - .pBindings = layout_bindings.data(), - }; - result = - vkCreateDescriptorSetLayout(vk_device, &layout_info, nullptr, &vk_layout); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to create descriptor layout"); - return false; + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_DESCRIPTOR_SET_LAYOUT"); + + // create fixed layout for this module + const VkDescriptorSetLayoutCreateInfo layout_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .bindingCount = static_cast(layout_bindings.size()), + .pBindings = layout_bindings.data(), + }; + result = vkCreateDescriptorSetLayout( + vk_device, &layout_info, nullptr, &vk_layout); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create descriptor layout"); + return false; + } } - std::vector poolSizes; - poolSizes.reserve(layout_bindings.size()); - for (const auto& b : layout_bindings) { - bool found = false; - for (size_t idx = 0; idx < poolSizes.size(); ++idx) { - if (poolSizes[idx].type == b.descriptorType) { - poolSizes[idx].descriptorCount += b.descriptorCount; - found = true; - break; + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_DESCRIPTOR_POOL"); + + std::vector poolSizes; + poolSizes.reserve(layout_bindings.size()); + 
for (const auto& b : layout_bindings) { + bool found = false; + for (size_t idx = 0; idx < poolSizes.size(); ++idx) { + if (poolSizes[idx].type == b.descriptorType) { + poolSizes[idx].descriptorCount += b.descriptorCount; + found = true; + break; + } + } + if (!found) { + poolSizes.push_back({b.descriptorType, b.descriptorCount}); } } - if (!found) { - poolSizes.push_back({b.descriptorType, b.descriptorCount}); + + // Create descriptor pool and descriptors for pipeline + const VkDescriptorPoolCreateInfo descriptor_pool_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .maxSets = static_cast(set_count), + .poolSizeCount = static_cast(poolSizes.size()), + .pPoolSizes = poolSizes.data(), + }; + result = vkCreateDescriptorPool( + vk_device, &descriptor_pool_info, nullptr, &vk_descriptor_pool); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create descriptor pool"); + return false; } } - // Create descriptor pool and descriptors for pipeline - const VkDescriptorPoolCreateInfo descriptor_pool_info = { - .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .maxSets = static_cast(set_count), - .poolSizeCount = static_cast(poolSizes.size()), - .pPoolSizes = poolSizes.data(), - }; - result = vkCreateDescriptorPool( - vk_device, &descriptor_pool_info, nullptr, &vk_descriptor_pool); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to create descriptor pool"); - return false; + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_ALLOCATE_DESCRIPTOR_SETS"); + + const VkDescriptorSetAllocateInfo descriptor_set_info = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .pNext = nullptr, + .descriptorPool = vk_descriptor_pool, + .descriptorSetCount = static_cast(set_count), + .pSetLayouts = &vk_layout, + }; + + // Alloc descriptor sets + // currently, as we require modelSequenceTableSize to == 1 + // we can only get one descriptor set. 
+ descriptor_sets.resize(layout_bindings.size()); + result = vkAllocateDescriptorSets( + vk_device, &descriptor_set_info, descriptor_sets.data()); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to allocate descriptor sets"); + return false; + } } - const VkDescriptorSetAllocateInfo descriptor_set_info = { - .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, - .pNext = nullptr, - .descriptorPool = vk_descriptor_pool, - .descriptorSetCount = static_cast(set_count), - .pSetLayouts = &vk_layout, - }; + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_UPDATE_DESCRIPTOR_SETS"); - // Alloc descriptor sets - // currently, as we require modelSequenceTableSize to == 1 - // we can only get one descriptor set. - descriptor_sets.resize(layout_bindings.size()); - result = vkAllocateDescriptorSets( - vk_device, &descriptor_set_info, descriptor_sets.data()); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to allocate descriptor sets"); - return false; + // write descriptor updates for every input + auto input_slots = + sequence_decoder->getSegmentInputBindingSlotsHandle(segment_id); + auto input_size = sequence_decoder->getBindingsSize(input_slots); + for (uint32_t i = 0; i < input_size; i++) { + auto binding = sequence_decoder->getBindingSlotBinding(input_slots, i); + auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(input_slots, i); + + VkWriteDescriptorSetTensorARM write_desc = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM, + .pNext = nullptr, + .tensorViewCount = 1, + .pTensorViews = &get<1>(resources[i]), + }; + VkWriteDescriptorSet desc_set = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = &write_desc, + .dstSet = descriptor_sets[0], + .dstBinding = binding, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM, + .pImageInfo = nullptr, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }; + vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr); + } + + // write 
descriptor updates for every output + auto output_slots = + sequence_decoder->getSegmentOutputBindingSlotsHandle(segment_id); + auto output_size = sequence_decoder->getBindingsSize(output_slots); + for (uint32_t i = 0; i < output_size; i++) { + auto binding = sequence_decoder->getBindingSlotBinding(output_slots, i); + auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(output_slots, i); + + VkWriteDescriptorSetTensorARM write_desc = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM, + .pNext = nullptr, + .tensorViewCount = 1, + .pTensorViews = &get<1>(resources[i + input_size]), + }; + VkWriteDescriptorSet desc_set = { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .pNext = &write_desc, + .dstSet = descriptor_sets[0], + .dstBinding = binding, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM, + .pImageInfo = nullptr, + .pBufferInfo = nullptr, + .pTexelBufferView = nullptr, + }; + vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr); + } } - // write descriptor updates for every input - auto input_slots = - sequence_decoder->getSegmentInputBindingSlotsHandle(segment_id); - auto input_size = sequence_decoder->getBindingsSize(input_slots); - for (uint32_t i = 0; i < input_size; i++) { - auto binding = sequence_decoder->getBindingSlotBinding(input_slots, i); - auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(input_slots, i); + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_PIPELINE_LAYOUT"); - VkWriteDescriptorSetTensorARM write_desc = { - .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM, + // create our pipeline + VkPipelineLayoutCreateInfo pipeline_layout_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, .pNext = nullptr, - .tensorViewCount = 1, - .pTensorViews = &get<1>(resources[i]), + .flags = 0, + .setLayoutCount = 1, + .pSetLayouts = &vk_layout, + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr, }; - VkWriteDescriptorSet desc_set = { - 
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, - .pNext = &write_desc, - .dstSet = descriptor_sets[0], - .dstBinding = binding, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM, - .pImageInfo = nullptr, - .pBufferInfo = nullptr, - .pTexelBufferView = nullptr, - }; - vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr); + result = vkCreatePipelineLayout( + vk_device, &pipeline_layout_info, nullptr, &vk_pipeline_layout); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create pipeline layout"); + return false; + } } - // write descriptor updates for every output - auto output_slots = - sequence_decoder->getSegmentOutputBindingSlotsHandle(segment_id); - auto output_size = sequence_decoder->getBindingsSize(output_slots); - for (uint32_t i = 0; i < output_size; i++) { - auto binding = sequence_decoder->getBindingSlotBinding(output_slots, i); - auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(output_slots, i); + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_DATA_GRAPH_PIPELINE"); - VkWriteDescriptorSetTensorARM write_desc = { - .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM, + // Shader Module Create + VkDataGraphPipelineShaderModuleCreateInfoARM shader_info{ + .sType = + VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SHADER_MODULE_CREATE_INFO_ARM, .pNext = nullptr, - .tensorViewCount = 1, - .pTensorViews = &get<1>(resources[i + input_size]), - }; - VkWriteDescriptorSet desc_set = { - .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, - .pNext = &write_desc, - .dstSet = descriptor_sets[0], - .dstBinding = binding, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM, - .pImageInfo = nullptr, - .pBufferInfo = nullptr, - .pTexelBufferView = nullptr, + .module = get<0>(shader_modules[0]), + .pName = get<1>(shader_modules[0]).c_str(), + .pSpecializationInfo = nullptr, + .constantCount = static_cast(constants.size()), + .pConstants = constants.data(), }; 
- vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr); - } - // create our pipeline - VkPipelineLayoutCreateInfo pipeline_layout_info = { - .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .setLayoutCount = 1, - .pSetLayouts = &vk_layout, - .pushConstantRangeCount = 0, - .pPushConstantRanges = nullptr, - }; - result = vkCreatePipelineLayout( - vk_device, &pipeline_layout_info, nullptr, &vk_pipeline_layout); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to create pipeline layout"); - return false; - } - - // Shader Module Create - VkDataGraphPipelineShaderModuleCreateInfoARM shader_info{ - .sType = - VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SHADER_MODULE_CREATE_INFO_ARM, - .pNext = nullptr, - .module = get<0>(shader_modules[0]), - .pName = get<1>(shader_modules[0]).c_str(), - .pSpecializationInfo = nullptr, - .constantCount = static_cast(constants.size()), - .pConstants = constants.data(), - }; + // Prepare Graph Pipeline + VkDataGraphPipelineCreateInfoARM graph_pipeline_info{ + .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CREATE_INFO_ARM, + .pNext = &shader_info, + .flags = VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR, + .layout = vk_pipeline_layout, + .resourceInfoCount = static_cast(data_graph_resources.size()), + .pResourceInfos = data_graph_resources.data(), + }; - // Prepare Graph Pipeline - VkDataGraphPipelineCreateInfoARM graph_pipeline_info{ - .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CREATE_INFO_ARM, - .pNext = &shader_info, - .flags = VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR, - .layout = vk_pipeline_layout, - .resourceInfoCount = static_cast(data_graph_resources.size()), - .pResourceInfos = data_graph_resources.data(), - }; + result = vkCreateDataGraphPipelinesARM( + vk_device, // device + VK_NULL_HANDLE, // deferredOperation + VK_NULL_HANDLE, // VkPipelineCache + 1, // createInfoCount + &graph_pipeline_info, // pCreateInfos + nullptr, // pAllocator + &vk_pipeline // 
pPipelines (VkPipeline*) + ); - result = vkCreateDataGraphPipelinesARM( - vk_device, // device - VK_NULL_HANDLE, // deferredOperation - VK_NULL_HANDLE, // VkPipelineCache - 1, // createInfoCount - &graph_pipeline_info, // pCreateInfos - nullptr, // pAllocator - &vk_pipeline // pPipelines (VkPipeline*) - ); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to create DataGraphPipeline"); - return false; + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create DataGraphPipeline"); + return false; + } } // prepare the graph pipeline session - VkDataGraphPipelineSessionCreateInfoARM pipeline_session_info{ - .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_CREATE_INFO_ARM, - .pNext = nullptr, - .flags = 0, - .dataGraphPipeline = vk_pipeline, - }; - result = vkCreateDataGraphPipelineSessionARM( - vk_device, &pipeline_session_info, nullptr, &vk_session); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to create DataGraphPipelineSession"); - return false; - } + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_CREATE_PIPELINE_SESSION"); - // Allocate command buffer - VkCommandBufferAllocateInfo buffer_allocate_info{ - .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, - .pNext = nullptr, - .commandPool = vk_command_pool, - .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, - .commandBufferCount = 1}; - result = vkAllocateCommandBuffers( - vk_device, &buffer_allocate_info, &vk_execute_cmd); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to allocate command buffers"); - return false; + VkDataGraphPipelineSessionCreateInfoARM pipeline_session_info{ + .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_CREATE_INFO_ARM, + .pNext = nullptr, + .flags = 0, + .dataGraphPipeline = vk_pipeline, + }; + result = vkCreateDataGraphPipelineSessionARM( + vk_device, &pipeline_session_info, nullptr, &vk_session); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to create DataGraphPipelineSession"); + return false; + } } - // Allocate intermediates memory based 
on the pipeline requirements provided - // by the driver - VkDataGraphPipelineSessionBindPointRequirementsInfoARM - bind_point_requirements_info = { - .sType = - VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_REQUIREMENTS_INFO_ARM, - .pNext = nullptr, - .session = vk_session, - }; + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_ALLOCATE_COMMAND_BUFFER"); - uint32_t bind_point_count = 0; - result = vkGetDataGraphPipelineSessionBindPointRequirementsARM( - vk_device, &bind_point_requirements_info, &bind_point_count, nullptr); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to get session bind point count"); - return false; + // Allocate command buffer + VkCommandBufferAllocateInfo buffer_allocate_info{ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .pNext = nullptr, + .commandPool = vk_command_pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1}; + result = vkAllocateCommandBuffers( + vk_device, &buffer_allocate_info, &vk_execute_cmd); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to allocate command buffers"); + return false; + } } - vector - bind_point_requirements; - bind_point_requirements.resize(bind_point_count); - result = vkGetDataGraphPipelineSessionBindPointRequirementsARM( - vk_device, - &bind_point_requirements_info, - &bind_point_count, - bind_point_requirements.data()); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to get session bind point requirements"); - return false; - } + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_ALLOCATE_TRANSIENT_MEMORY"); - // Given the bind points, just make individual allocations and bind them - for (const auto& bind_point_requirement : bind_point_requirements) { - // These are the only allowed type and bindpoint with the current spec - if (bind_point_requirement.bindPointType != - VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TYPE_MEMORY_ARM) { - ET_LOG( - Error, - "Expected VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TYPE_MEMORY_ARM"); - return false; - } - 
if (bind_point_requirement.bindPoint != - VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TRANSIENT_ARM) { - ET_LOG( - Error, - "Expected VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TRANSIENT_ARM"); + // Allocate intermediates memory based on the pipeline requirements provided + // by the driver + VkDataGraphPipelineSessionBindPointRequirementsInfoARM + bind_point_requirements_info = { + .sType = + VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_REQUIREMENTS_INFO_ARM, + .pNext = nullptr, + .session = vk_session, + }; + + uint32_t bind_point_count = 0; + result = vkGetDataGraphPipelineSessionBindPointRequirementsARM( + vk_device, &bind_point_requirements_info, &bind_point_count, nullptr); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to get session bind point count"); return false; } - if (bind_point_requirement.numObjects != 1) { - ET_LOG(Error, "Expected only one object for the bindpoint"); + + vector + bind_point_requirements; + bind_point_requirements.resize(bind_point_count); + result = vkGetDataGraphPipelineSessionBindPointRequirementsARM( + vk_device, + &bind_point_requirements_info, + &bind_point_count, + bind_point_requirements.data()); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to get session bind point requirements"); return false; } - VkDataGraphPipelineSessionMemoryRequirementsInfoARM memory_requirements_info = { - .sType = - VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_MEMORY_REQUIREMENTS_INFO_ARM, - .pNext = nullptr, - .session = vk_session, - .bindPoint = bind_point_requirement.bindPoint, - .objectIndex = 0, // NOTE: tied to numObjects assert above - }; - VkMemoryRequirements2 memory_requirements = { - .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, - .pNext = nullptr, - }; - vkGetDataGraphPipelineSessionMemoryRequirementsARM( - vk_device, &memory_requirements_info, &memory_requirements); + // Given the bind points, just make individual allocations and bind them + for (const auto& bind_point_requirement : 
bind_point_requirements) { + // These are the only allowed type and bindpoint with the current spec + if (bind_point_requirement.bindPointType != + VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TYPE_MEMORY_ARM) { + ET_LOG( + Error, + "Expected VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TYPE_MEMORY_ARM"); + return false; + } + if (bind_point_requirement.bindPoint != + VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TRANSIENT_ARM) { + ET_LOG( + Error, + "Expected VK_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_TRANSIENT_ARM"); + return false; + } + if (bind_point_requirement.numObjects != 1) { + ET_LOG(Error, "Expected only one object for the bindpoint"); + return false; + } - VkMemoryPropertyFlags aims = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; - uint32_t memory_index = - get_memory_index(vk_physical, memory_requirements, aims); + VkDataGraphPipelineSessionMemoryRequirementsInfoARM + memory_requirements_info = { + .sType = + VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_MEMORY_REQUIREMENTS_INFO_ARM, + .pNext = nullptr, + .session = vk_session, + .bindPoint = bind_point_requirement.bindPoint, + .objectIndex = 0, // NOTE: tied to numObjects assert above + }; + VkMemoryRequirements2 memory_requirements = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, + .pNext = nullptr, + }; + vkGetDataGraphPipelineSessionMemoryRequirementsARM( + vk_device, &memory_requirements_info, &memory_requirements); - VkMemoryAllocateInfo memory_allocate_info = { - .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, - .pNext = nullptr, - .allocationSize = memory_requirements.memoryRequirements.size, - .memoryTypeIndex = memory_index, - }; + VkMemoryPropertyFlags aims = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + uint32_t memory_index = + get_memory_index(vk_physical, memory_requirements, aims); - VkDeviceMemory memory; - result = - 
vkAllocateMemory(vk_device, &memory_allocate_info, nullptr, &memory); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to allocate memory for intermediates"); - return false; + VkMemoryAllocateInfo memory_allocate_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .pNext = nullptr, + .allocationSize = memory_requirements.memoryRequirements.size, + .memoryTypeIndex = memory_index, + }; + + VkDeviceMemory memory; + result = + vkAllocateMemory(vk_device, &memory_allocate_info, nullptr, &memory); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to allocate memory for intermediates"); + return false; + } + // so we can free this object in destructor + intermediates.push_back(memory); + + VkBindDataGraphPipelineSessionMemoryInfoARM bind_info = { + .sType = + VK_STRUCTURE_TYPE_BIND_DATA_GRAPH_PIPELINE_SESSION_MEMORY_INFO_ARM, + .pNext = nullptr, + .session = vk_session, + .bindPoint = bind_point_requirement.bindPoint, + .objectIndex = 0, // NOTE: tied to numObjects assert above + .memory = memory, + .memoryOffset = 0, + }; + result = + vkBindDataGraphPipelineSessionMemoryARM(vk_device, 1, &bind_info); + if (result != VK_SUCCESS) { + ET_LOG(Error, "Failed to bind intermediates memory"); + return false; + } } - // so we can free this object in destructor - intermediates.push_back(memory); + } - VkBindDataGraphPipelineSessionMemoryInfoARM bind_info = { - .sType = - VK_STRUCTURE_TYPE_BIND_DATA_GRAPH_PIPELINE_SESSION_MEMORY_INFO_ARM, - .pNext = nullptr, - .session = vk_session, - .bindPoint = bind_point_requirement.bindPoint, - .objectIndex = 0, // NOTE: tied to numObjects assert above - .memory = memory, - .memoryOffset = 0, - }; - result = vkBindDataGraphPipelineSessionMemoryARM(vk_device, 1, &bind_info); - if (result != VK_SUCCESS) { - ET_LOG(Error, "Failed to bind intermediates memory"); + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_TIMESTAMP_QUERIES"); + + if (!init_timestamp_queries()) { + ET_LOG(Error, "Failed to initialize VGF timestamp 
queries"); return false; } } - // Populate command once with our dispatch information - VkCommandBufferBeginInfo beginInfo{ - VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO}; - vkBeginCommandBuffer(vk_execute_cmd, &beginInfo); + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_INIT_RECORD_COMMAND_BUFFER"); + + // Populate command once with our dispatch information + VkCommandBufferBeginInfo beginInfo{ + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO}; + vkBeginCommandBuffer(vk_execute_cmd, &beginInfo); + + // Sync what will be the data coming in from host + VkMemoryBarrier2 barrier = { + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_HOST_BIT, + .srcAccessMask = VK_ACCESS_2_HOST_WRITE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT, + }; + VkDependencyInfo dependency_info = { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .memoryBarrierCount = 1, + .pMemoryBarriers = &barrier, + }; + vkCmdPipelineBarrier2(vk_execute_cmd, &dependency_info); + + // bind pipeline + descriptor set + vkCmdBindPipeline( + vk_execute_cmd, VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, vk_pipeline); + + vkCmdBindDescriptorSets( + vk_execute_cmd, + VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, + vk_pipeline_layout, + 0, // first set + 1, + descriptor_sets.data(), // descriptor set count + pointer + 0, + nullptr // no dynamic offsets + ); + + // Dispatch the graph command + if (timestamp_queries_enabled && + vk_timestamp_query_pool != VK_NULL_HANDLE) { + vkCmdResetQueryPool(vk_execute_cmd, vk_timestamp_query_pool, 0, 2); + + if (vkCmdWriteTimestamp2) { + vkCmdWriteTimestamp2( + vk_execute_cmd, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + vk_timestamp_query_pool, + 0); + } else { + vkCmdWriteTimestamp( + vk_execute_cmd, + VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + vk_timestamp_query_pool, + 0); + } + } - // Sync what will be the data coming in from host - VkMemoryBarrier2 barrier = { - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, 
- .srcStageMask = VK_PIPELINE_STAGE_2_HOST_BIT, - .srcAccessMask = VK_ACCESS_2_HOST_WRITE_BIT, - .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT, - }; - VkDependencyInfo dependency_info = { - .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .memoryBarrierCount = 1, - .pMemoryBarriers = &barrier, - }; - vkCmdPipelineBarrier2(vk_execute_cmd, &dependency_info); - - // bind pipeline + descriptor set - vkCmdBindPipeline( - vk_execute_cmd, VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, vk_pipeline); - - vkCmdBindDescriptorSets( - vk_execute_cmd, - VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, - vk_pipeline_layout, - 0, // first set - 1, - descriptor_sets.data(), // descriptor set count + pointer - 0, - nullptr // no dynamic offsets - ); - - // Dispatch the graph command - vkCmdDispatchDataGraphARM(vk_execute_cmd, vk_session, nullptr); - - // Sync data back - VkMemoryBarrier2 barrier_2 = { - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, - .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT, - .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT, - .dstAccessMask = VK_ACCESS_2_HOST_READ_BIT, - }; - VkDependencyInfo dependency_info_2 = { - .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, - .memoryBarrierCount = 1, - .pMemoryBarriers = &barrier_2, - }; - vkCmdPipelineBarrier2(vk_execute_cmd, &dependency_info_2); + // Dispatch the graph command + vkCmdDispatchDataGraphARM(vk_execute_cmd, vk_session, nullptr); + + if (timestamp_queries_enabled && + vk_timestamp_query_pool != VK_NULL_HANDLE) { + if (vkCmdWriteTimestamp2) { + vkCmdWriteTimestamp2( + vk_execute_cmd, + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + vk_timestamp_query_pool, + 1); + } else { + vkCmdWriteTimestamp( + vk_execute_cmd, + VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + vk_timestamp_query_pool, + 1); + } + } - // end the command buffer - vkEndCommandBuffer(vk_execute_cmd); + // Sync data back + VkMemoryBarrier2 barrier_2 = { + .sType = 
VK_STRUCTURE_TYPE_MEMORY_BARRIER_2, + .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, + .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT, + .dstStageMask = VK_PIPELINE_STAGE_2_HOST_BIT, + .dstAccessMask = VK_ACCESS_2_HOST_READ_BIT, + }; + VkDependencyInfo dependency_info_2 = { + .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO, + .memoryBarrierCount = 1, + .pMemoryBarriers = &barrier_2, + }; + vkCmdPipelineBarrier2(vk_execute_cmd, &dependency_info_2); + + // end the command buffer + vkEndCommandBuffer(vk_execute_cmd); + } return true; } -bool VgfRepr::execute_vgf() { +bool VgfRepr::execute_vgf(executorch::runtime::EventTracer* event_tracer) { ET_LOG(Info, "Executing vgf"); - // Submit & wait for idle VkSubmitInfo submit{VK_STRUCTURE_TYPE_SUBMIT_INFO}; submit.commandBufferCount = 1; submit.pCommandBuffers = &vk_execute_cmd; - VkResult result = vkQueueSubmit(vk_queue, 1, &submit, VK_NULL_HANDLE); + + VkResult result; + + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_QUEUE_SUBMIT"); + + result = vkQueueSubmit(vk_queue, 1, &submit, VK_NULL_HANDLE); + } + if (result != VK_SUCCESS) { ET_LOG(Error, "VGF/VkCommandBuffer command submission failed"); return false; } - vkQueueWaitIdle(vk_queue); + + { + VGF_PROFILE_SCOPE(event_tracer, "VGF_QUEUE_WAIT_IDLE"); + + result = vkQueueWaitIdle(vk_queue); + } + + if (result != VK_SUCCESS) { + ET_LOG(Error, "VGF/VkQueue wait idle failed"); + return false; + } + + read_timestamp_queries(event_tracer); return true; } void VgfRepr::free_vgf() { + if (vk_timestamp_query_pool != VK_NULL_HANDLE) { + vkDestroyQueryPool(vk_device, vk_timestamp_query_pool, nullptr); + vk_timestamp_query_pool = VK_NULL_HANDLE; + } + vkFreeCommandBuffers(vk_device, vk_command_pool, 1, &vk_execute_cmd); vkDestroyDataGraphPipelineSessionARM(vk_device, vk_session, nullptr); vkDestroyPipeline(vk_device, vk_pipeline, nullptr); diff --git a/backends/arm/runtime/VGFSetup.h b/backends/arm/runtime/VGFSetup.h index 8e07b36e303..a8b1173ec16 100644 --- 
a/backends/arm/runtime/VGFSetup.h +++ b/backends/arm/runtime/VGFSetup.h @@ -11,6 +11,7 @@ using namespace std; #include +#include using executorch::runtime::ArrayRef; using executorch::runtime::CompileSpec; @@ -48,12 +49,14 @@ class VgfRepr { VkPhysicalDevice phys, VkDevice dev, VkQueue queue, - VkCommandPool pool) + VkCommandPool pool, + uint32_t queue_family_index = UINT32_MAX) : vk_instance(inst), vk_physical(phys), vk_device(dev), vk_queue(queue), - vk_command_pool(pool) {} + vk_command_pool(pool), + vk_queue_family_index(queue_family_index) {} /* * Process a VGF ready for execution, allocate necessary Vulkan objects. @@ -61,13 +64,13 @@ class VgfRepr { bool process_vgf( const char* vgf_data, size_t vgf_size, - ArrayRef specs); + ArrayRef specs, + executorch::runtime::EventTracer* event_tracer = nullptr); /* * Execute the VGF we've previously processed. */ - bool execute_vgf(); - + bool execute_vgf(executorch::runtime::EventTracer* event_tracer = nullptr); /* * Free any allocations made in process_vgf. 
*/ @@ -105,6 +108,12 @@ class VgfRepr { VkDevice vk_device; VkQueue vk_queue; VkCommandPool vk_command_pool; + uint32_t vk_queue_family_index = UINT32_MAX; + + bool timestamp_queries_enabled = false; + uint32_t timestamp_valid_bits = 0; + double timestamp_period_ns = 0.0; + VkQueryPool vk_timestamp_query_pool = VK_NULL_HANDLE; // per-VgfRepr-instance objects allocated in process_vgf, used (can be more // than once) in execute_vgf @@ -117,6 +126,9 @@ class VgfRepr { VkShaderModule vk_shader; // Note: the vector of tensor memory is stored in IOs above vector descriptor_sets; + + bool init_timestamp_queries(); + void read_timestamp_queries(executorch::runtime::EventTracer* event_tracer); }; } // namespace vgf diff --git a/backends/arm/scripts/etdump_to_chrome_trace.py b/backends/arm/scripts/etdump_to_chrome_trace.py new file mode 100755 index 00000000000..252f26cc71f --- /dev/null +++ b/backends/arm/scripts/etdump_to_chrome_trace.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# The script reads profiling events from an ETDump file using the ExecuTorch +# Inspector API, optionally enriches them with ETRecord metadata, and writes a +# JSON trace that can be loaded in chrome://tracing or Perfetto. Each ExecuTorch +# event block is represented as a Chrome trace thread, and each profiling sample +# is emitted as a complete-duration event with timestamps and durations in +# microseconds. 
#
# Example:
#   python backends/arm/scripts/etdump_to_chrome_trace.py \
#     --etdump_path ./etdumps/vgf_timestamps.etdp \
#     --output ./traces/vgf_timestamps_trace.json

import argparse
import json

# Multiplier applied to each event start time (expressed in the source time
# scale) to obtain the value written to the Chrome trace "ts" field. Chrome
# trace uses microseconds for "ts" and "dur"; "cycles" has no wall-clock
# equivalent, so cycle counts are passed through unchanged.
SOURCE_TO_TRACE_FACTOR = {
    "ns": 1.0 / 1000.0,
    "us": 1.0,
    "ms": 1000.0,
    "s": 1000_000.0,
    "cycles": 1.0,
}


def target_scale_name(source_scale_name):
    """Return the Inspector target time scale to pair with a source scale.

    Durations are requested in microseconds, except when the source scale is
    "cycles": the Inspector rejects mixing cycles with a wall-clock scale, so
    cycles must be paired with cycles. In that case the trace's "ts" and "dur"
    fields both hold cycle counts rather than microseconds.
    """
    return "cycles" if source_scale_name == "cycles" else "us"


def build_trace_events(event_blocks, source_to_us):
    """Convert Inspector event blocks into a list of Chrome trace events.

    Args:
        event_blocks: iterable of objects exposing `name` and `events`; each
            event exposes `name`, `perf_data` (with a `raw` sequence of
            durations), `start_time` (sequence of start times),
            `is_delegated_op`, `delegate_backend_name` and `op_types`.
        source_to_us: multiplier applied to every start time before it is
            written to "ts".

    Returns:
        A list of dicts: one "M" (metadata) record naming the thread for each
        event block, followed by one "X" (complete-duration) record per
        profiling sample of each event in that block.
    """
    trace_events = []

    for block_idx, event_block in enumerate(event_blocks):
        # Each event block is shown as its own thread in the trace viewer.
        trace_events.append(
            {
                "name": "thread_name",
                "ph": "M",
                "pid": 1,
                "tid": block_idx,
                "args": {"name": event_block.name},
            }
        )

        for event in event_block.events:
            # Events without timing information cannot be placed on a timeline.
            if event.perf_data is None or event.start_time is None:
                continue

            # Durations are used as delivered by the Inspector (already in the
            # target scale); start times are scaled here.
            # NOTE(review): this assumes start_time is still in the source
            # scale, as the original script did - confirm against Inspector.
            samples = zip(event.start_time, event.perf_data.raw)
            for iter_idx, (start_time, duration) in enumerate(samples):
                trace_events.append(
                    {
                        "name": event.name,
                        "cat": event_block.name,
                        "ph": "X",
                        "ts": float(start_time) * source_to_us,
                        "dur": float(duration),
                        "pid": 1,
                        "tid": block_idx,
                        "args": {
                            "event_block": event_block.name,
                            "iteration": iter_idx,
                            "is_delegated_op": event.is_delegated_op,
                            "delegate_backend_name": event.delegate_backend_name,
                            "op_types": event.op_types,
                        },
                    }
                )

    return trace_events


def main() -> None:
    """Parse the command line, read the ETDump and write the trace JSON."""
    # Imported here rather than at module level so that the pure helpers above
    # can be imported and tested without an ExecuTorch installation.
    from executorch.devtools import Inspector
    from executorch.devtools.inspector import TimeScale

    parser = argparse.ArgumentParser()
    parser.add_argument("--etdump_path", required=True)
    parser.add_argument("--etrecord_path", required=False, default=None)
    parser.add_argument("--output", required=True)
    parser.add_argument(
        "--source_time_scale",
        default="ns",
        choices=[ts.value for ts in TimeScale],
    )
    args = parser.parse_args()

    inspector = Inspector(
        etdump_path=args.etdump_path,
        etrecord=args.etrecord_path,
        source_time_scale=TimeScale(args.source_time_scale),
        # Previously always TimeScale.US, which made "--source_time_scale
        # cycles" fail inside the Inspector constructor.
        target_time_scale=TimeScale(target_scale_name(args.source_time_scale)),
    )

    trace_events = build_trace_events(
        inspector.event_blocks,
        SOURCE_TO_TRACE_FACTOR[args.source_time_scale],
    )

    with open(args.output, "w") as f:
        json.dump({"traceEvents": trace_events}, f)

    print(f"Wrote Chrome trace JSON: {args.output}")
    print(f"Events: {len(trace_events)}")


if __name__ == "__main__":
    main()