-
Notifications
You must be signed in to change notification settings - Fork 1k
Arm backend: Add event profiling to VGF backend #19703
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
wwwind
wants to merge
2
commits into
pytorch:main
Choose a base branch
from
wwwind:profiling
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+1,161
−516
Open
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,13 +6,20 @@ | |
| */ | ||
|
|
||
| #include <cinttypes> | ||
| #include <list> | ||
| #include <numeric> | ||
|
|
||
| using namespace std; | ||
|
|
||
| #include <c10/util/safe_numerics.h> | ||
| #include <executorch/runtime/backend/interface.h> | ||
| #include <executorch/runtime/core/error.h> | ||
| #include <executorch/runtime/core/evalue.h> | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| #include <executorch/runtime/core/event_tracer_hooks_delegate.h> | ||
| #endif | ||
|
|
||
| using executorch::aten::Tensor; | ||
| using executorch::runtime::ArrayRef; | ||
| using executorch::runtime::Backend; | ||
|
|
@@ -27,6 +34,13 @@ using executorch::runtime::MemoryAllocator; | |
| using executorch::runtime::Result; | ||
| using executorch::runtime::Span; | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| using executorch::runtime::event_tracer_end_profiling_delegate; | ||
| using executorch::runtime::event_tracer_start_profiling_delegate; | ||
| using executorch::runtime::EventTracer; | ||
| using executorch::runtime::EventTracerEntry; | ||
| #endif | ||
|
|
||
| // We use the platform and runtime environment provided by the Vulkan delegate | ||
| #include <executorch/backends/vulkan/runtime/vk_api/vk_api.h> | ||
|
|
||
|
|
@@ -69,7 +83,8 @@ VkResult vkml_allocate_basics( | |
| VkPhysicalDevice* physical_device, | ||
| VkDevice* device, | ||
| VkQueue* queue, | ||
| VkCommandPool* command_pool); | ||
| VkCommandPool* command_pool, | ||
| uint32_t* queue_family_index); | ||
|
|
||
| void vkml_free_basics( | ||
| VkInstance* instance, | ||
|
|
@@ -104,7 +119,8 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { | |
| &vk_physical_device, | ||
| &vk_device, | ||
| &vk_queue, | ||
| &vk_command_pool); | ||
| &vk_command_pool, | ||
| &vk_queue_family_index); | ||
| if (result != VK_SUCCESS) { | ||
| ET_LOG( | ||
| Error, "Failed to initialize the Vulkan device error 0x%08X", result); | ||
|
|
@@ -142,8 +158,31 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { | |
| ArrayRef<CompileSpec> compile_specs) const override { | ||
| ET_LOG(Info, "Entered VGF init"); | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| EventTracer* event_tracer = context.event_tracer(); | ||
|
|
||
| EventTracerEntry init_total_event = event_tracer_start_profiling_delegate( | ||
| event_tracer, | ||
| "VGF_INIT_TOTAL", | ||
| /*delegate_debug_id=*/-1); | ||
|
|
||
| EventTracerEntry ensure_initialized_event = | ||
| event_tracer_start_profiling_delegate( | ||
| event_tracer, | ||
| "VGF_INIT_ENSURE_INITIALIZED", | ||
| /*delegate_debug_id=*/-1); | ||
| #endif | ||
|
|
||
| const_cast<VGFBackend*>(this)->ensure_initialized(); | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, ensure_initialized_event); | ||
| #endif | ||
|
|
||
| if (!is_initialized_) { | ||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, init_total_event); | ||
| #endif | ||
| ET_LOG( | ||
| Error, | ||
| "VGF backend is unavailable because Vulkan initialization failed"); | ||
|
|
@@ -152,30 +191,89 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { | |
|
|
||
| const char* vgf_data = reinterpret_cast<const char*>(processed->data()); | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| EventTracerEntry allocate_repr_event = | ||
| event_tracer_start_profiling_delegate( | ||
| event_tracer, | ||
| "VGF_INIT_ALLOCATE_REPR", | ||
| /*delegate_debug_id=*/-1); | ||
| #endif | ||
|
|
||
| MemoryAllocator* allocator = context.get_runtime_allocator(); | ||
| VgfRepr* repr = allocator->allocateInstance<VgfRepr>(); | ||
| new (repr) VgfRepr( | ||
| vk_instance, vk_physical_device, vk_device, vk_queue, vk_command_pool); | ||
| vk_instance, | ||
| vk_physical_device, | ||
| vk_device, | ||
| vk_queue, | ||
| vk_command_pool, | ||
| vk_queue_family_index); | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, allocate_repr_event); | ||
|
|
||
| EventTracerEntry process_vgf_event = event_tracer_start_profiling_delegate( | ||
| event_tracer, | ||
| "VGF_INIT_PROCESS_VGF_BACKEND", | ||
| /*delegate_debug_id=*/-1); | ||
| #endif | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| auto valid_vgf = repr->process_vgf( | ||
| vgf_data, processed->size(), compile_specs, event_tracer); | ||
| #else | ||
| auto valid_vgf = | ||
| repr->process_vgf(vgf_data, processed->size(), compile_specs); | ||
| #endif | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, process_vgf_event); | ||
| #endif | ||
|
|
||
| if (!valid_vgf) { | ||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, init_total_event); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you see nested profiling events leading to some noise, i.e. the time spent recording the inner profiling event being measured in the outer one? I understand this is needed for chrometrace etc. Just want to make sure this recording overhead isn't too much. |
||
| #endif | ||
| ET_LOG(Error, "Failed to process VGF blob."); | ||
| return Error::Internal; | ||
| } | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, init_total_event); | ||
| #endif | ||
|
|
||
| return repr; | ||
| } | ||
|
|
||
| Error execute( | ||
| ET_UNUSED BackendExecutionContext& context, | ||
| BackendExecutionContext& context, | ||
| DelegateHandle* handle, | ||
| Span<EValue*> args) const override { | ||
| VgfRepr* repr = static_cast<VgfRepr*>(handle); | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| EventTracer* event_tracer = context.event_tracer(); | ||
|
|
||
| EventTracerEntry vgf_execute_event = event_tracer_start_profiling_delegate( | ||
| event_tracer, | ||
| "VGF_EXECUTE", | ||
| /*delegate_debug_id=*/-1); | ||
|
|
||
| EventTracerEntry copy_inputs_event = event_tracer_start_profiling_delegate( | ||
| event_tracer, | ||
| "VGF_COPY_INPUTS", | ||
| /*delegate_debug_id=*/-1); | ||
| #else | ||
| (void)context; | ||
| #endif | ||
|
|
||
| // Copy all inputs from EValue to VkDeviceMemory | ||
| for (int i = 0; i < repr->IOs.size(); i++) { | ||
| if (!args[i]->isTensor()) { | ||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event); | ||
| event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); | ||
| #endif | ||
| ET_LOG( | ||
| Error, | ||
| "Expected EValue %d to be tensor, got %d", | ||
|
|
@@ -206,22 +304,59 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { | |
|
|
||
| void* data; | ||
| if (!repr->map_io(io, &data)) { | ||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event); | ||
| event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); | ||
| #endif | ||
| ET_LOG(Error, "Failed to map Vulkan IO memory"); | ||
| return Error::Internal; | ||
| } | ||
| memcpy(data, tensor->mutable_data_ptr(), io_size); | ||
| repr->unmap_io(io); | ||
| } | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, copy_inputs_event); | ||
|
|
||
| EventTracerEntry dispatch_event = event_tracer_start_profiling_delegate( | ||
| event_tracer, | ||
| "VGF_DISPATCH_AND_WAIT", | ||
| /*delegate_debug_id=*/-1); | ||
| #endif | ||
|
|
||
| // Execute the workload | ||
| if (!repr->execute_vgf()) { | ||
| bool execute_ok = false; | ||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| execute_ok = repr->execute_vgf(event_tracer); | ||
| #else | ||
| execute_ok = repr->execute_vgf(); | ||
| #endif | ||
|
|
||
| if (!execute_ok) { | ||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, dispatch_event); | ||
| event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); | ||
| #endif | ||
| ET_LOG(Error, "Failed to execute the VGF representation"); | ||
| return Error::Internal; | ||
| } | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, dispatch_event); | ||
|
|
||
| EventTracerEntry copy_outputs_event = event_tracer_start_profiling_delegate( | ||
| event_tracer, | ||
| "VGF_COPY_OUTPUTS", | ||
| /*delegate_debug_id=*/-1); | ||
| #endif | ||
|
|
||
| // Copy all outputs from VKDeviceMemory to EValue | ||
| for (int i = 0; i < repr->IOs.size(); i++) { | ||
| if (!args[i]->isTensor()) { | ||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event); | ||
| event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); | ||
| #endif | ||
| ET_LOG( | ||
| Error, | ||
| "Expected EValue %d to be tensor, got %d", | ||
|
|
@@ -251,13 +386,22 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { | |
|
|
||
| void* data; | ||
| if (!repr->map_io(io, &data)) { | ||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event); | ||
| event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); | ||
| #endif | ||
| ET_LOG(Error, "Failed to map Vulkan IO memory"); | ||
| return Error::Internal; | ||
| } | ||
| memcpy(tensor->mutable_data_ptr(), data, io_size); | ||
| repr->unmap_io(io); | ||
| } | ||
|
|
||
| #ifdef ET_EVENT_TRACER_ENABLED | ||
| event_tracer_end_profiling_delegate(event_tracer, copy_outputs_event); | ||
| event_tracer_end_profiling_delegate(event_tracer, vgf_execute_event); | ||
| #endif | ||
|
|
||
| return Error::Ok; | ||
| } | ||
|
|
||
|
|
@@ -272,6 +416,7 @@ class VGFBackend final : public ::executorch::runtime::BackendInterface { | |
| VkDevice vk_device = VK_NULL_HANDLE; | ||
| VkQueue vk_queue = VK_NULL_HANDLE; | ||
| VkCommandPool vk_command_pool = VK_NULL_HANDLE; | ||
| uint32_t vk_queue_family_index = UINT32_MAX; | ||
| bool is_initialized_ = false; | ||
| }; | ||
|
|
||
|
|
@@ -286,7 +431,8 @@ VkResult vkml_allocate_basics( | |
| VkPhysicalDevice* physical_device, | ||
| VkDevice* device, | ||
| VkQueue* queue, | ||
| VkCommandPool* command_pool) { | ||
| VkCommandPool* command_pool, | ||
| uint32_t* queue_family_index) { | ||
| VkResult result; | ||
|
|
||
| if (VK_SUCCESS != volkInitialize()) { | ||
|
|
@@ -408,6 +554,9 @@ VkResult vkml_allocate_basics( | |
| ET_LOG(Error, "Failed to find suitable queue"); | ||
| return VK_ERROR_UNKNOWN; | ||
| } | ||
| if (queue_family_index != nullptr) { | ||
| *queue_family_index = qf; | ||
| } | ||
|
|
||
| // Device with ML tensor extension | ||
| float qp = 1.0f; | ||
|
|
@@ -544,4 +693,4 @@ VkResult vkml_allocate_basics( | |
|
|
||
| } // namespace vgf | ||
| } // namespace backends | ||
| } // namespace executorch | ||
| } // namespace executorch | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you planning to expand this to hardware performance counters like occupancy, going beyond time measurements, and chrometrace?