From 105a02458dead055e47a717b1180429ce3b7447f Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 11 May 2026 21:35:07 -0700 Subject: [PATCH] [ET Device Support] DeviceAllocator interface and DeviceAllocatorRegistry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This diff introduces the `DeviceAllocator` abstract interface and `DeviceAllocatorRegistry` for device-specific memory allocation. This is a foundational abstraction that enables the runtime to dispatch memory operations to the appropriate device backend other than CPU (CUDA, etc.). **DeviceAllocator interface provides:** - `allocate()` / `deallocate()` - Dynamic device memory allocation - `copy_host_to_device()` / `copy_device_to_host()` - Data transfer between host and device - `device_type()` - Returns the device type this allocator handles **DeviceAllocatorRegistry provides:** - Singleton registry mapping DeviceType → DeviceAllocator - `register_allocator()` / `get_allocator()` methods - Fixed-size array indexed by device type (no dynamic allocation, embedded-friendly) **Design notes:** - Registry stores raw pointers (non-owning) - allocators are expected to be singletons with static lifetime - Follows ExecuTorch's embedded-first philosophy (no std::unique_ptr, no heap allocation in registry) - Convenience free functions `register_device_allocator()` and `get_device_allocator()` for ease of use Differential Revision: [D93635656](https://our.internmc.facebook.com/intern/diff/D93635656/) [ghstack-poisoned] --- runtime/core/device_allocator.cpp | 56 +++++ runtime/core/device_allocator.h | 198 ++++++++++++++++ runtime/core/portable_type/targets.bzl | 1 + runtime/core/targets.bzl | 14 ++ runtime/core/test/device_allocator_test.cpp | 241 ++++++++++++++++++++ runtime/core/test/targets.bzl | 12 +- 6 files changed, 521 insertions(+), 1 deletion(-) create mode 100644 runtime/core/device_allocator.cpp create mode 100644 runtime/core/device_allocator.h create mode 100644 
runtime/core/test/device_allocator_test.cpp diff --git a/runtime/core/device_allocator.cpp b/runtime/core/device_allocator.cpp new file mode 100644 index 00000000000..91b32571aaf --- /dev/null +++ b/runtime/core/device_allocator.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include <executorch/runtime/core/device_allocator.h> + +#include <executorch/runtime/platform/assert.h> + +namespace executorch { +namespace runtime { + +DeviceAllocatorRegistry& DeviceAllocatorRegistry::instance() { + static DeviceAllocatorRegistry registry; // constructed on first call + return registry; +} + +void DeviceAllocatorRegistry::register_allocator(DeviceAllocator* alloc) { + ET_CHECK_MSG(alloc != nullptr, "Cannot register a null allocator"); + auto type = alloc->device_type(); + auto index = static_cast<size_t>(type); // unsigned: one bound check suffices + ET_CHECK_MSG( + index < etensor::kNumDeviceTypes, + "Invalid device type: %d", + static_cast<int>(type)); + ET_CHECK_MSG( + allocators_[index] == nullptr, + "Allocator already registered for device type: %d", + static_cast<int>(type)); + allocators_[index] = alloc; // non-owning: alloc must outlive the registry +} + +DeviceAllocator* DeviceAllocatorRegistry::get_allocator( + etensor::DeviceType type) { + auto index = static_cast<size_t>(type); + if (index >= etensor::kNumDeviceTypes) { + return nullptr; + } + return allocators_[index]; +} + +// Convenience free functions + +void register_device_allocator(DeviceAllocator* alloc) { + DeviceAllocatorRegistry::instance().register_allocator(alloc); +} + +DeviceAllocator* get_device_allocator(etensor::DeviceType type) { + return DeviceAllocatorRegistry::instance().get_allocator(type); +} + +} // namespace runtime +} // namespace executorch diff --git a/runtime/core/device_allocator.h b/runtime/core/device_allocator.h new file mode 100644 index 00000000000..259bdb9f032 --- /dev/null +++ b/runtime/core/device_allocator.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace executorch { +namespace runtime { + +/** + * Abstract interface for device-specific memory allocation. + * + * Each device type (CUDA, etc.) provides a concrete implementation + * that handles memory allocation on that device. Implementations are + * expected to be singletons with static lifetime, registered via + * DeviceAllocatorRegistry. + */ +class DeviceAllocator { + public: + /** + * Default alignment of memory returned by allocate(). Reuses + * MemoryAllocator::kDefaultAlignment so host- and device-side allocations + * share the same baseline contract. Backends whose underlying device APIs + * already provide stronger guarantees (e.g. cudaMalloc returns 256-byte + * aligned pointers) will trivially satisfy this. + */ + static constexpr size_t kDefaultAlignment = + MemoryAllocator::kDefaultAlignment; + + virtual ~DeviceAllocator() = default; + /** + * Allocate device memory. + * + * @param nbytes Number of bytes to allocate. + * @param index The device index. + * @param alignment Minimum alignment of the returned pointer in bytes. + * Must be a power of 2. Defaults to kDefaultAlignment. + * @return A Result containing the device pointer on success, or an error. + */ + virtual Result allocate( + size_t nbytes, + etensor::DeviceIndex index, + size_t alignment = kDefaultAlignment) = 0; + + /** + * Deallocate device memory previously allocated via allocate(). + * + * @param ptr Pointer to the memory to deallocate. + * @param index The device index. + */ + virtual void deallocate(void* ptr, etensor::DeviceIndex index) = 0; + + /** + * Copy data from host memory to device memory. + * + * @param dst Destination pointer (device memory). + * @param src Source pointer (host memory). 
+ * @param nbytes Number of bytes to copy. + * @param index The device index. + * @return Error::Ok on success, or an appropriate error code on failure. + */ + virtual Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + etensor::DeviceIndex index) = 0; + + /** + * Copy data from device memory to host memory. + * + * @param dst Destination pointer (host memory). + * @param src Source pointer (device memory). + * @param nbytes Number of bytes to copy. + * @param index The device index. + * @return Error::Ok on success, or an appropriate error code on failure. + */ + virtual Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + etensor::DeviceIndex index) = 0; + + /** + * Returns the device type this allocator handles. + */ + virtual etensor::DeviceType device_type() const = 0; +}; + +/** + * Registry for device allocators. + * + * Provides a global mapping from DeviceType to DeviceAllocator instances. + * Device allocators register themselves at static initialization time, + * and the runtime queries the registry to find the appropriate allocator + * for a given device type. + * + * Threading contract: + * - Registration is expected to happen once per device type during static + * initialization (single-threaded). The registry itself does not perform + * any locking around register_allocator()/get_allocator(), and concurrent + * registration is not supported. + * - After registration, get_allocator() is safe to call concurrently from + * multiple threads because the underlying array is never mutated again. + * - The DeviceAllocator implementation is responsible for its own + * thread-safety. When multiple Programs are loaded concurrently and each + * needs device memory, the allocator must serialize access to any shared + * state internally (similar to how XNNPACK's weight cache guards its + * internal state). The registry does not provide any synchronization on + * behalf of the allocator. 
+ */ +class DeviceAllocatorRegistry { + public: + /** + * Returns the singleton instance of the registry. + */ + static DeviceAllocatorRegistry& instance(); + + /** + * Register an allocator under the device type reported by + * alloc->device_type(). Aborts if alloc is null, if that device type is + * out of range, or if an allocator is already registered for the same + * device type (each device type may only be registered once). + * + * Not thread-safe. Expected to be called during static initialization. + * + * @param alloc Non-null pointer to the allocator (must have static lifetime). + */ + void register_allocator(DeviceAllocator* alloc); + + /** + * Get the allocator for a specific device type. + * + * Safe to call concurrently with other get_allocator() calls. + * + * @param type The device type. + * @return The allocator, or nullptr if none is registered or type is invalid. + */ + DeviceAllocator* get_allocator(etensor::DeviceType type); + + private: + DeviceAllocatorRegistry() = default; + + // Singletons must not be copied or moved; instance() returns a reference, + // and silently shallow-copying the registry would lead to confusing bugs + // where modifications to the copy don't affect the real singleton. + DeviceAllocatorRegistry(const DeviceAllocatorRegistry&) = delete; + DeviceAllocatorRegistry& operator=(const DeviceAllocatorRegistry&) = delete; + DeviceAllocatorRegistry(DeviceAllocatorRegistry&&) = delete; + DeviceAllocatorRegistry& operator=(DeviceAllocatorRegistry&&) = delete; + + // Fixed-size array indexed by device type. This avoids dynamic allocation + // and suits embedded environments. A null slot means nothing is registered. + DeviceAllocator* allocators_[etensor::kNumDeviceTypes] = {}; +}; + +// Convenience free functions + +/** + * Register a device allocator. The device type is taken from + * alloc->device_type(). See DeviceAllocatorRegistry::register_allocator() + * for the threading contract and abort conditions. + * + * @param alloc Non-null pointer to the allocator (must have static lifetime). 
+ */ +void register_device_allocator(DeviceAllocator* alloc); + +/** + * Get the device allocator for a specific device type. + * + * @param type The device type. + * @return The allocator, or nullptr if none is registered or type is invalid. + */ +DeviceAllocator* get_device_allocator(etensor::DeviceType type); + +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::runtime::DeviceAllocator; +using ::executorch::runtime::DeviceAllocatorRegistry; +using ::executorch::runtime::get_device_allocator; +using ::executorch::runtime::register_device_allocator; +} // namespace executor +} // namespace torch diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 5b6e67fa213..78ffea3bdba 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -27,6 +27,7 @@ def define_common_targets(): "//executorch/backends/...", "//executorch/extension/fb/dynamic_shim/...", "//executorch/kernels/portable/cpu/...", + "//executorch/runtime/core:device_allocator", "//executorch/runtime/core/exec_aten/...", "//executorch/runtime/core/portable_type/test/...", ], diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl index 2c13cdbdae3..1a81c5af1f6 100644 --- a/runtime/core/targets.bzl +++ b/runtime/core/targets.bzl @@ -82,6 +82,20 @@ def define_common_targets(): visibility = ["PUBLIC"], ) + runtime.cxx_library( + name = "device_allocator", + srcs = ["device_allocator.cpp"], + exported_headers = [ + "device_allocator.h", + ], + exported_deps = [ + ":core", + ":memory_allocator", + "//executorch/runtime/core/exec_aten:lib", + ], + visibility = ["PUBLIC"], + ) + for aten_mode in get_aten_mode_options(): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_library( diff --git a/runtime/core/test/device_allocator_test.cpp 
b/runtime/core/test/device_allocator_test.cpp new file mode 100644 index 00000000000..f0bd7c6556e --- /dev/null +++ b/runtime/core/test/device_allocator_test.cpp @@ -0,0 +1,241 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +using namespace ::testing; +using executorch::runtime::DeviceAllocator; +using executorch::runtime::DeviceAllocatorRegistry; +using executorch::runtime::Error; +using executorch::runtime::get_device_allocator; +using executorch::runtime::register_device_allocator; +using executorch::runtime::Result; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; +using executorch::runtime::etensor::kNumDeviceTypes; + +/** + * A mock DeviceAllocator implementation for testing purposes. + * Tracks calls to verify the registry dispatches correctly. 
+ */ +class MockDeviceAllocator : public DeviceAllocator { + public: + explicit MockDeviceAllocator(DeviceType type) : type_(type) {} + + Result allocate( + size_t nbytes, + DeviceIndex index, + size_t alignment = DeviceAllocator::kDefaultAlignment) override { + last_allocate_size_ = nbytes; + last_allocate_index_ = index; + last_allocate_alignment_ = alignment; + allocate_call_count_++; + return &dummy_buffer_; + } + + void deallocate(void* ptr, DeviceIndex index) override { + last_deallocate_ptr_ = ptr; + last_deallocate_index_ = index; + deallocate_call_count_++; + } + + Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + last_h2d_dst_ = dst; + last_h2d_src_ = src; + last_h2d_size_ = nbytes; + last_h2d_index_ = index; + copy_h2d_call_count_++; + return Error::Ok; + } + + Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + last_d2h_dst_ = dst; + last_d2h_src_ = src; + last_d2h_size_ = nbytes; + last_d2h_index_ = index; + copy_d2h_call_count_++; + return Error::Ok; + } + + DeviceType device_type() const override { + return type_; + } + + // Reset all tracking state so tests can run against a clean baseline. 
+ void reset_counters() { + last_allocate_size_ = 0; + last_allocate_index_ = -1; + last_allocate_alignment_ = 0; + allocate_call_count_ = 0; + + last_deallocate_ptr_ = nullptr; + last_deallocate_index_ = -1; + deallocate_call_count_ = 0; + + last_h2d_dst_ = nullptr; + last_h2d_src_ = nullptr; + last_h2d_size_ = 0; + last_h2d_index_ = -1; + copy_h2d_call_count_ = 0; + + last_d2h_dst_ = nullptr; + last_d2h_src_ = nullptr; + last_d2h_size_ = 0; + last_d2h_index_ = -1; + copy_d2h_call_count_ = 0; + } + + // Tracking variables for verification + size_t last_allocate_size_ = 0; + DeviceIndex last_allocate_index_ = -1; + size_t last_allocate_alignment_ = 0; + int allocate_call_count_ = 0; + + void* last_deallocate_ptr_ = nullptr; + DeviceIndex last_deallocate_index_ = -1; + int deallocate_call_count_ = 0; + + void* last_h2d_dst_ = nullptr; + const void* last_h2d_src_ = nullptr; + size_t last_h2d_size_ = 0; + DeviceIndex last_h2d_index_ = -1; + int copy_h2d_call_count_ = 0; + + void* last_d2h_dst_ = nullptr; + const void* last_d2h_src_ = nullptr; + size_t last_d2h_size_ = 0; + DeviceIndex last_d2h_index_ = -1; + int copy_d2h_call_count_ = 0; + + private: + DeviceType type_; + uint8_t dummy_buffer_[64] = {}; +}; + +/** + * Test fixture that owns a single MockDeviceAllocator with static lifetime + * and registers it in DeviceAllocatorRegistry exactly once for the whole + * test suite. Every test in this fixture exercises the same registered + * allocator instance via get_device_allocator(), which mirrors how real + * code is expected to use the registry (one allocator per device type, + * registered during static initialization). Per-test isolation is provided + * by reset_counters() in SetUp(). 
+ */ +class DeviceAllocatorTest : public ::testing::Test { + protected: + static MockDeviceAllocator& cuda_allocator() { + static MockDeviceAllocator allocator(DeviceType::CUDA); + return allocator; + } + + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + register_device_allocator(&cuda_allocator()); + } + + void SetUp() override { + cuda_allocator().reset_counters(); + } +}; + +TEST_F(DeviceAllocatorTest, RegisteredAllocatorReportsCorrectDeviceType) { + DeviceAllocator* alloc = get_device_allocator(DeviceType::CUDA); + ASSERT_NE(alloc, nullptr); + EXPECT_EQ(alloc, &cuda_allocator()); + EXPECT_EQ(alloc->device_type(), DeviceType::CUDA); +} + +TEST_F(DeviceAllocatorTest, AllocateAndDeallocate) { + DeviceAllocator* alloc = get_device_allocator(DeviceType::CUDA); + ASSERT_NE(alloc, nullptr); + + Result<void*> result = alloc->allocate(/*nbytes=*/512, /*index=*/0); + ASSERT_TRUE(result.ok()); // get() below is only meaningful on success + void* ptr = result.get(); + EXPECT_NE(ptr, nullptr); + EXPECT_EQ(cuda_allocator().allocate_call_count_, 1); + EXPECT_EQ(cuda_allocator().last_allocate_size_, 512); + EXPECT_EQ(cuda_allocator().last_allocate_index_, 0); + + alloc->deallocate(ptr, /*index=*/0); + EXPECT_EQ(cuda_allocator().deallocate_call_count_, 1); + EXPECT_EQ(cuda_allocator().last_deallocate_ptr_, ptr); + EXPECT_EQ(cuda_allocator().last_deallocate_index_, 0); +} + +TEST_F(DeviceAllocatorTest, CopyHostToDevice) { + DeviceAllocator* alloc = get_device_allocator(DeviceType::CUDA); + ASSERT_NE(alloc, nullptr); + + uint8_t host_data[64] = {1, 2, 3, 4}; + uint8_t device_data[64] = {}; + + Error err = alloc->copy_host_to_device( + device_data, host_data, sizeof(host_data), /*index=*/0); + + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(cuda_allocator().copy_h2d_call_count_, 1); + EXPECT_EQ(cuda_allocator().last_h2d_dst_, device_data); + EXPECT_EQ(cuda_allocator().last_h2d_src_, host_data); + EXPECT_EQ(cuda_allocator().last_h2d_size_, sizeof(host_data)); + EXPECT_EQ(cuda_allocator().last_h2d_index_, 0); +} + 
+TEST_F(DeviceAllocatorTest, CopyDeviceToHost) { + DeviceAllocator* alloc = get_device_allocator(DeviceType::CUDA); + ASSERT_NE(alloc, nullptr); + + uint8_t device_data[64] = {5, 6, 7, 8}; + uint8_t host_data[64] = {}; + + Error err = alloc->copy_device_to_host( + host_data, device_data, sizeof(device_data), /*index=*/1); + + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(cuda_allocator().copy_d2h_call_count_, 1); + EXPECT_EQ(cuda_allocator().last_d2h_dst_, host_data); + EXPECT_EQ(cuda_allocator().last_d2h_src_, device_data); + EXPECT_EQ(cuda_allocator().last_d2h_size_, sizeof(device_data)); + EXPECT_EQ(cuda_allocator().last_d2h_index_, 1); +} + +TEST_F(DeviceAllocatorTest, RegistryGetUnregisteredReturnsNullptr) { + // Getting an allocator for an unregistered device type should return nullptr. + // The fixture only registers a CUDA allocator, so CPU must remain unset. + DeviceAllocator* alloc = get_device_allocator(DeviceType::CPU); + EXPECT_EQ(alloc, nullptr); +} + +TEST_F(DeviceAllocatorTest, RegistrySingletonInstance) { + // Verify that instance() returns the same object each time. + DeviceAllocatorRegistry& instance1 = DeviceAllocatorRegistry::instance(); + DeviceAllocatorRegistry& instance2 = DeviceAllocatorRegistry::instance(); + + EXPECT_EQ(&instance1, &instance2); +} + +TEST_F(DeviceAllocatorTest, RegisteringSameDeviceTypeTwiceAborts) { + // The fixture has already registered cuda_allocator() for CUDA; attempting + // to register a second allocator for the same device type must abort. 
+ MockDeviceAllocator another_allocator(DeviceType::CUDA); + EXPECT_DEATH( + register_device_allocator(&another_allocator), + "Allocator already registered"); +} diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl index 1ad0940c62e..1adb75f6e82 100644 --- a/runtime/core/test/targets.bzl +++ b/runtime/core/test/targets.bzl @@ -42,7 +42,7 @@ def define_common_targets(): "//executorch/runtime/core:core", ], ) - + runtime.cxx_test( name = "event_tracer_test", srcs = [ @@ -83,6 +83,16 @@ def define_common_targets(): ], ) + runtime.cxx_test( + name = "device_allocator_test", + srcs = [ + "device_allocator_test.cpp", + ], + deps = [ + "//executorch/runtime/core:device_allocator", + ], + ) + runtime.cxx_test( name = "hierarchical_allocator_test", srcs = [