From 9f5c14a7c7b2d92f4db45db48b39350007cff452 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 18 Feb 2026 11:25:39 -0800 Subject: [PATCH 01/12] [ET Device Support] Schema changes: device info on Tensor and buffer-level device array This diff adds device placement information to the ExecuTorch schema to support representing tensor-level device type information, which will be the basic requirement for the following tensor_parser updates. This is part of the Phase 1 implementation to make ET device type work E2E without user-specified device placement. Design doc: https://docs.google.com/document/d/1lwd9BlohmwkN5EEvRulO_b-XnZBwv1nMb5l2K3jfuwA/edit?tab=t.0#heading=h.o6anuvkix4bu Differential Revision: [D93635657](https://our.internmc.facebook.com/intern/diff/D93635657/) [ghstack-poisoned] --- exir/schema.py | 43 +++++++++++++++++++++++++++++++++++++ schema/program.fbs | 53 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/exir/schema.py b/exir/schema.py index 7dba623aebf..0d06a85acfa 100644 --- a/exir/schema.py +++ b/exir/schema.py @@ -48,6 +48,17 @@ class TensorDataLocation(IntEnum): EXTERNAL = 1 +class DeviceType(IntEnum): + """ + Device type enum indicating where a tensor resides or should be allocated. + Note that this enum is not directly mapped to the DeviceType enum in pytorch/pytorch + Check program.fbs for explanations of this enum. + """ + + CPU = 0 + CUDA = 1 + + @dataclass class ExtraTensorInfo: """ @@ -57,6 +68,12 @@ class ExtraTensorInfo: mutable_data_segments_idx: int = 0 fully_qualified_name: Optional[str] = None location: TensorDataLocation = TensorDataLocation.SEGMENT + # Device type where this tensor resides or should be allocated. + # Defaults to CPU for backward compatibility. + device_type: DeviceType = DeviceType.CPU + # Device index for multi-device scenarios (e.g., cuda:0, cuda:1). + # A value of -1 indicates the default device. + device_index: int = -1 @dataclass @@ -261,6 +278,26 @@ class Operator: overload: str +@dataclass +class NonConstBufferDevice: + """ + Device placement information for a non-constant memory buffer. + This is a sparse representation: only buffers that are NOT on CPU need entries. + Buffers not listed in ExecutionPlan.non_const_buffer_device default to CPU. + Check program.fbs for explanations. + """ + + # Index into ExecutionPlan.non_const_buffer_sizes identifying which buffer + # this entry applies to. + buffer_index: int + # The device type where this buffer should be allocated. + # Defaults to CPU for backward compatibility. + device_type: DeviceType = DeviceType.CPU + # The device index for multi-device scenarios (e.g., cuda:0, cuda:1). + # A value of -1 indicates the default device. + device_index: int = -1 + + @dataclass class ExecutionPlan: name: str @@ -276,6 +313,12 @@ class ExecutionPlan: # Runtime should use the len(constant_buffer) as the ground truch of # constant memory buffer size, and ignore non_const_buffer_sizes[0]. non_const_buffer_sizes: List[int] + # [Optional] Sparse device placement information for non-constant buffers. + # Only buffers that are NOT on CPU need to be listed here. Each entry + # specifies a buffer_index (into non_const_buffer_sizes) and its device. + # Buffers not listed here default to CPU, saving binary size when most + # buffers are on CPU. + non_const_buffer_device: Optional[List[NonConstBufferDevice]] = None @dataclass diff --git a/schema/program.fbs b/schema/program.fbs index 18e96de69b6..07901923410 100644 --- a/schema/program.fbs +++ b/schema/program.fbs @@ -61,6 +61,25 @@ enum TensorDataLocation : byte { EXTERNAL = 1, } +// Device type enum indicating where a tensor resides or should be allocated. +// Follows PyTorch DeviceType convention for compatibility. +enum DeviceType : byte { + CPU = 0, + CUDA = 1, + // Reserve slots for future device types following PyTorch convention: + // MKLDNN = 2, + // OPENGL = 3, + // OPENCL = 4, + // IDEEP = 5, + // HIP = 6, + // FPGA = 7, + // MAIA = 8, + // XLA = 9, + // MPS = 10, + // XPU = 11, + // PrivateUse1 = 12, +} + // Table to put additional information about tensors in that is not applicable // to the vast majority of tensors in the vast majority of programs. table ExtraTensorInfo { @@ -79,6 +98,15 @@ table ExtraTensorInfo { // must be non-empty, and is used as a key to find the tensor's external // data. Tensor.data_buffer_idx is ignored. location: TensorDataLocation; + + // [Optional] The device type where this tensor resides or should be allocated. + // Defaults to CPU for backward compatibility with existing PTE files. + device_type: DeviceType = CPU; + + // [Optional] The device index for multi-device scenarios (e.g., cuda:0, cuda:1). + // A value of -1 indicates the default device. Defaults to -1 for backward + // compatibility. + device_index: byte = -1; } table Tensor { @@ -386,6 +414,13 @@ table ExecutionPlan { // constants memory buffer size, and ignore non_const_buffer_sizes[0]. non_const_buffer_sizes: [int64]; + // [Optional] Sparse device placement information for non-constant buffers. + // Only buffers that are NOT on CPU need to be listed here. Each entry + // specifies a buffer_index (into non_const_buffer_sizes) and its device. + // Buffers not listed here default to CPU, saving binary size when most + // buffers are on CPU. + non_const_buffer_device: [NonConstBufferDevice]; + } // Constant tensor data stored directly in the flatbuffer. @@ -406,6 +441,24 @@ table BackendDelegateInlineData { data: [ubyte] (force_align: 16); // @executorch-delegate-alignment } +// Device placement information for a non-constant memory buffer. +// This is a sparse representation: only buffers that are NOT on CPU need entries. +// Buffers not listed in ExecutionPlan.non_const_buffer_device default to CPU. +table NonConstBufferDevice { + // Index into ExecutionPlan.non_const_buffer_sizes identifying which buffer + // this entry applies to. + buffer_index: uint32; + + // The device type where this buffer should be allocated. + // Defaults to CPU for backward compatibility with existing PTE files. + device_type: DeviceType = CPU; + + // The device index for multi-device scenarios (e.g., cuda:0, cuda:1). + // A value of -1 indicates the default device. Defaults to -1 for backward + // compatibility. + device_index: byte = -1; +} + // Describes a contiguous piece of data that lives outside of the flatbuffer data, // typically appended afterwards in the file. The "extended header" in the file, // when present, points to the segment base offset. From af472d76b6cbe487d81c4ecbd7bf5d00368d91b4 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 18 Feb 2026 11:25:45 -0800 Subject: [PATCH 02/12] [ET Device Support] TensorImpl carries device info This diff extends `TensorImpl` to carry device information, enabling the runtime tensor to track which device its data resides on (CPU, CUDA, etc.). This is a prerequisite for parsing device info from the schema and allocating device memory. Differential Revision: [D93635655](https://our.internmc.facebook.com/intern/diff/D93635655/) [ghstack-poisoned] --- runtime/core/portable_type/tensor_impl.cpp | 7 +- runtime/core/portable_type/tensor_impl.h | 25 +++- .../portable_type/test/tensor_impl_test.cpp | 112 ++++++++++++++++++ 3 files changed, 141 insertions(+), 3 deletions(-) diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp index ede5a3d4101..17243fca0fd 100644 --- a/runtime/core/portable_type/tensor_impl.cpp +++ b/runtime/core/portable_type/tensor_impl.cpp @@ -50,7 +50,9 @@ TensorImpl::TensorImpl( void* data, DimOrderType* dim_order, StridesType* strides, - TensorShapeDynamism dynamism) + TensorShapeDynamism dynamism, + DeviceType device_type, + DeviceIndex device_index) : sizes_(sizes), dim_order_(dim_order), strides_(strides), @@ -59,7 +61,8 @@ TensorImpl::TensorImpl( numel_(compute_numel(sizes, dim)), numel_bound_(numel_), type_(type), - shape_dynamism_(dynamism) { + shape_dynamism_(dynamism), + device_(device_type, device_index) { ET_CHECK_MSG( isValid(type_), "Invalid type %" PRId8, static_cast(type_)); ET_CHECK_MSG(dim_ >= 0, "Dimension must be non-negative, got %zd", dim_); diff --git a/runtime/core/portable_type/tensor_impl.h b/runtime/core/portable_type/tensor_impl.h index 1e2b3620ca2..767a53bffae 100644 --- a/runtime/core/portable_type/tensor_impl.h +++ b/runtime/core/portable_type/tensor_impl.h @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -99,6 +100,8 @@ class TensorImpl { * @param strides Strides of the tensor at each dimension. Must contain `dim` * entries. * @param dynamism The mutability of the shape of the tensor. + * @param device_type The type of device where tensor data resides. + * @param device_index The device index for multi-device scenarios. */ TensorImpl( ScalarType type, @@ -107,7 +110,9 @@ class TensorImpl { void* data = nullptr, DimOrderType* dim_order = nullptr, StridesType* strides = nullptr, - TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC); + TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC, + DeviceType device_type = DeviceType::CPU, + DeviceIndex device_index = -1); /** * Returns the size of the tensor in bytes. @@ -176,6 +181,21 @@ class TensorImpl { return shape_dynamism_; } + /// Returns the device where tensor data resides. + Device device() const { + return device_; + } + + /// Returns the type of device where tensor data resides. + DeviceType device_type() const { + return device_.type(); + } + + /// Returns the device index, or -1 if default/unspecified. + DeviceIndex device_index() const { + return device_.index(); + } + /// Returns a pointer of type T to the constant underlying data blob. template inline const T* data() const { @@ -261,6 +281,9 @@ class TensorImpl { /// Specifies the mutability of the shape of the tensor. const TensorShapeDynamism shape_dynamism_; + + /// Device where tensor data resides (CPU, CUDA, etc.) + Device device_; }; /** diff --git a/runtime/core/portable_type/test/tensor_impl_test.cpp b/runtime/core/portable_type/test/tensor_impl_test.cpp index 0b8ae05f4da..f51ac5374dd 100644 --- a/runtime/core/portable_type/test/tensor_impl_test.cpp +++ b/runtime/core/portable_type/test/tensor_impl_test.cpp @@ -21,6 +21,9 @@ using namespace ::testing; using executorch::runtime::ArrayRef; using executorch::runtime::Error; using executorch::runtime::TensorShapeDynamism; +using executorch::runtime::etensor::Device; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; using executorch::runtime::etensor::ScalarType; using executorch::runtime::etensor::TensorImpl; using SizesType = TensorImpl::SizesType; @@ -449,3 +452,112 @@ TEST_F(TensorImplTest, TestResizingTensorToZeroAndBack) { EXPECT_GT(t.numel(), 0); EXPECT_EQ(t.data(), data); } + +// ============== Device Tests ============== + +TEST_F(TensorImplTest, TestDefaultDeviceIsCPU) { + // TensorImpl constructed without device parameters should default to CPU + SizesType sizes[2] = {3, 2}; + float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + TensorImpl t(ScalarType::Float, 2, sizes, data); + + EXPECT_EQ(t.device_type(), DeviceType::CPU); + EXPECT_EQ(t.device_index(), -1); + EXPECT_EQ(t.device(), Device(DeviceType::CPU, -1)); +} + +TEST_F(TensorImplTest, TestExplicitCPUDevice) { + // TensorImpl constructed with explicit CPU device + SizesType sizes[2] = {3, 2}; + DimOrderType dim_order[2] = {0, 1}; + StridesType strides[2] = {2, 1}; + float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + TensorImpl t( + ScalarType::Float, + 2, + sizes, + data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + + EXPECT_EQ(t.device_type(), DeviceType::CPU); + EXPECT_EQ(t.device_index(), 0); + EXPECT_EQ(t.device(), Device(DeviceType::CPU, 0)); +} + +TEST_F(TensorImplTest, TestCUDADevice) { + // TensorImpl constructed with CUDA device + SizesType sizes[2] = {3, 2}; + DimOrderType dim_order[2] = {0, 1}; + StridesType strides[2] = {2, 1}; + float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + TensorImpl t( + ScalarType::Float, + 2, + sizes, + data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + + EXPECT_EQ(t.device_type(), DeviceType::CUDA); + EXPECT_EQ(t.device_index(), 0); + EXPECT_EQ(t.device(), Device(DeviceType::CUDA, 0)); +} + +TEST_F(TensorImplTest, TestCUDADeviceMultiGPU) { + // TensorImpl with CUDA device index 1 (second GPU) + SizesType sizes[2] = {3, 2}; + DimOrderType dim_order[2] = {0, 1}; + StridesType strides[2] = {2, 1}; + float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + TensorImpl t( + ScalarType::Float, + 2, + sizes, + data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 1); + + EXPECT_EQ(t.device_type(), DeviceType::CUDA); + EXPECT_EQ(t.device_index(), 1); + EXPECT_EQ(t.device(), Device(DeviceType::CUDA, 1)); +} + +TEST_F(TensorImplTest, TestDeviceWithDynamicTensor) { + // Device info should work correctly with dynamic tensors + SizesType sizes[2] = {3, 2}; + DimOrderType dim_order[2] = {0, 1}; + StridesType strides[2] = {2, 1}; + float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + TensorImpl t( + ScalarType::Float, + 2, + sizes, + data, + dim_order, + strides, + TensorShapeDynamism::DYNAMIC_BOUND, + DeviceType::CUDA, + 0); + + EXPECT_EQ(t.device_type(), DeviceType::CUDA); + EXPECT_EQ(t.device_index(), 0); + + // Resize should not affect device + SizesType new_sizes[2] = {2, 2}; + Error err = resize_tensor_impl(&t, {new_sizes, 2}); + EXPECT_EQ(err, Error::Ok); + + // Device should remain unchanged after resize + EXPECT_EQ(t.device_type(), DeviceType::CUDA); + EXPECT_EQ(t.device_index(), 0); +} From 2b669855d081d0f16446774c78553f9326e0bfa3 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 18 Feb 2026 11:25:51 -0800 Subject: [PATCH 03/12] [ET Device Support] DeviceAllocator interface and DeviceAllocatorRegistry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This diff introduces the `DeviceAllocator` abstract interface and `DeviceAllocatorRegistry` for device-specific memory allocation. This is a foundational abstraction that enables the runtime to dispatch memory operations to the appropriate device backend other than CPU (CUDA, etc.). **DeviceAllocator interface provides:** - `init_buffer()` - Initialize memory buffer pools for memory-planned tensors - `get_offset_address()` - Get pointer to offset within pre-allocated buffer - `allocate()` / `deallocate()` - Dynamic device memory allocation - `copy_host_to_device()` / `copy_device_to_host()` - Data transfer between host and device - `device_type()` - Returns the device type this allocator handles **DeviceAllocatorRegistry provides:** - Singleton registry mapping DeviceType → DeviceAllocator - `register_allocator()` / `get_allocator()` methods - Fixed-size array indexed by device type (no dynamic allocation, embedded-friendly) **Design notes:** - Registry stores raw pointers (non-owning) - allocators are expected to be singletons with static lifetime - Follows ExecuTorch's embedded-first philosophy (no std::unique_ptr, no heap allocation in registry) - Convenience free functions `register_device_allocator()` and `get_device_allocator()` for ease of use Differential Revision: [D93635656](https://our.internmc.facebook.com/intern/diff/D93635656/) [ghstack-poisoned] --- runtime/core/device_allocator.cpp | 58 +++++ runtime/core/device_allocator.h | 185 +++++++++++++ runtime/core/test/device_allocator_test.cpp | 271 ++++++++++++++++++++ 3 files changed, 514 insertions(+) create mode 100644 runtime/core/device_allocator.cpp create mode 100644 runtime/core/device_allocator.h create mode 100644 runtime/core/test/device_allocator_test.cpp diff --git a/runtime/core/device_allocator.cpp b/runtime/core/device_allocator.cpp new file mode 100644 index 00000000000..6046445d3be --- /dev/null +++ b/runtime/core/device_allocator.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +namespace executorch { +namespace runtime { + +DeviceAllocatorRegistry& DeviceAllocatorRegistry::instance() { + static DeviceAllocatorRegistry registry; + return registry; +} + +void DeviceAllocatorRegistry::register_allocator( + etensor::DeviceType type, + DeviceAllocator* alloc) { + auto index = static_cast(type); + ET_CHECK_MSG( + index < etensor::kNumDeviceTypes, + "Invalid device type: %d", + static_cast(type)); + ET_CHECK_MSG( + allocators_[index] == nullptr, + "Allocator already registered for device type: %d", + static_cast(type)); + allocators_[index] = alloc; +} + +DeviceAllocator* DeviceAllocatorRegistry::get_allocator( + etensor::DeviceType type) { + auto index = static_cast(type); + if (index >= etensor::kNumDeviceTypes) { + return nullptr; + } + return allocators_[index]; +} + +// Convenience free functions + +void register_device_allocator( + etensor::DeviceType type, + DeviceAllocator* alloc) { + DeviceAllocatorRegistry::instance().register_allocator(type, alloc); +} + +DeviceAllocator* get_device_allocator(etensor::DeviceType type) { + return DeviceAllocatorRegistry::instance().get_allocator(type); +} + +} // namespace runtime +} // namespace executorch diff --git a/runtime/core/device_allocator.h b/runtime/core/device_allocator.h new file mode 100644 index 00000000000..52f9710902d --- /dev/null +++ b/runtime/core/device_allocator.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace executorch { +namespace runtime { + +/** + * Abstract interface for device-specific memory allocation. + * + * Each device type (CUDA, etc.) provides a concrete implementation + * that handles memory allocation on that device. Implementations are + * expected to be singletons with static lifetime, registered via + * DeviceAllocatorRegistry. + + */ +class DeviceAllocator { + public: + virtual ~DeviceAllocator() = default; + + /** + * Initialize a memory buffer pool for memory-planned tensors. + * + * @param memory_id The ID of the memory buffer (index into + * ExecutionPlan.non_const_buffer_sizes). + * @param size The size in bytes to allocate for this buffer. + * @param index The device index (e.g., GPU 0 vs GPU 1). + * @return Error::Ok on success, or an appropriate error code on failure. + */ + virtual Error + init_buffer(uint32_t memory_id, size_t size, etensor::DeviceIndex index) = 0; + + /** + * Get a pointer to a specific offset within a pre-allocated buffer pool. + * + * @param memory_id The ID of the memory buffer. + * @param offset_bytes Offset in bytes from the start of the buffer. + * @param size_bytes Size of the requested region in bytes. + * @param index The device index. + * @return A Result containing the device pointer on success, or an error. + */ + virtual Result get_offset_address( + uint32_t memory_id, + size_t offset_bytes, + size_t size_bytes, + etensor::DeviceIndex index) = 0; + + /** + * Allocate device memory. + * + * @param nbytes Number of bytes to allocate. + * @param index The device index. + * @return A Result containing the device pointer on success, or an error. + */ + virtual Result allocate(size_t nbytes, etensor::DeviceIndex index) = 0; + + /** + * Deallocate device memory previously allocated via allocate(). + * + * @param ptr Pointer to the memory to deallocate. + * @param index The device index. + */ + virtual void deallocate(void* ptr, etensor::DeviceIndex index) = 0; + + /** + * Copy data from host memory to device memory. + * + * @param dst Destination pointer (device memory). + * @param src Source pointer (host memory). + * @param nbytes Number of bytes to copy. + * @param index The device index. + * @return Error::Ok on success, or an appropriate error code on failure. + */ + virtual Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + etensor::DeviceIndex index) = 0; + + /** + * Copy data from device memory to host memory. + * + * @param dst Destination pointer (host memory). + * @param src Source pointer (device memory). + * @param nbytes Number of bytes to copy. + * @param index The device index. + * @return Error::Ok on success, or an appropriate error code on failure. + */ + virtual Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + etensor::DeviceIndex index) = 0; + + /** + * Returns the device type this allocator handles. + */ + virtual etensor::DeviceType device_type() const = 0; +}; + +/** + * Registry for device allocators. + * + * Provides a global mapping from DeviceType to DeviceAllocator instances. + * Device allocators register themselves at static initialization time, + * and the runtime queries the registry to find the appropriate allocator + * for a given device type. + */ +class DeviceAllocatorRegistry { + public: + /** + * Returns the singleton instance of the registry. + */ + static DeviceAllocatorRegistry& instance(); + + /** + * Register an allocator for a specific device type. + * + * @param type The device type this allocator handles. + * @param alloc Pointer to the allocator (must have static lifetime). + */ + void register_allocator(etensor::DeviceType type, DeviceAllocator* alloc); + + /** + * Get the allocator for a specific device type. + * + * @param type The device type. + * @return Pointer to the allocator, or nullptr if not registered. + */ + DeviceAllocator* get_allocator(etensor::DeviceType type); + + private: + DeviceAllocatorRegistry() = default; + + // Fixed-size array indexed by device type. This avoids dynamic allocation + // and is suitable for embedded environments. + DeviceAllocator* allocators_[etensor::kNumDeviceTypes] = {}; +}; + +// Convenience free functions + +/** + * Register a device allocator for a specific device type. + * + * @param type The device type this allocator handles. + * @param alloc Pointer to the allocator (must have static lifetime). + */ +void register_device_allocator( + etensor::DeviceType type, + DeviceAllocator* alloc); + +/** + * Get the device allocator for a specific device type. + * + * @param type The device type. + * @return Pointer to the allocator, or nullptr if not registered. + */ +DeviceAllocator* get_device_allocator(etensor::DeviceType type); + +} // namespace runtime +} // namespace executorch + +namespace torch { +namespace executor { +// TODO(T197294990): Remove these deprecated aliases once all users have moved +// to the new `::executorch` namespaces. +using ::executorch::runtime::DeviceAllocator; +using ::executorch::runtime::DeviceAllocatorRegistry; +using ::executorch::runtime::get_device_allocator; +using ::executorch::runtime::register_device_allocator; +} // namespace executor +} // namespace torch diff --git a/runtime/core/test/device_allocator_test.cpp b/runtime/core/test/device_allocator_test.cpp new file mode 100644 index 00000000000..3bf0f5ad583 --- /dev/null +++ b/runtime/core/test/device_allocator_test.cpp @@ -0,0 +1,271 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +using namespace ::testing; +using executorch::runtime::DeviceAllocator; +using executorch::runtime::DeviceAllocatorRegistry; +using executorch::runtime::Error; +using executorch::runtime::get_device_allocator; +using executorch::runtime::register_device_allocator; +using executorch::runtime::Result; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; +using executorch::runtime::etensor::kNumDeviceTypes; + +/** + * A mock DeviceAllocator implementation for testing purposes. + * Tracks calls to verify the registry dispatches correctly. + */ +class MockDeviceAllocator : public DeviceAllocator { + public: + explicit MockDeviceAllocator(DeviceType type) : type_(type) {} + + Error init_buffer(uint32_t memory_id, size_t size, DeviceIndex index) + override { + last_init_buffer_memory_id_ = memory_id; + last_init_buffer_size_ = size; + last_init_buffer_index_ = index; + init_buffer_call_count_++; + return Error::Ok; + } + + Result get_offset_address( + uint32_t memory_id, + size_t offset_bytes, + size_t size_bytes, + DeviceIndex index) override { + last_get_offset_memory_id_ = memory_id; + last_get_offset_offset_ = offset_bytes; + last_get_offset_size_ = size_bytes; + last_get_offset_index_ = index; + get_offset_address_call_count_++; + return &dummy_buffer_; + } + + Result allocate(size_t nbytes, DeviceIndex index) override { + last_allocate_size_ = nbytes; + last_allocate_index_ = index; + allocate_call_count_++; + return &dummy_buffer_; + } + + void deallocate(void* ptr, DeviceIndex index) override { + last_deallocate_ptr_ = ptr; + last_deallocate_index_ = index; + deallocate_call_count_++; + } + + Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + last_h2d_dst_ = dst; + last_h2d_src_ = src; + last_h2d_size_ = nbytes; + last_h2d_index_ = index; + copy_h2d_call_count_++; + return Error::Ok; + } + + Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + last_d2h_dst_ = dst; + last_d2h_src_ = src; + last_d2h_size_ = nbytes; + last_d2h_index_ = index; + copy_d2h_call_count_++; + return Error::Ok; + } + + DeviceType device_type() const override { + return type_; + } + + // Tracking variables for verification + uint32_t last_init_buffer_memory_id_ = 0; + size_t last_init_buffer_size_ = 0; + DeviceIndex last_init_buffer_index_ = -1; + int init_buffer_call_count_ = 0; + + uint32_t last_get_offset_memory_id_ = 0; + size_t last_get_offset_offset_ = 0; + size_t last_get_offset_size_ = 0; + DeviceIndex last_get_offset_index_ = -1; + int get_offset_address_call_count_ = 0; + + size_t last_allocate_size_ = 0; + DeviceIndex last_allocate_index_ = -1; + int allocate_call_count_ = 0; + + void* last_deallocate_ptr_ = nullptr; + DeviceIndex last_deallocate_index_ = -1; + int deallocate_call_count_ = 0; + + void* last_h2d_dst_ = nullptr; + const void* last_h2d_src_ = nullptr; + size_t last_h2d_size_ = 0; + DeviceIndex last_h2d_index_ = -1; + int copy_h2d_call_count_ = 0; + + void* last_d2h_dst_ = nullptr; + const void* last_d2h_src_ = nullptr; + size_t last_d2h_size_ = 0; + DeviceIndex last_d2h_index_ = -1; + int copy_d2h_call_count_ = 0; + + private: + DeviceType type_; + uint8_t dummy_buffer_[64] = {}; +}; + +class DeviceAllocatorTest : public ::testing::Test { + protected: + void SetUp() override { + executorch::runtime::runtime_init(); + } +}; + +TEST_F(DeviceAllocatorTest, MockAllocatorDeviceType) { + MockDeviceAllocator cpu_allocator(DeviceType::CPU); + MockDeviceAllocator cuda_allocator(DeviceType::CUDA); + + EXPECT_EQ(cpu_allocator.device_type(), DeviceType::CPU); + EXPECT_EQ(cuda_allocator.device_type(), DeviceType::CUDA); +} + +TEST_F(DeviceAllocatorTest, MockAllocatorInitBuffer) { + MockDeviceAllocator allocator(DeviceType::CUDA); + + Error err = + allocator.init_buffer(/*memory_id=*/1, /*size=*/1024, /*index=*/0); + + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(allocator.init_buffer_call_count_, 1); + EXPECT_EQ(allocator.last_init_buffer_memory_id_, 1); + EXPECT_EQ(allocator.last_init_buffer_size_, 1024); + EXPECT_EQ(allocator.last_init_buffer_index_, 0); +} + +TEST_F(DeviceAllocatorTest, MockAllocatorGetOffsetAddress) { + MockDeviceAllocator allocator(DeviceType::CUDA); + + Result result = allocator.get_offset_address( + /*memory_id=*/2, /*offset_bytes=*/128, /*size_bytes=*/256, /*index=*/1); + + EXPECT_TRUE(result.ok()); + EXPECT_NE(result.get(), nullptr); + EXPECT_EQ(allocator.get_offset_address_call_count_, 1); + EXPECT_EQ(allocator.last_get_offset_memory_id_, 2); + EXPECT_EQ(allocator.last_get_offset_offset_, 128); + EXPECT_EQ(allocator.last_get_offset_size_, 256); + EXPECT_EQ(allocator.last_get_offset_index_, 1); +} + +TEST_F(DeviceAllocatorTest, MockAllocatorAllocateAndDeallocate) { + MockDeviceAllocator allocator(DeviceType::CUDA); + + Result result = allocator.allocate(/*nbytes=*/512, /*index=*/0); + EXPECT_TRUE(result.ok()); + void* ptr = result.get(); + EXPECT_NE(ptr, nullptr); + EXPECT_EQ(allocator.allocate_call_count_, 1); + EXPECT_EQ(allocator.last_allocate_size_, 512); + EXPECT_EQ(allocator.last_allocate_index_, 0); + + allocator.deallocate(ptr, /*index=*/0); + EXPECT_EQ(allocator.deallocate_call_count_, 1); + EXPECT_EQ(allocator.last_deallocate_ptr_, ptr); + EXPECT_EQ(allocator.last_deallocate_index_, 0); +} + +TEST_F(DeviceAllocatorTest, MockAllocatorCopyHostToDevice) { + MockDeviceAllocator allocator(DeviceType::CUDA); + uint8_t host_data[64] = {1, 2, 3, 4}; + uint8_t device_data[64] = {}; + + Error err = allocator.copy_host_to_device( + device_data, host_data, sizeof(host_data), /*index=*/0); + + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(allocator.copy_h2d_call_count_, 1); + EXPECT_EQ(allocator.last_h2d_dst_, device_data); + EXPECT_EQ(allocator.last_h2d_src_, host_data); + EXPECT_EQ(allocator.last_h2d_size_, sizeof(host_data)); + EXPECT_EQ(allocator.last_h2d_index_, 0); +} + +TEST_F(DeviceAllocatorTest, MockAllocatorCopyDeviceToHost) { + MockDeviceAllocator allocator(DeviceType::CUDA); + uint8_t device_data[64] = {5, 6, 7, 8}; + uint8_t host_data[64] = {}; + + Error err = allocator.copy_device_to_host( + host_data, device_data, sizeof(device_data), /*index=*/1); + + EXPECT_EQ(err, Error::Ok); + EXPECT_EQ(allocator.copy_d2h_call_count_, 1); + EXPECT_EQ(allocator.last_d2h_dst_, host_data); + EXPECT_EQ(allocator.last_d2h_src_, device_data); + EXPECT_EQ(allocator.last_d2h_size_, sizeof(device_data)); + EXPECT_EQ(allocator.last_d2h_index_, 1); +} + +TEST_F(DeviceAllocatorTest, RegistryGetUnregisteredReturnsNullptr) { + // Getting an allocator for an unregistered device type should return nullptr + // Note that there shouldn't be any regsitered allocators for CPU backend. + DeviceAllocator* alloc = get_device_allocator(DeviceType::CPU); + (void)alloc; +} + +TEST_F(DeviceAllocatorTest, RegistrySingletonInstance) { + // Verify that instance() returns the same object each time + DeviceAllocatorRegistry& instance1 = DeviceAllocatorRegistry::instance(); + DeviceAllocatorRegistry& instance2 = DeviceAllocatorRegistry::instance(); + + EXPECT_EQ(&instance1, &instance2); +} + +TEST_F(DeviceAllocatorTest, RegisterAndGetDeviceAllocator) { + // Register a mock allocator for CUDA and retrieve it via the free function. + MockDeviceAllocator cuda_allocator(DeviceType::CUDA); + register_device_allocator(DeviceType::CUDA, &cuda_allocator); + + DeviceAllocator* retrieved = get_device_allocator(DeviceType::CUDA); + EXPECT_EQ(retrieved, &cuda_allocator); + EXPECT_EQ(retrieved->device_type(), DeviceType::CUDA); + + // Registering the same device type twice should abort. + MockDeviceAllocator another_allocator(DeviceType::CUDA); + EXPECT_DEATH( + register_device_allocator(DeviceType::CUDA, &another_allocator), + "Allocator already registered"); +} + +TEST_F(DeviceAllocatorTest, RegisterAndDispatchThroughRegistry) { + // Verify that after registration, calls dispatch to the registered allocator. + DeviceAllocator* alloc = get_device_allocator(DeviceType::CUDA); + ASSERT_NE(alloc, nullptr); + + // Use the allocator through the registry and verify it reaches the mock. + Error err = alloc->init_buffer(/*memory_id=*/5, /*size=*/2048, /*index=*/0); + EXPECT_EQ(err, Error::Ok); + + Result result = alloc->allocate(/*nbytes=*/256, /*index=*/1); + EXPECT_TRUE(result.ok()); + EXPECT_NE(result.get(), nullptr); +} From d630230274e98f87ac6b089d964f2a2a04cb273a Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 18 Feb 2026 11:28:57 -0800 Subject: [PATCH 04/12] Update base for Update on "[ET Device Support] DeviceAllocator interface and DeviceAllocatorRegistry" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This diff introduces the `DeviceAllocator` abstract interface and `DeviceAllocatorRegistry` for device-specific memory allocation. This is a foundational abstraction that enables the runtime to dispatch memory operations to the appropriate device backend other than CPU (CUDA, etc.). **DeviceAllocator interface provides:** - `init_buffer()` - Initialize memory buffer pools for memory-planned tensors - `get_offset_address()` - Get pointer to offset within pre-allocated buffer - `allocate()` / `deallocate()` - Dynamic device memory allocation - `copy_host_to_device()` / `copy_device_to_host()` - Data transfer between host and device - `device_type()` - Returns the device type this allocator handles **DeviceAllocatorRegistry provides:** - Singleton registry mapping DeviceType → DeviceAllocator - `register_allocator()` / `get_allocator()` methods - Fixed-size array indexed by device type (no dynamic allocation, embedded-friendly) **Design notes:** - Registry stores raw pointers (non-owning) - allocators are expected to be singletons with static lifetime - Follows ExecuTorch's embedded-first philosophy (no std::unique_ptr, no heap allocation in registry) - Convenience free functions `register_device_allocator()` and `get_device_allocator()` for ease of use Differential Revision: [D93635656](https://our.internmc.facebook.com/intern/diff/D93635656/) [ghstack-poisoned] --- schema/program.fbs | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/schema/program.fbs b/schema/program.fbs index 07901923410..5b81afa905d 100644 --- a/schema/program.fbs +++ b/schema/program.fbs @@ -62,22 +62,9 @@ enum TensorDataLocation : byte { } // Device type enum indicating where a tensor resides or should be allocated. -// Follows PyTorch DeviceType convention for compatibility. enum DeviceType : byte { CPU = 0, CUDA = 1, - // Reserve slots for future device types following PyTorch convention: - // MKLDNN = 2, - // OPENGL = 3, - // OPENCL = 4, - // IDEEP = 5, - // HIP = 6, - // FPGA = 7, - // MAIA = 8, - // XLA = 9, - // MPS = 10, - // XPU = 11, - // PrivateUse1 = 12, } // Table to put additional information about tensors in that is not applicable From 228bdeb7c9511024c769f4fe08b44153c3af0201 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 19 Mar 2026 11:15:20 -0700 Subject: [PATCH 05/12] [ET Device Support] Add NonConstBufferDevice schema for per-buffer device mapping Adds the NonConstBufferDevice table to the FlatBuffer schema (program.fbs) and the corresponding Python dataclass to schema.py. This enables mapping each non-constant planned memory buffer to a specific device type (CPU, CUDA, etc.). The field is optional and absent for CPU-only programs, ensuring zero binary size regression. Differential Revision: [D97335597](https://our.internmc.facebook.com/intern/diff/D97335597/) [ghstack-poisoned] --- .../executorch_flatbuffer/ExecutionPlan.py | 62 ++++++++- .../NonConstBufferDevice.py | 130 ++++++++++++++++++ .../executorch_flatbuffer/__init__.py | 2 + exir/_serialize/test/test_program.py | 28 ++++ exir/schema.py | 15 ++ schema/program.fbs | 21 +++ 6 files changed, 256 insertions(+), 2 deletions(-) create mode 100644 exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py diff --git a/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py b/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py index b8ed496b8a8..340a0ad69aa 100644 --- a/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py +++ b/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py @@ -10,6 +10,7 @@ from executorch.exir._serialize.generated.executorch_flatbuffer.Chain import Chain from executorch.exir._serialize.generated.executorch_flatbuffer.ContainerMetadata import ContainerMetadata from executorch.exir._serialize.generated.executorch_flatbuffer.EValue import EValue +from executorch.exir._serialize.generated.executorch_flatbuffer.NonConstBufferDevice import NonConstBufferDevice from executorch.exir._serialize.generated.executorch_flatbuffer.Operator import Operator from typing import Optional np = import_numpy() @@ -230,8 +231,32 @@ def NonConstBufferSizesIsNone(self) -> bool: o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20)) return o == 0 + # ExecutionPlan + def NonConstBufferDevice(self, j: int) -> Optional[NonConstBufferDevice]: + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(22)) + if o != 0: + x = self._tab.Vector(o) + x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4 + x = self._tab.Indirect(x) + obj = NonConstBufferDevice() + obj.Init(self._tab.Bytes, x) + return obj + return None + + # ExecutionPlan + def NonConstBufferDeviceLength(self) -> int: + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(22)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # ExecutionPlan + def NonConstBufferDeviceIsNone(self) -> bool: + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(22)) + return o == 0 + def ExecutionPlanStart(builder: flatbuffers.Builder): - builder.StartObject(9) + builder.StartObject(10) def Start(builder: flatbuffers.Builder): ExecutionPlanStart(builder) @@ -332,6 +357,18 @@ def ExecutionPlanStartNonConstBufferSizesVector(builder, numElems: int) -> int: def StartNonConstBufferSizesVector(builder, numElems: int) -> int: return ExecutionPlanStartNonConstBufferSizesVector(builder, numElems) +def ExecutionPlanAddNonConstBufferDevice(builder: flatbuffers.Builder, nonConstBufferDevice: int): + builder.PrependUOffsetTRelativeSlot(9, flatbuffers.number_types.UOffsetTFlags.py_type(nonConstBufferDevice), 0) + +def AddNonConstBufferDevice(builder: flatbuffers.Builder, nonConstBufferDevice: int): + ExecutionPlanAddNonConstBufferDevice(builder, nonConstBufferDevice) + +def ExecutionPlanStartNonConstBufferDeviceVector(builder, numElems: int) -> int: + return builder.StartVector(4, numElems, 4) + +def StartNonConstBufferDeviceVector(builder, numElems: int) -> int: + return ExecutionPlanStartNonConstBufferDeviceVector(builder, numElems) + def ExecutionPlanEnd(builder: flatbuffers.Builder) -> int: return builder.EndObject() @@ -342,6 +379,7 @@ def End(builder: flatbuffers.Builder) -> int: from executorch.exir._serialize.generated.executorch_flatbuffer import Chain from executorch.exir._serialize.generated.executorch_flatbuffer import ContainerMetadata from executorch.exir._serialize.generated.executorch_flatbuffer import EValue +from executorch.exir._serialize.generated.executorch_flatbuffer import NonConstBufferDevice from executorch.exir._serialize.generated.executorch_flatbuffer import Operator try: from typing import List, Optional @@ -361,6 +399,7 @@ def __init__(self): self.operators = None # type: List[executorch_flatbuffer.Operator.OperatorT] self.delegates = None # type: List[executorch_flatbuffer.BackendDelegate.BackendDelegateT] self.nonConstBufferSizes = None # type: List[int] + self.nonConstBufferDevice = None # type: List[executorch_flatbuffer.NonConstBufferDevice.NonConstBufferDeviceT] @classmethod def InitFromBuf(cls, buf, pos): @@ -389,7 +428,8 @@ def __eq__(self, other): self.chains == other.chains and \ self.operators == other.operators and \ self.delegates == other.delegates and \ - self.nonConstBufferSizes == other.nonConstBufferSizes + self.nonConstBufferSizes == other.nonConstBufferSizes and \ + self.nonConstBufferDevice == other.nonConstBufferDevice # ExecutionPlanT def _UnPack(self, executionPlan): @@ -451,6 +491,14 @@ def _UnPack(self, executionPlan): self.nonConstBufferSizes.append(executionPlan.NonConstBufferSizes(i)) else: self.nonConstBufferSizes = executionPlan.NonConstBufferSizesAsNumpy() + if not executionPlan.NonConstBufferDeviceIsNone(): + self.nonConstBufferDevice = [] + for i in range(executionPlan.NonConstBufferDeviceLength()): + if executionPlan.NonConstBufferDevice(i) is None: + self.nonConstBufferDevice.append(None) + else: + nonConstBufferDevice_ = executorch_flatbuffer.NonConstBufferDevice.NonConstBufferDeviceT.InitFromObj(executionPlan.NonConstBufferDevice(i)) + self.nonConstBufferDevice.append(nonConstBufferDevice_) # ExecutionPlanT def Pack(self, builder): @@ -514,6 +562,14 @@ def Pack(self, builder): for i in reversed(range(len(self.nonConstBufferSizes))): builder.PrependInt64(self.nonConstBufferSizes[i]) nonConstBufferSizes = builder.EndVector() + if self.nonConstBufferDevice is not None: + nonConstBufferDevicelist = [] + for i in range(len(self.nonConstBufferDevice)): + nonConstBufferDevicelist.append(self.nonConstBufferDevice[i].Pack(builder)) + ExecutionPlanStartNonConstBufferDeviceVector(builder, len(self.nonConstBufferDevice)) + for i in reversed(range(len(self.nonConstBufferDevice))): + builder.PrependUOffsetTRelative(nonConstBufferDevicelist[i]) + nonConstBufferDevice = builder.EndVector() ExecutionPlanStart(builder) if self.name is not None: ExecutionPlanAddName(builder, name) @@ -533,5 +589,7 @@ def Pack(self, builder): ExecutionPlanAddDelegates(builder, delegates) if self.nonConstBufferSizes is not None: ExecutionPlanAddNonConstBufferSizes(builder, nonConstBufferSizes) + if self.nonConstBufferDevice is not None: + ExecutionPlanAddNonConstBufferDevice(builder, nonConstBufferDevice) executionPlan = ExecutionPlanEnd(builder) return executionPlan diff --git a/exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py b/exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py new file mode 100644 index 00000000000..d82df37d29b --- /dev/null +++ b/exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py @@ -0,0 +1,130 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: executorch_flatbuffer + +import flatbuffers +from flatbuffers.compat import import_numpy +from typing import Any +np = import_numpy() + +class NonConstBufferDevice(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAs(cls, buf, offset: int = 0): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = NonConstBufferDevice() + x.Init(buf, n + offset) + return x + + @classmethod + def GetRootAsNonConstBufferDevice(cls, buf, offset=0): + """This method is deprecated. Please switch to GetRootAs.""" + return cls.GetRootAs(buf, offset) + @classmethod + def NonConstBufferDeviceBufferHasIdentifier(cls, buf, offset, size_prefixed=False): + return flatbuffers.util.BufferHasIdentifier(buf, offset, b"\x45\x54\x31\x32", size_prefixed=size_prefixed) + + # NonConstBufferDevice + def Init(self, buf: bytes, pos: int): + self._tab = flatbuffers.table.Table(buf, pos) + + # NonConstBufferDevice + def BufferIdx(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # NonConstBufferDevice + def DeviceType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # NonConstBufferDevice + def DeviceIndex(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def NonConstBufferDeviceStart(builder: flatbuffers.Builder): + builder.StartObject(3) + +def Start(builder: flatbuffers.Builder): + NonConstBufferDeviceStart(builder) + +def NonConstBufferDeviceAddBufferIdx(builder: flatbuffers.Builder, bufferIdx: int): + builder.PrependInt32Slot(0, bufferIdx, 0) + +def AddBufferIdx(builder: flatbuffers.Builder, bufferIdx: int): + NonConstBufferDeviceAddBufferIdx(builder, bufferIdx) + +def NonConstBufferDeviceAddDeviceType(builder: flatbuffers.Builder, deviceType: int): + builder.PrependInt8Slot(1, deviceType, 0) + +def AddDeviceType(builder: flatbuffers.Builder, deviceType: int): + NonConstBufferDeviceAddDeviceType(builder, deviceType) + +def NonConstBufferDeviceAddDeviceIndex(builder: flatbuffers.Builder, deviceIndex: int): + builder.PrependInt8Slot(2, deviceIndex, 0) + +def AddDeviceIndex(builder: flatbuffers.Builder, deviceIndex: int): + NonConstBufferDeviceAddDeviceIndex(builder, deviceIndex) + +def NonConstBufferDeviceEnd(builder: flatbuffers.Builder) -> int: + return builder.EndObject() + +def End(builder: flatbuffers.Builder) -> int: + return NonConstBufferDeviceEnd(builder) + + +class NonConstBufferDeviceT(object): + + # NonConstBufferDeviceT + def __init__(self): + self.bufferIdx = 0 # type: int + self.deviceType = 0 # type: int + self.deviceIndex = 0 # type: int + + @classmethod + def InitFromBuf(cls, buf, pos): + nonConstBufferDevice = NonConstBufferDevice() + nonConstBufferDevice.Init(buf, pos) + return cls.InitFromObj(nonConstBufferDevice) + + @classmethod + def InitFromPackedBuf(cls, buf, pos=0): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, pos) + return cls.InitFromBuf(buf, pos+n) + + @classmethod + def InitFromObj(cls, nonConstBufferDevice): + x = NonConstBufferDeviceT() + x._UnPack(nonConstBufferDevice) + return x + + def __eq__(self, other): + return type(self) == type(other) and \ + self.bufferIdx == other.bufferIdx and \ + self.deviceType == other.deviceType and \ + self.deviceIndex == other.deviceIndex + + # NonConstBufferDeviceT + def _UnPack(self, nonConstBufferDevice): + if nonConstBufferDevice is None: + return + self.bufferIdx = nonConstBufferDevice.BufferIdx() + self.deviceType = nonConstBufferDevice.DeviceType() + self.deviceIndex = nonConstBufferDevice.DeviceIndex() + + # NonConstBufferDeviceT + def Pack(self, builder): + NonConstBufferDeviceStart(builder) + NonConstBufferDeviceAddBufferIdx(builder, self.bufferIdx) + NonConstBufferDeviceAddDeviceType(builder, self.deviceType) + NonConstBufferDeviceAddDeviceIndex(builder, self.deviceIndex) + nonConstBufferDevice = NonConstBufferDeviceEnd(builder) + return nonConstBufferDevice diff --git a/exir/_serialize/generated/executorch_flatbuffer/__init__.py b/exir/_serialize/generated/executorch_flatbuffer/__init__.py index df59751e724..7cc3b482376 100644 --- a/exir/_serialize/generated/executorch_flatbuffer/__init__.py +++ b/exir/_serialize/generated/executorch_flatbuffer/__init__.py @@ -31,6 +31,7 @@ from . import KernelTypes from . import MoveCall from . import NamedData +from . import NonConstBufferDevice from . import Null from . import Operator from . import OptionalTensorList @@ -75,6 +76,7 @@ "KernelTypes", "MoveCall", "NamedData", + "NonConstBufferDevice", "Null", "Operator", "OptionalTensorList", diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index 46e8f020a0b..1b6aab94af3 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -38,7 +38,9 @@ ContainerMetadata, DataLocation, DataSegment, + DeviceType, ExecutionPlan, + NonConstBufferDevice, Program, SubsegmentOffsets, ) @@ -477,6 +479,32 @@ def test_round_trip_large_buffer_sizes(self) -> None: program, deserialize_pte_binary(flatbuffer_from_py).program ) + def test_round_trip_with_non_const_buffer_device(self) -> None: + """Tests that non_const_buffer_device survives round-trip + serialization/deserialization. This verifies the schema extension + for per-buffer device mapping works correctly. + """ + program = get_test_program() + program.execution_plan[0].non_const_buffer_device = [ + NonConstBufferDevice(buffer_idx=0, device_type=DeviceType.CPU, device_index=0), + NonConstBufferDevice(buffer_idx=1, device_type=DeviceType.CUDA, device_index=0), + ] + flatbuffer_from_py = bytes(serialize_pte_binary(pte_file=PTEFile(program))) + self.assert_programs_equal( + program, deserialize_pte_binary(flatbuffer_from_py).program + ) + + def test_round_trip_without_non_const_buffer_device(self) -> None: + """Tests backward compatibility: a program without non_const_buffer_device + (the default) round-trips correctly and the field remains None. + """ + program = get_test_program() + self.assertIsNone(program.execution_plan[0].non_const_buffer_device) + flatbuffer_from_py = bytes(serialize_pte_binary(pte_file=PTEFile(program))) + deserialized = deserialize_pte_binary(flatbuffer_from_py).program + self.assert_programs_equal(program, deserialized) + self.assertIsNone(deserialized.execution_plan[0].non_const_buffer_device) + def test_round_trip_no_segments_and_no_header(self) -> None: """Tests that a Program serialized with extract_delegate_segments=True when there are no segments does not contain an extended header, diff --git a/exir/schema.py b/exir/schema.py index 993a473dabb..add90dec45c 100644 --- a/exir/schema.py +++ b/exir/schema.py @@ -268,6 +268,18 @@ class Operator: overload: str +@dataclass +class NonConstBufferDevice: + """Maps a non-constant buffer to the device where it should be allocated.""" + + # Index into the non_const_buffer_sizes list. + buffer_idx: int = 0 + # The device type for this buffer (CPU, CUDA, etc.). + device_type: DeviceType = DeviceType.CPU + # The device index for multi-device scenarios (e.g., cuda:0, cuda:1). + device_index: int = 0 + + @dataclass class ExecutionPlan: name: str @@ -283,6 +295,9 @@ class ExecutionPlan: # Runtime should use the len(constant_buffer) as the ground truch of # constant memory buffer size, and ignore non_const_buffer_sizes[0]. non_const_buffer_sizes: List[int] + # Per-buffer device mapping. Each entry maps a non-constant buffer to the + # device where it should be allocated. For CPU-only programs, this is empty. + non_const_buffer_device: Optional[List[NonConstBufferDevice]] = None @dataclass diff --git a/schema/program.fbs b/schema/program.fbs index f5872633ac8..c6e6edc790f 100644 --- a/schema/program.fbs +++ b/schema/program.fbs @@ -401,6 +401,27 @@ table ExecutionPlan { // constants memory buffer size, and ignore non_const_buffer_sizes[0]. non_const_buffer_sizes: [int64]; + // [Optional] Per-buffer device mapping, parallel to non_const_buffer_sizes. + // Each entry maps a non-constant buffer to the device where it should be + // allocated. For CPU-only programs, this field is absent and all buffers + // default to CPU, ensuring zero regression. + non_const_buffer_device: [NonConstBufferDevice]; + +} + +// Maps a non-constant buffer to the device where it should be allocated. +// When present as part of ExecutionPlan.non_const_buffer_device, each entry +// describes the device placement for the corresponding planned memory buffer. +// For CPU-only programs, this table is absent (all buffers default to CPU). +table NonConstBufferDevice { + // Index into the non_const_buffer_sizes list. + buffer_idx: int; + + // The device type for this buffer (CPU, CUDA, etc.). + device_type: DeviceType = CPU; + + // The device index for multi-device scenarios (e.g., cuda:0, cuda:1). + device_index: byte = 0; } // Constant tensor data stored directly in the flatbuffer. From 747dbaaa4c49ea659009445ad634e0dcedecc1ee Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Fri, 20 Mar 2026 11:21:56 -0700 Subject: [PATCH 06/12] [ET Device Support] Device-aware memory planning: separate buffers per device type Extends memory planning to separate device tensors from CPU tensors into distinct memory buffers. Non-CPU TensorSpecs (e.g., CUDA) are pre-assigned device-specific mem_ids before the greedy/naive algorithm runs, ensuring they get planned into independent memory buffers that never share space with CPU tensors. Differential Revision: [D97447105](https://our.internmc.facebook.com/intern/diff/D97447105/) [ghstack-poisoned] --- exir/capture/_config.py | 8 +- exir/memory_planning.py | 116 ++++++++++++++----- exir/passes/memory_planning_pass.py | 3 + exir/program/_program.py | 6 + exir/tests/test_memory_planning.py | 169 ++++++++++++++++++++++++++++ 5 files changed, 273 insertions(+), 29 deletions(-) diff --git a/exir/capture/_config.py b/exir/capture/_config.py index 3fbc8ae7ef3..f8c3be6e7c8 100644 --- a/exir/capture/_config.py +++ b/exir/capture/_config.py @@ -115,5 +115,11 @@ class ExecutorchBackendConfig: # If set to true, we run quant fusion and constant propagation passes do_quant_fusion_and_const_prop: bool = False - # Experimental: If set to true, we run a pass to reinplace ops in the graph. + # If set to true, we run a pass to reinplace ops in the graph. run_reinplace_pass: bool = False + + # When True, memory planning partitions specs by device and runs the + # algorithm independently per device, producing separate buffers for CPU + # vs. accelerator memory. Default False preserves the legacy behavior + # where all tensors are planned into CPU memory regardless of device. + enable_non_cpu_memory_planning: bool = False diff --git a/exir/memory_planning.py b/exir/memory_planning.py index c5d3441bcde..f6e3234fce5 100644 --- a/exir/memory_planning.py +++ b/exir/memory_planning.py @@ -28,6 +28,7 @@ import torch from executorch.exir import memory from executorch.exir.control_flow import while_loop as exir_while +from executorch.exir.schema import DeviceType, NonConstBufferDevice from executorch.exir.delegate import executorch_call_delegate from executorch.exir.error import internal_assert, InternalError from executorch.exir.operator.convert import is_inplace_variant, is_out_variant @@ -1211,10 +1212,19 @@ def apply_algo( alloc_graph_input: bool = True, alloc_graph_output: bool = True, alloc_mutable_buffers: bool = True, + enable_non_cpu_memory_planning: bool = False, ) -> list[int]: """ Recursively apply algo to graph_module and its submodules for control flow. + Partitions specs by device type and device idx, and runs the memory planning + algorithm independently per device, then merges results into separate buffers. + This ensures device memory and CPU memory are never mixed. + + When enable_non_cpu_memory_planning is False (default), all specs are planned + into a single CPU memory pool regardless of their device attribute. This + preserves the legacy behavior. Set to True to enable per-device partitioning. + Algo implementation should handle one of two meta entries for submodules: 1. input_mem_buffer_sizes: List of int offset bytes. Memory allocated by `algo` should start at the offset specified by this list; @@ -1229,18 +1239,19 @@ def apply_algo( `operand` arg. The memory for operands is unused. """ # Extract the nodes and their lifespans from the graph_module - # Difficult to just filter the list of specs returned by this due to - # how we flag trainable weights. _ = update_all_tensors_lifetime(graph_module, graph_signature) - # Filter specs based on alloc_graph_input and alloc_graph_output - specs = collect_specs_from_nodes( - graph_module.graph.nodes, - graph_signature, - do_assertion=False, - ignore_graph_input=not alloc_graph_input, - ignore_graph_output=not alloc_graph_output, - ignore_mutable_buffers=not alloc_mutable_buffers, + # Collect and materialize specs into a set so we can iterate multiple + # times and partition by device. + all_specs: set[TensorSpec] = set( + collect_specs_from_nodes( + graph_module.graph.nodes, + graph_signature, + do_assertion=False, + ignore_graph_input=not alloc_graph_input, + ignore_graph_output=not alloc_graph_output, + ignore_mutable_buffers=not alloc_mutable_buffers, + ) ) # Get temporary specs for submodules to set aside space during execution @@ -1249,29 +1260,78 @@ def apply_algo( algo, graph_module, alignment, graph_signature ) - # Update `input_mem_buffer_sizes` in graph_module. This will allow existing - # algos to work using `input_mem_buffer_sizes` or use - # `non_const_buffer_sizes` directly. - # pyre-ignore[16]: `torch.fx.GraphModule` has no attribute `input_mem_buffer_sizes`. - graph_module.input_mem_buffer_sizes = submodule_bufsizes - # Get extra padding for XNNPACK if needed extra_padding = 0 if _contains_xnnpack_delegate(graph_module): extra_padding = 64 - # Pass the filtered specs to the algorithm - bufsizes: list[int] = algo( - alignment, - specs, - graph_module, - graph_signature, - extra_padding, + # 1. Partition specs by device + specs_by_device: dict[DeviceType, set[TensorSpec]] = defaultdict(set) + if enable_non_cpu_memory_planning: + for spec in all_specs: + specs_by_device[spec.device].add(spec) + else: + # Legacy behavior: all specs planned into CPU memory regardless of device + specs_by_device[DeviceType.CPU] = all_specs + + # 2. Plan each device independently + global_bufsizes: list[int] = [0] # index 0 reserved for constants + buffer_device_types: list[DeviceType] = [DeviceType.CPU] + + # Process CPU first (if present), then other devices sorted by enum value + device_order = sorted( + specs_by_device.keys(), + key=lambda d: (d != DeviceType.CPU, d.value), ) - # pyre-ignore[6]: Incompatible parameter type [6] - # In call `insert_calls_to_free`, for 2nd positional argument, expected `Set[TensorSpec]` but got `Iterable[TensorSpec]` - insert_calls_to_free(graph_module, specs) + for device_type in device_order: + device_specs = specs_by_device[device_type] - graph_module.meta.update({"non_const_buffer_sizes": bufsizes}) - return bufsizes + # Only apply submodule pre-allocation for CPU specs; device buffers + # do not share memory space with CPU submodule arenas. + # pyre-ignore[16]: `torch.fx.GraphModule` has no attribute `input_mem_buffer_sizes`. + graph_module.input_mem_buffer_sizes = ( + submodule_bufsizes if device_type == DeviceType.CPU else [] + ) + + # Run algorithm independently on this device's specs + device_bufsizes = algo( + alignment, device_specs, graph_module, graph_signature, extra_padding + ) + + # Calculate base mem_id in global space + base_mem_id = len(global_bufsizes) + + # Append buffer sizes (skip index 0 which is constants placeholder) + global_bufsizes.extend(device_bufsizes[1:]) + + # Track device type for each new buffer slot + for _ in device_bufsizes[1:]: + buffer_device_types.append(device_type) + + # Remap spec mem_ids from algo-local to global. + # The algorithm assigns mem_id starting from 1; remap to global position. + for spec in device_specs: + if spec.mem_id is not None: + spec.mem_id = (spec.mem_id - 1) + base_mem_id + + # Ensure backward compatibility: at least [0, 0] when no specs exist + if len(global_bufsizes) < 2: + global_bufsizes.append(0) + buffer_device_types.append(DeviceType.CPU) + + # 3. Insert free calls and build device buffer mapping + insert_calls_to_free(graph_module, all_specs) + + has_device_buffers = any(dt != DeviceType.CPU for dt in buffer_device_types) + non_const_buffer_device: Optional[list[NonConstBufferDevice]] = None + if has_device_buffers: + non_const_buffer_device = [ + NonConstBufferDevice(buffer_idx=i, device_type=dt, device_index=0) + for i, dt in enumerate(buffer_device_types) + ] + + graph_module.meta["non_const_buffer_sizes"] = global_bufsizes + if non_const_buffer_device is not None: + graph_module.meta["non_const_buffer_device"] = non_const_buffer_device + return global_bufsizes diff --git a/exir/passes/memory_planning_pass.py b/exir/passes/memory_planning_pass.py index f3970f13b56..32c343a4607 100644 --- a/exir/passes/memory_planning_pass.py +++ b/exir/passes/memory_planning_pass.py @@ -153,6 +153,7 @@ def __init__( alloc_mutable_buffers: bool = True, share_mutable_buffers: bool = False, alignment: int = ALIGNMENT, + enable_non_cpu_memory_planning: bool = False, ) -> None: r""" alloc_graph_input/alloc_graph_output will have 4 different combinations @@ -173,6 +174,7 @@ def __init__( self.alloc_mutable_buffers = alloc_mutable_buffers self.share_mutable_buffers = share_mutable_buffers self.alignment = alignment + self.enable_non_cpu_memory_planning = enable_non_cpu_memory_planning self.state = _MemoryPlanningState() def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None: @@ -250,6 +252,7 @@ def run( # If mutable buffers are shared, then do not allocate them in the # main memory planning algo; they are allocated in run_multimethod. self.alloc_mutable_buffers and not self.share_mutable_buffers, + self.enable_non_cpu_memory_planning, ) if self.share_mutable_buffers and graph_signature is not None: diff --git a/exir/program/_program.py b/exir/program/_program.py index 9813b12d594..f1a22773b69 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1792,6 +1792,12 @@ def to_executorch( # noqa (FLAKE8) C901 else: memory_planning_pass = config.memory_planning_pass # TODO(jakeszwe): Follow up with compiler on if the deepcopy is necessary and if so how to make it work + # Propagate enable_non_cpu_memory_planning from the top-level config + # to the pass instance so that device-aware partitioning is applied. + if hasattr(memory_planning_pass, "enable_non_cpu_memory_planning"): + memory_planning_pass.enable_non_cpu_memory_planning = ( + config.enable_non_cpu_memory_planning + ) if hasattr(memory_planning_pass, "run"): new_gm_res = memory_planning_pass.run(new_gm, new_signature) else: diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py index f364541d900..27ecbdfe633 100644 --- a/exir/tests/test_memory_planning.py +++ b/exir/tests/test_memory_planning.py @@ -29,6 +29,8 @@ from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.memory_planning import ( _do_user_inputs_exist, + apply_algo, + collect_specs_from_nodes, filter_nodes, get_node_tensor_specs, greedy, @@ -45,6 +47,7 @@ ToOutVarPass, ) from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass +from executorch.exir.schema import DeviceType from executorch.exir.tensor import TensorSpec from functorch.experimental.control_flow import map as torch_map from parameterized import parameterized @@ -1259,3 +1262,169 @@ def reset(self, k_zeros: torch.Tensor, v_zeros: torch.Tensor) -> None: self.assertEqual(v_cache[0].val.allocation_info.memory_id, 2) self.assertEqual(v_cache[0].val.allocation_info.memory_offset_low, 256) self.assertEqual(v_cache[0].val.allocation_info.memory_offset_high, 0) + + +class TestDeviceAwareMemoryPlanning(unittest.TestCase): + """Tests for per-device memory planning (separate buffers per device type).""" + + def _prepare_model( + self, + ) -> Tuple[GraphModule, ExportGraphSignature]: + """Prepare ToyModelForMemPlanning through SpecPropPass + ToOutVarPass.""" + model = ToyModelForMemPlanning() + inputs = model.get_random_inputs() + edge = to_edge(export(model, inputs, strict=True)) + gm = edge.exported_program().graph_module + gs = edge.exported_program().graph_signature + gm = PassManager(passes=[SpecPropPass(), ToOutVarPass()])(gm).graph_module + return gm, gs + + def _get_planned_specs( + self, + gm: GraphModule, + gs: ExportGraphSignature, + ) -> list[TensorSpec]: + """Get the unique set of specs that apply_algo would plan.""" + return list( + collect_specs_from_nodes( + gm.graph.nodes, + gs, + do_assertion=False, + ignore_graph_input=False, + ignore_graph_output=False, + ignore_mutable_buffers=False, + ) + ) + + def test_cpu_only_unchanged(self) -> None: + """CPU-only specs produce bufsizes = [0, X] with no device metadata.""" + gm, gs = self._prepare_model() + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + bufsizes = apply_algo( + algo, gm, 16, gs, enable_non_cpu_memory_planning=True + ) + + # The CUDA spec is the only tensor in its buffer + self.assertEqual(bufsizes[0], 0) # constants + self.assertGreater(bufsizes[1], 0) # CPU activations + self.assertNotIn("non_const_buffer_device", gm.meta) + + def test_all_cuda_no_wasted_slots(self) -> None: + """CUDA-only specs produce [0, X] with CUDA at buffer index 1.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + for spec in specs: + spec.device = DeviceType.CUDA + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + bufsizes = apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True) + + # [0, cuda_size] — no wasted CPU buffer slot + self.assertEqual(len(bufsizes), 2) + self.assertEqual(bufsizes[0], 0) + self.assertGreater(bufsizes[1], 0) + # Device mapping should be present + self.assertIn("non_const_buffer_device", gm.meta) + device_map = gm.meta["non_const_buffer_device"] + self.assertEqual(len(device_map), 2) + self.assertEqual(device_map[0].device_type, DeviceType.CPU) # constants + self.assertEqual(device_map[1].device_type, DeviceType.CUDA) + + def test_mixed_cpu_cuda_separate_buffers(self) -> None: + """CPU specs at mem_id=1, CUDA specs at mem_id=2, separate sizes.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + + # Set second half of specs to CUDA + mid = len(specs) // 2 + self.assertGreater(mid, 0) + cpu_specs = specs[:mid] + cuda_specs = specs[mid:] + for spec in cuda_specs: + spec.device = DeviceType.CUDA + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + bufsizes = apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True) + + # [constants, cpu_activations, cuda_activations] + self.assertEqual(len(bufsizes), 3) + self.assertEqual(bufsizes[0], 0) + self.assertGreater(bufsizes[1], 0) + self.assertGreater(bufsizes[2], 0) + + # CPU specs should have mem_id=1, CUDA specs should have mem_id=2 + for spec in cpu_specs: + self.assertEqual(spec.mem_id, 1, f"CPU spec has wrong mem_id: {spec.mem_id}") + for spec in cuda_specs: + self.assertEqual(spec.mem_id, 2, f"CUDA spec has wrong mem_id: {spec.mem_id}") + + def test_mem_offset_correct_after_remap(self) -> None: + """After remapping, mem_offset is relative to its own buffer.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + + # Set the last spec to CUDA (sole CUDA tensor) + cuda_spec = specs[-1] + cuda_spec.device = DeviceType.CUDA + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + bufsizes = apply_algo( + algo, gm, 16, gs, enable_non_cpu_memory_planning=True + ) + + # The CUDA spec is the only tensor in its buffer, so offset should be 0 + self.assertEqual(cuda_spec.mem_offset, 0) + # The CUDA buffer should fit exactly this tensor + cuda_mem_id = cuda_spec.mem_id + self.assertIsNotNone(cuda_mem_id) + assert cuda_mem_id is not None + self.assertGreaterEqual(bufsizes[cuda_mem_id], cuda_spec.allocated_memory) + + def test_no_cross_device_memory_sharing(self) -> None: + """Specs on different devices never share buffers, regardless of lifetime.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + self.assertGreaterEqual(len(specs), 2) + + # Assign alternating specs to CUDA to ensure some pairs have + # non-overlapping lifetimes (which greedy would normally share). + for i, spec in enumerate(specs): + if i % 2 == 0: + spec.device = DeviceType.CUDA + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True) + + # Verify CPU and CUDA specs have disjoint mem_ids + cpu_mem_ids: set[int] = set() + cuda_mem_ids: set[int] = set() + for i, spec in enumerate(specs): + if spec.mem_id is not None: + if i % 2 == 0: + cuda_mem_ids.add(spec.mem_id) + else: + cpu_mem_ids.add(spec.mem_id) + + self.assertTrue( + cpu_mem_ids.isdisjoint(cuda_mem_ids), + f"CPU {cpu_mem_ids} and CUDA {cuda_mem_ids} should not share buffers", + ) + + def test_disabled_falls_back_to_cpu(self) -> None: + """With enable_non_cpu_memory_planning=False (default), CUDA specs are + planned into CPU memory — no device-specific buffers are created.""" + gm, gs = self._prepare_model() + specs = self._get_planned_specs(gm, gs) + for spec in specs: + spec.device = DeviceType.CUDA + + algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy]) + # Default: enable_non_cpu_memory_planning=False + bufsizes = apply_algo(algo, gm, 16, gs) + + # All specs planned into a single CPU pool — same as CPU-only + self.assertEqual(len(bufsizes), 2) + self.assertEqual(bufsizes[0], 0) + self.assertGreater(bufsizes[1], 0) + self.assertNotIn("non_const_buffer_device", gm.meta) From 0829c5d321b1156c0bc179eef87895cdd89eb375 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 24 Mar 2026 14:58:38 -0700 Subject: [PATCH 07/12] [ET Device Support] Emitter reads non_const_buffer_device from graph meta Enable serialzing non_const_buffer_device into into PTE file. Differential Revision: [D97850707](https://our.internmc.facebook.com/intern/diff/D97850707/) [ghstack-poisoned] --- exir/emit/_emitter.py | 5 + exir/emit/test/test_emit.py | 183 ++++++++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+) diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py index a48d88fa224..5c1f2db465d 100644 --- a/exir/emit/_emitter.py +++ b/exir/emit/_emitter.py @@ -2073,4 +2073,9 @@ def plan(self) -> ExecutionPlan: self.module.meta["non_const_buffer_sizes"], ), container_meta_type=self.container_meta_type, + # non_const_buffer_device is set by apply_algo in memory_planning.py + # when device tensors are present. None for CPU-only programs. + non_const_buffer_device=self.module.meta.get( + "non_const_buffer_device", None + ), ) diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py index 04284398656..7d08a94a8af 100644 --- a/exir/emit/test/test_emit.py +++ b/exir/emit/test/test_emit.py @@ -2643,3 +2643,186 @@ def forward(self, a, b): 0, "No tensor should have CUDA device when model runs entirely on CPU", ) + + def test_emit_non_const_buffer_device_populated_for_device_tensors(self) -> None: + """Verify that non_const_buffer_device is emitted into ExecutionPlan when + device-aware memory planning is enabled and non-CPU tensors are present.""" + from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( + generate_pattern_op_partitions, + ) + from executorch.exir.backend.compile_spec_schema import CompileSpec + from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, + ) + from executorch.exir.backend.test.backend_with_compiler_demo import ( + BackendWithCompilerDemo, + ) + from executorch.exir.passes.propagate_device_pass import ( + TARGET_DEVICE_COMPILE_SPEC_KEY, + ) + from torch.fx.passes.operator_support import any_chain, OperatorSupportBase + + class AddSupport(OperatorSupportBase): + def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: + return node.op == "call_function" and node.target in [ + exir_ops.edge.aten.add.Tensor, + ] + + class DevicePartitioner(Partitioner): + def __init__(self): + super().__init__() + self.delegation_spec = DelegationSpec( + BackendWithCompilerDemo.__name__, + [ + CompileSpec("max_value", bytes([4])), + CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"), + ], + ) + + def partition(self, exported_program) -> PartitionResult: + partition_tags = {} + partition_list = generate_pattern_op_partitions( + exported_program.graph_module, + op_support=any_chain(AddSupport()), + ) + for partition in partition_list: + for node in partition.nodes: + tag = f"tag{partition.id}" + node.meta["delegation_tag"] = tag + partition_tags[tag] = self.delegation_spec + return PartitionResult( + tagged_exported_program=exported_program, + partition_tags=partition_tags, + ) + + class Model(torch.nn.Module): + def forward(self, a, b): + return torch.add(a, b) + + model = Model() + inputs = (torch.randn(2, 2), torch.randn(2, 2)) + + edge = to_edge( + export(model, inputs), + compile_config=EdgeCompileConfig(_check_ir_validity=False), + ) + lowered = edge.to_backend(DevicePartitioner()) + et_prog = lowered.to_executorch( + config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True), + ) + program = et_prog._emitter_output.program + + plan = program.execution_plan[0] + self.assertIsNotNone( + plan.non_const_buffer_device, + "non_const_buffer_device should be set when device tensors are present " + "and enable_non_cpu_memory_planning is True", + ) + self.assertGreater(len(plan.non_const_buffer_device), 0) + for entry in plan.non_const_buffer_device: + self.assertEqual(entry.device_type, schema.DeviceType.CUDA) + self.assertEqual(entry.device_index, 0) + + def test_emit_non_const_buffer_device_none_for_cpu_only(self) -> None: + """When all tensors are on CPU, non_const_buffer_device should be None + even with enable_non_cpu_memory_planning=True.""" + + class Model(torch.nn.Module): + def forward(self, a, b): + return torch.add(a, b) + + model = Model() + inputs = (torch.randn(2, 2), torch.randn(2, 2)) + + edge = to_edge( + export(model, inputs), + compile_config=EdgeCompileConfig(_check_ir_validity=False), + ) + et_prog = edge.to_executorch( + config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True), + ) + program = et_prog._emitter_output.program + + plan = program.execution_plan[0] + self.assertIsNone( + plan.non_const_buffer_device, + "non_const_buffer_device should be None for CPU-only programs", + ) + + def test_emit_non_const_buffer_device_none_when_flag_disabled(self) -> None: + """Even with device tensors, non_const_buffer_device should be None when + enable_non_cpu_memory_planning is False (default).""" + from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import ( + generate_pattern_op_partitions, + ) + from executorch.exir.backend.compile_spec_schema import CompileSpec + from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, + ) + from executorch.exir.backend.test.backend_with_compiler_demo import ( + BackendWithCompilerDemo, + ) + from executorch.exir.passes.propagate_device_pass import ( + TARGET_DEVICE_COMPILE_SPEC_KEY, + ) + from torch.fx.passes.operator_support import any_chain, OperatorSupportBase + + class AddSupport(OperatorSupportBase): + def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: + return node.op == "call_function" and node.target in [ + exir_ops.edge.aten.add.Tensor, + ] + + class DevicePartitioner(Partitioner): + def __init__(self): + super().__init__() + self.delegation_spec = DelegationSpec( + BackendWithCompilerDemo.__name__, + [ + CompileSpec("max_value", bytes([4])), + CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"), + ], + ) + + def partition(self, exported_program) -> PartitionResult: + partition_tags = {} + partition_list = generate_pattern_op_partitions( + exported_program.graph_module, + op_support=any_chain(AddSupport()), + ) + for partition in partition_list: + for node in partition.nodes: + tag = f"tag{partition.id}" + node.meta["delegation_tag"] = tag + partition_tags[tag] = self.delegation_spec + return PartitionResult( + tagged_exported_program=exported_program, + partition_tags=partition_tags, + ) + + class Model(torch.nn.Module): + def forward(self, a, b): + return torch.add(a, b) + + model = Model() + inputs = (torch.randn(2, 2), torch.randn(2, 2)) + + edge = to_edge( + export(model, inputs), + compile_config=EdgeCompileConfig(_check_ir_validity=False), + ) + lowered = edge.to_backend(DevicePartitioner()) + # Default: enable_non_cpu_memory_planning=False + et_prog = lowered.to_executorch() + program = et_prog._emitter_output.program + + plan = program.execution_plan[0] + self.assertIsNone( + plan.non_const_buffer_device, + "non_const_buffer_device should be None when " + "enable_non_cpu_memory_planning is False", + ) From 9dc075bff1e8d77a41a549c5e06e9f68d4d80d41 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 24 Mar 2026 14:59:55 -0700 Subject: [PATCH 08/12] [ET Device Support] DeviceMemoryBuffer RAII class for device memory lifetime management Introduces DeviceMemoryBuffer, an RAII wrapper that owns a single device memory allocation. On destruction, it automatically calls DeviceAllocator::deallocate() to free the memory. This mirrors the role of std::vector for CPU planned buffers, but for non-cpu device memory (CUDA, etc.). Key features: - Static factory create(size, type, index) looks up DeviceAllocator from registry - Move-only semantics (no copy) to enforce single ownership - as_span() accessor wraps device pointer for use with HierarchicalAllocator - Destructor is no-op for default-constructed or moved-from instances Differential Revision: [D97850709](https://our.internmc.facebook.com/intern/diff/D97850709/) [ghstack-poisoned] --- runtime/core/device_memory_buffer.cpp | 34 ++++ runtime/core/device_memory_buffer.h | 126 +++++++++++++ runtime/core/portable_type/targets.bzl | 1 + runtime/core/targets.bzl | 27 +++ .../core/test/device_memory_buffer_test.cpp | 169 ++++++++++++++++++ runtime/core/test/targets.bzl | 8 + 6 files changed, 365 insertions(+) create mode 100644 runtime/core/device_memory_buffer.cpp create mode 100644 runtime/core/device_memory_buffer.h create mode 100644 runtime/core/test/device_memory_buffer_test.cpp diff --git a/runtime/core/device_memory_buffer.cpp b/runtime/core/device_memory_buffer.cpp new file mode 100644 index 00000000000..7eb3f0e3ae2 --- /dev/null +++ b/runtime/core/device_memory_buffer.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch::runtime { + +Result DeviceMemoryBuffer::create( + size_t size, + etensor::DeviceType type, + etensor::DeviceIndex index) { + DeviceAllocator* allocator = get_device_allocator(type); + if (allocator == nullptr) { + ET_LOG( + Error, + "No device allocator registered for device type %d", + static_cast(type)); + return Error::NotFound; + } + + auto result = allocator->allocate(size, index); + if (!result.ok()) { + return result.error(); + } + + return DeviceMemoryBuffer(result.get(), size, allocator, index); +} + +} // namespace executorch::runtime diff --git a/runtime/core/device_memory_buffer.h b/runtime/core/device_memory_buffer.h new file mode 100644 index 00000000000..7071f3de58d --- /dev/null +++ b/runtime/core/device_memory_buffer.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace executorch::runtime { + +/** + * RAII wrapper that owns a single device memory allocation. + * + * On destruction, calls DeviceAllocator::deallocate() to free the memory. + * This mirrors the role of std::vector for CPU planned buffers, + * but for device memory (CUDA, etc.). + * + * Move-only: cannot be copied, but can be moved to transfer ownership. + */ +class DeviceMemoryBuffer final { + public: + /** + * Creates a DeviceMemoryBuffer by allocating device memory. + * + * Looks up the DeviceAllocator for the given device type via the + * DeviceAllocatorRegistry. If no allocator is registered for the type, + * returns Error::NotFound. + * + * @param size Number of bytes to allocate. + * @param type The device type (e.g., CUDA). + * @param index The device index (e.g., 0 for cuda:0). + * @return A Result containing the DeviceMemoryBuffer on success, or an error. + */ + static Result create( + size_t size, + etensor::DeviceType type, + etensor::DeviceIndex index = 0); + + DeviceMemoryBuffer() = default; + + ~DeviceMemoryBuffer() { + if (ptr_ != nullptr && allocator_ != nullptr) { + allocator_->deallocate(ptr_, device_index_); + } + } + + // Move constructor: transfer ownership. + DeviceMemoryBuffer(DeviceMemoryBuffer&& other) noexcept + : ptr_(other.ptr_), + size_(other.size_), + allocator_(other.allocator_), + device_index_(other.device_index_) { + other.ptr_ = nullptr; + other.size_ = 0; + other.allocator_ = nullptr; + } + + // Move assignment: release current, take ownership. + DeviceMemoryBuffer& operator=(DeviceMemoryBuffer&& other) noexcept { + if (this != &other) { + if (ptr_ != nullptr && allocator_ != nullptr) { + allocator_->deallocate(ptr_, device_index_); + } + ptr_ = other.ptr_; + size_ = other.size_; + allocator_ = other.allocator_; + device_index_ = other.device_index_; + other.ptr_ = nullptr; + other.size_ = 0; + other.allocator_ = nullptr; + } + return *this; + } + + // Non-copyable. + DeviceMemoryBuffer(const DeviceMemoryBuffer&) = delete; + DeviceMemoryBuffer& operator=(const DeviceMemoryBuffer&) = delete; + + /// Returns the device pointer, or nullptr if empty/moved-from. + void* data() const { + return ptr_; + } + + /// Returns the size in bytes of the allocation. + size_t size() const { + return size_; + } + + /** + * Returns a Span wrapping the device pointer. + * + * This is intended for use with HierarchicalAllocator, which only performs + * pointer arithmetic on the span data and never dereferences it. Device + * pointers are valid for pointer arithmetic from the CPU side. + */ + Span as_span() const { + return {static_cast(ptr_), size_}; + } + + private: + DeviceMemoryBuffer( + void* ptr, + size_t size, + DeviceAllocator* allocator, + etensor::DeviceIndex device_index) + : ptr_(ptr), + size_(size), + allocator_(allocator), + device_index_(device_index) {} + + void* ptr_ = nullptr; + size_t size_ = 0; + DeviceAllocator* allocator_ = nullptr; + etensor::DeviceIndex device_index_ = 0; +}; + +} // namespace executorch::runtime diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 5b6e67fa213..33f18c68006 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -27,6 +27,7 @@ def define_common_targets(): "//executorch/backends/...", "//executorch/extension/fb/dynamic_shim/...", "//executorch/kernels/portable/cpu/...", + "//executorch/runtime/core/...", "//executorch/runtime/core/exec_aten/...", "//executorch/runtime/core/portable_type/test/...", ], diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl index 2c13cdbdae3..9b40e947626 100644 --- a/runtime/core/targets.bzl +++ b/runtime/core/targets.bzl @@ -141,6 +141,33 @@ def define_common_targets(): visibility = ["//executorch/..."], ) + runtime.cxx_library( + name = "device_allocator", + srcs = ["device_allocator.cpp"], + exported_headers = [ + "device_allocator.h", + ], + exported_deps = [ + ":core", + "//executorch/runtime/core/portable_type:portable_type", + ], + deps = [ + "//executorch/runtime/platform:platform", + ], + visibility = ["PUBLIC"], + ) + + runtime.cxx_library( + name = "device_memory_buffer", + srcs = ["device_memory_buffer.cpp"], + exported_headers = ["device_memory_buffer.h"], + exported_deps = [ + ":core", + ":device_allocator", + ], + visibility = ["PUBLIC"], + ) + runtime.cxx_library( name = "tag", srcs = ["tag.cpp"], diff --git a/runtime/core/test/device_memory_buffer_test.cpp b/runtime/core/test/device_memory_buffer_test.cpp new file mode 100644 index 00000000000..81d0a757cf4 --- /dev/null +++ b/runtime/core/test/device_memory_buffer_test.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +using executorch::runtime::DeviceAllocator; +using executorch::runtime::DeviceMemoryBuffer; +using executorch::runtime::Error; +using executorch::runtime::Result; +using executorch::runtime::get_device_allocator; +using executorch::runtime::register_device_allocator; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +/** + * A mock DeviceAllocator for testing DeviceMemoryBuffer. + * Returns pointers into a local buffer and tracks call counts. + */ +class MockAllocator : public DeviceAllocator { + public: + explicit MockAllocator(DeviceType type) : type_(type) {} + + Result allocate(size_t nbytes, DeviceIndex index) override { + allocate_count_++; + last_allocate_size_ = nbytes; + return static_cast(buffer_); + } + + void deallocate(void* ptr, DeviceIndex index) override { + deallocate_count_++; + last_deallocate_ptr_ = ptr; + } + + Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + return Error::Ok; + } + + Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + return Error::Ok; + } + + DeviceType device_type() const override { + return type_; + } + + int allocate_count_ = 0; + int deallocate_count_ = 0; + size_t last_allocate_size_ = 0; + void* last_deallocate_ptr_ = nullptr; + uint8_t buffer_[256] = {}; + + private: + DeviceType type_; +}; + +// Global mock registered once before all tests run. +static MockAllocator g_mock_cuda(DeviceType::CUDA); + +class DeviceMemoryBufferTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + register_device_allocator(DeviceType::CUDA, &g_mock_cuda); + } + + void SetUp() override { + // Reset counters before each test. + g_mock_cuda.allocate_count_ = 0; + g_mock_cuda.deallocate_count_ = 0; + g_mock_cuda.last_allocate_size_ = 0; + g_mock_cuda.last_deallocate_ptr_ = nullptr; + } +}; + +TEST_F(DeviceMemoryBufferTest, DefaultConstructedIsEmpty) { + DeviceMemoryBuffer buf; + EXPECT_EQ(buf.data(), nullptr); + EXPECT_EQ(buf.size(), 0); + + auto span = buf.as_span(); + EXPECT_EQ(span.data(), nullptr); + EXPECT_EQ(span.size(), 0); +} + +TEST_F(DeviceMemoryBufferTest, CreateAllocatesAndDestructorDeallocates) { + { + auto result = DeviceMemoryBuffer::create(1024, DeviceType::CUDA, 0); + ASSERT_TRUE(result.ok()); + + auto buf = std::move(result.get()); + EXPECT_NE(buf.data(), nullptr); + EXPECT_EQ(buf.size(), 1024); + EXPECT_EQ(g_mock_cuda.allocate_count_, 1); + EXPECT_EQ(g_mock_cuda.last_allocate_size_, 1024); + EXPECT_EQ(g_mock_cuda.deallocate_count_, 0); + } + EXPECT_EQ(g_mock_cuda.deallocate_count_, 1); + EXPECT_EQ(g_mock_cuda.last_deallocate_ptr_, g_mock_cuda.buffer_); +} + +TEST_F(DeviceMemoryBufferTest, CreateFailsWithNoRegisteredAllocator) { + auto result = DeviceMemoryBuffer::create(512, DeviceType::CPU, 0); + EXPECT_FALSE(result.ok()); + EXPECT_EQ(result.error(), Error::NotFound); +} + +TEST_F(DeviceMemoryBufferTest, MoveConstructorTransfersOwnership) { + auto result = DeviceMemoryBuffer::create(256, DeviceType::CUDA, 0); + ASSERT_TRUE(result.ok()); + auto original = std::move(result.get()); + void* original_ptr = original.data(); + + DeviceMemoryBuffer moved(std::move(original)); + + EXPECT_EQ(original.data(), nullptr); + EXPECT_EQ(original.size(), 0); + EXPECT_EQ(moved.data(), original_ptr); + EXPECT_EQ(moved.size(), 256); + EXPECT_EQ(g_mock_cuda.deallocate_count_, 0); +} + +TEST_F(DeviceMemoryBufferTest, MoveAssignmentTransfersOwnership) { + auto result = DeviceMemoryBuffer::create(128, DeviceType::CUDA, 0); + ASSERT_TRUE(result.ok()); + auto original = std::move(result.get()); + void* original_ptr = original.data(); + + DeviceMemoryBuffer target; + target = std::move(original); + + EXPECT_EQ(original.data(), nullptr); + EXPECT_EQ(target.data(), original_ptr); + EXPECT_EQ(target.size(), 128); + EXPECT_EQ(g_mock_cuda.deallocate_count_, 0); +} + +TEST_F(DeviceMemoryBufferTest, DestructorNoOpForDefaultConstructed) { + { + DeviceMemoryBuffer buf; + } + EXPECT_EQ(g_mock_cuda.deallocate_count_, 0); +} + +TEST_F(DeviceMemoryBufferTest, AsSpanWrapsDevicePointer) { + auto result = DeviceMemoryBuffer::create(2048, DeviceType::CUDA, 0); + ASSERT_TRUE(result.ok()); + auto buf = std::move(result.get()); + + auto span = buf.as_span(); + EXPECT_EQ(span.data(), static_cast(buf.data())); + EXPECT_EQ(span.size(), 2048); +} diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl index 1ad0940c62e..0436d3e10dd 100644 --- a/runtime/core/test/targets.bzl +++ b/runtime/core/test/targets.bzl @@ -7,6 +7,14 @@ def define_common_targets(): TARGETS and BUCK files that call this function. """ + runtime.cxx_test( + name = "device_memory_buffer_test", + srcs = ["device_memory_buffer_test.cpp"], + deps = [ + "//executorch/runtime/core:device_memory_buffer", + ], + ) + runtime.cxx_test( name = "span_test", srcs = ["span_test.cpp"], From 1fdbbd16da03c92b7b71e90f03acb6dbe5bb9ab9 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 24 Mar 2026 15:01:28 -0700 Subject: [PATCH 09/12] [ET Device Support] MethodMeta: expose per-buffer device placement API Add memory_planned_buffer_device(index) to MethodMeta, returning the Device (type + index) for each planned memory buffer. This reads from the non_const_buffer_device field in the serialized ExecutionPlan. For CPU-only programs (or legacy PTE files without non_const_buffer_device), all buffers default to Device{CPU, 0}. The sparse list only stores entries for non-CPU buffers, so the lookup scans for a matching buffer_idx. This API enables Module::load_method() to query each buffer's target device and allocate accordingly (malloc for CPU, DeviceAllocator for CUDA, etc.). Differential Revision: [D97850708](https://our.internmc.facebook.com/intern/diff/D97850708/) [ghstack-poisoned] --- runtime/core/test/targets.bzl | 2 +- runtime/executor/method_meta.cpp | 36 +++++++++++++ runtime/executor/method_meta.h | 14 +++++ runtime/executor/test/method_meta_test.cpp | 51 +++++++++++++++++++ runtime/executor/test/targets.bzl | 7 ++- .../models/export_program_with_device_info.py | 7 ++- 6 files changed, 114 insertions(+), 3 deletions(-) diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl index 0436d3e10dd..c4da8cc37de 100644 --- a/runtime/core/test/targets.bzl +++ b/runtime/core/test/targets.bzl @@ -50,7 +50,7 @@ def define_common_targets(): "//executorch/runtime/core:core", ], ) - + runtime.cxx_test( name = "event_tracer_test", srcs = [ diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp index 75dadfd893a..ca1b3254338 100644 --- a/runtime/executor/method_meta.cpp +++ b/runtime/executor/method_meta.cpp @@ -325,6 +325,42 @@ Result MethodMeta::memory_planned_buffer_size(size_t index) const { return s_plan_->non_const_buffer_sizes()->Get(index + 1); } +Result MethodMeta::memory_planned_buffer_device( + size_t index) const { + auto num_buffers = this->num_memory_planned_buffers(); + ET_CHECK_OR_RETURN_ERROR( + index < num_buffers, + InvalidArgument, + "index %zu out of range. num_buffers: %zu", + index, + num_buffers); + + // The non_const_buffer_device field is optional and only present when the + // program contains non-CPU buffers. For CPU-only programs (or legacy PTE + // files), this field is null and all buffers default to CPU. + auto* buffer_devices = s_plan_->non_const_buffer_device(); + if (buffer_devices == nullptr) { + return etensor::Device{etensor::DeviceType::CPU, 0}; + } + + // The sparse list only contains entries for non-CPU buffers. + // buffer_idx uses the same indexing as non_const_buffer_sizes (1-based, + // with index 0 reserved). The user-facing index is 0-based, so we + // compare against index + 1. + const auto internal_idx = static_cast(index + 1); + for (size_t i = 0; i < buffer_devices->size(); ++i) { + auto entry = buffer_devices->Get(i); + if (entry->buffer_idx() == internal_idx) { + return etensor::Device{ + static_cast(entry->device_type()), + static_cast(entry->device_index())}; + } + } + + // Not found in the sparse list — this buffer is on CPU. + return etensor::Device{etensor::DeviceType::CPU, 0}; +} + bool MethodMeta::uses_backend(const char* backend_name) const { ET_CHECK_MSG(backend_name, "backend name is null"); const auto delegates = s_plan_->delegates(); diff --git a/runtime/executor/method_meta.h b/runtime/executor/method_meta.h index 79fd05c28ee..e0fa16cda22 100644 --- a/runtime/executor/method_meta.h +++ b/runtime/executor/method_meta.h @@ -9,6 +9,7 @@ #pragma once #include +#include #include #include #include @@ -234,6 +235,19 @@ class MethodMeta final { */ Result memory_planned_buffer_size(size_t index) const; + /** + * Get the device placement for the specified memory-planned buffer. + * + * For CPU-only programs (no non_const_buffer_device in the PTE), all buffers + * default to Device{CPU, 0}. For programs with device annotations, returns + * the device type and index that the buffer should be allocated on. + * + * @param[in] index The index of the buffer to look up (0-based, same + * indexing as memory_planned_buffer_size()). + * @returns The Device on success, or an error on failure. + */ + Result memory_planned_buffer_device(size_t index) const; + /** * Check to see if a backend is used in this method. * diff --git a/runtime/executor/test/method_meta_test.cpp b/runtime/executor/test/method_meta_test.cpp index e4ef2e72a85..4b2fdb26da2 100644 --- a/runtime/executor/test/method_meta_test.cpp +++ b/runtime/executor/test/method_meta_test.cpp @@ -74,6 +74,10 @@ class MethodMetaTest : public ::testing::Test { void SetUp() override { load_program(std::getenv("ET_MODULE_ADD_PATH"), "add"); load_program(std::getenv("ET_MODULE_STATEFUL_PATH"), "stateful"); + const char* device_path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); + if (device_path != nullptr) { + load_program(device_path, "add_with_device"); + } } private: @@ -192,6 +196,27 @@ TEST_F(MethodMetaTest, MethodMetaAttribute) { ASSERT_EQ(bad_access.error(), Error::InvalidArgument); } +TEST_F(MethodMetaTest, MemoryPlannedBufferDeviceDefaultsCpu) { + Result method_meta = programs_["add"]->method_meta("forward"); + ASSERT_EQ(method_meta.error(), Error::Ok); + + // CPU-only model: all buffers should default to CPU device. + size_t num_buffers = method_meta->num_memory_planned_buffers(); + ASSERT_GT(num_buffers, 0); + + for (size_t i = 0; i < num_buffers; ++i) { + auto device = method_meta->memory_planned_buffer_device(i); + ASSERT_TRUE(device.ok()); + EXPECT_EQ(device->type(), executorch::runtime::etensor::DeviceType::CPU); + EXPECT_EQ(device->index(), 0); + } + + // Out of range returns error. + EXPECT_EQ( + method_meta->memory_planned_buffer_device(num_buffers).error(), + Error::InvalidArgument); +} + TEST_F(MethodMetaTest, TensorInfoSizeOverflow) { // Create sizes that will cause overflow when multiplied std::vector overflow_sizes = { @@ -214,3 +239,29 @@ TEST_F(MethodMetaTest, TensorInfoSizeOverflow) { executorch::aten::string_view{nullptr, 0}), ""); } + +TEST_F(MethodMetaTest, MethodMetaBufferDeviceReturnsCudaForDeviceBuffer) { + ASSERT_NE(programs_.find("add_with_device"), programs_.end()) + << "ET_MODULE_ADD_WITH_DEVICE_PATH env var not set"; + Result method_meta = + programs_["add_with_device"]->method_meta("forward"); + ASSERT_EQ(method_meta.error(), Error::Ok); + + // ModuleAddWithDevice exports with enable_non_cpu_memory_planning=True. + // The model delegates add(a,b) to CUDA, producing: + // non_const_buffer_sizes: [0, 48] (index 0 reserved) + // non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}] + // So there is exactly 1 planned buffer (user-facing index 0), on CUDA. + ASSERT_EQ(method_meta->num_memory_planned_buffers(), 1); + + // Buffer 0 should be CUDA device. + auto device = method_meta->memory_planned_buffer_device(0); + ASSERT_TRUE(device.ok()); + EXPECT_EQ(device->type(), executorch::runtime::etensor::DeviceType::CUDA); + EXPECT_EQ(device->index(), 0); + + // Out of range should return error. + EXPECT_EQ( + method_meta->memory_planned_buffer_device(1).error(), + Error::InvalidArgument); +} diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index f4534aefdea..74ea9a8262d 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -178,7 +178,12 @@ def define_common_targets(is_fbcode = False): "//executorch/runtime/executor:program", "//executorch/extension/data_loader:file_data_loader", ], - env = modules_env, + env = dict( + modules_env, + **{ + "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])", + } + ), ) runtime.cxx_test( diff --git a/test/models/export_program_with_device_info.py b/test/models/export_program_with_device_info.py index 1abf73bfb73..246c41bb9f3 100644 --- a/test/models/export_program_with_device_info.py +++ b/test/models/export_program_with_device_info.py @@ -99,7 +99,12 @@ def main() -> None: compile_config=EdgeCompileConfig(_check_ir_validity=False), ) lowered = edge.to_backend(_DeviceAwarePartitioner()) - et_prog = lowered.to_executorch(ExecutorchBackendConfig(emit_stacktrace=False)) + et_prog = lowered.to_executorch( + ExecutorchBackendConfig( + emit_stacktrace=False, + enable_non_cpu_memory_planning=True, + ) + ) os.makedirs(args.outdir, exist_ok=True) outfile = os.path.join(args.outdir, "ModuleAddWithDevice.pte") From 6af305941c640ed3ebd329cdc2172ae3c06141a8 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 24 Mar 2026 15:01:37 -0700 Subject: [PATCH 10/12] [ET Device Support] MemoryManager: add per-buffer device metadata This diff extend MemoryManager with optional per-buffer device type metadata so the runtime explicitly knows which planned memory buffers are on which device. This enables future device-aware dispatch and debugging. Changes: - New constructor taking planned_buffer_devices as extra input for device info - New accessors: planned_buffer_devices(), has_device_memory() - No existing functionalities have been updated. Differential Revision: [D97850706](https://our.internmc.facebook.com/intern/diff/D97850706/) [ghstack-poisoned] --- runtime/core/portable_type/targets.bzl | 1 + runtime/executor/memory_manager.h | 46 +++++ runtime/executor/targets.bzl | 1 + runtime/executor/test/memory_manager_test.cpp | 44 +++++ runtime/executor/test/targets.bzl | 3 + .../test/tensor_parser_device_test.cpp | 170 ++++++++++++++++++ 6 files changed, 265 insertions(+) diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 33f18c68006..66fa9986e15 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -28,6 +28,7 @@ def define_common_targets(): "//executorch/extension/fb/dynamic_shim/...", "//executorch/kernels/portable/cpu/...", "//executorch/runtime/core/...", + "//executorch/runtime/executor/...", "//executorch/runtime/core/exec_aten/...", "//executorch/runtime/core/portable_type/test/...", ], diff --git a/runtime/executor/memory_manager.h b/runtime/executor/memory_manager.h index 42edd9f0bea..b80344d4c9c 100644 --- a/runtime/executor/memory_manager.h +++ b/runtime/executor/memory_manager.h @@ -10,6 +10,8 @@ #include #include +#include +#include namespace executorch { namespace runtime { @@ -61,6 +63,32 @@ class MemoryManager final { "method allocator cannot be the same as temp allocator"); } + /** + * Constructs a new MemoryManager with per-buffer device metadata. + * + * @param[in] method_allocator Same as above. + * @param[in] planned_memory Same as above. May contain a mix of CPU and + * device pointers — HierarchicalAllocator only does pointer arithmetic, + * so device pointers are valid. + * @param[in] temp_allocator Same as above. + * @param[in] planned_buffer_devices One entry per planned memory buffer + * (same count as planned_memory buffers), indicating the device type for + * each buffer. For CPU-only programs, use the 3-arg constructor instead. + */ + MemoryManager( + MemoryAllocator* method_allocator, + HierarchicalAllocator* planned_memory, + MemoryAllocator* temp_allocator, + Span planned_buffer_devices) + : method_allocator_(method_allocator), + planned_memory_(planned_memory), + temp_allocator_(temp_allocator), + planned_buffer_devices_(planned_buffer_devices) { + ET_CHECK_MSG( + method_allocator != temp_allocator, + "method allocator cannot be the same as temp allocator"); + } + /** * DEPRECATED: Use the constructor without `constant_allocator` instead. * @@ -105,10 +133,28 @@ class MemoryManager final { return temp_allocator_; } + /** + * Returns per-buffer device metadata. One entry per planned memory buffer, + * same count as planned_memory buffers. Empty if no device metadata was + * provided (CPU-only program). + */ + Span planned_buffer_devices() const { + return planned_buffer_devices_; + } + + /** + * Returns true if any planned buffer is on a non-CPU device. + * When false, the memory setup is CPU-only and follows the legacy path. + */ + bool has_device_memory() const { + return planned_buffer_devices_.size() > 0; + } + private: MemoryAllocator* method_allocator_; HierarchicalAllocator* planned_memory_; MemoryAllocator* temp_allocator_; + Span planned_buffer_devices_; }; } // namespace runtime diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index 90f8d0221e9..2441c55b58e 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -36,6 +36,7 @@ def define_common_targets(): ], exported_deps = [ "//executorch/runtime/core:memory_allocator", + "//executorch/runtime/core/portable_type:portable_type", ], visibility = ["PUBLIC"], ) diff --git a/runtime/executor/test/memory_manager_test.cpp b/runtime/executor/test/memory_manager_test.cpp index 0e1feb47793..3a7a07d145a 100644 --- a/runtime/executor/test/memory_manager_test.cpp +++ b/runtime/executor/test/memory_manager_test.cpp @@ -17,6 +17,8 @@ using namespace ::testing; using executorch::runtime::HierarchicalAllocator; using executorch::runtime::MemoryAllocator; using executorch::runtime::MemoryManager; +using executorch::runtime::Span; +using executorch::runtime::etensor::DeviceType; TEST(MemoryManagerTest, MinimalCtor) { MemoryAllocator method_allocator(0, nullptr); @@ -93,3 +95,45 @@ TEST(MemoryManagerTest, CtorWithSameAllocator) { /*temp_allocator=*/&method_allocator), "cannot be the same"); } + +TEST(MemoryManagerTest, ThreeArgCtorHasNoDeviceMemory) { + MemoryAllocator method_allocator(0, nullptr); + HierarchicalAllocator planned_memory({}); + MemoryAllocator temp_allocator(0, nullptr); + + MemoryManager mm(&method_allocator, &planned_memory, &temp_allocator); + + EXPECT_FALSE(mm.has_device_memory()); + EXPECT_EQ(mm.planned_buffer_devices().size(), 0); +} + +TEST(MemoryManagerTest, FourArgCtorWithDeviceMetadata) { + MemoryAllocator method_allocator(0, nullptr); + HierarchicalAllocator planned_memory({}); + MemoryAllocator temp_allocator(0, nullptr); + + // 3 buffers: CPU, CUDA, CPU + DeviceType devices[] = {DeviceType::CPU, DeviceType::CUDA, DeviceType::CPU}; + Span device_span(devices, 3); + + MemoryManager mm( + &method_allocator, &planned_memory, &temp_allocator, device_span); + + EXPECT_EQ(mm.method_allocator(), &method_allocator); + EXPECT_EQ(mm.planned_memory(), &planned_memory); + EXPECT_EQ(mm.temp_allocator(), &temp_allocator); + EXPECT_TRUE(mm.has_device_memory()); + EXPECT_EQ(mm.planned_buffer_devices().size(), 3); + EXPECT_EQ(mm.planned_buffer_devices()[0], DeviceType::CPU); + EXPECT_EQ(mm.planned_buffer_devices()[1], DeviceType::CUDA); + EXPECT_EQ(mm.planned_buffer_devices()[2], DeviceType::CPU); +} + +TEST(MemoryManagerTest, MinimalCtorHasNoDeviceMemory) { + MemoryAllocator method_allocator(0, nullptr); + + MemoryManager mm(&method_allocator); + + EXPECT_FALSE(mm.has_device_memory()); + EXPECT_EQ(mm.planned_buffer_devices().size(), 0); +} diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index 74ea9a8262d..32baa63a76b 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -19,6 +19,7 @@ def define_common_targets(is_fbcode = False): "//executorch/exir/backend/test/...", "//executorch/runtime/backend/...", "//executorch/extension/pybindings/...", + "//executorch/extension/module/test/...", "//executorch/devtools/fb/runners/...", "//executorch/test/...", "//executorch/examples/...", @@ -326,6 +327,8 @@ def define_common_targets(is_fbcode = False): deps = [ ":managed_memory_manager", "//executorch/runtime/executor:program", + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/core:device_memory_buffer", "//executorch/extension/data_loader:file_data_loader", "//executorch/schema:program", ], diff --git a/runtime/executor/test/tensor_parser_device_test.cpp b/runtime/executor/test/tensor_parser_device_test.cpp index 46488eacd0b..6baf525aa12 100644 --- a/runtime/executor/test/tensor_parser_device_test.cpp +++ b/runtime/executor/test/tensor_parser_device_test.cpp @@ -17,18 +17,32 @@ #include #include +#include +#include #include #include +#include #include #include using executorch::aten::Tensor; +using executorch::runtime::DeviceAllocator; +using executorch::runtime::DeviceMemoryBuffer; using executorch::runtime::Error; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::MethodMeta; using executorch::runtime::Program; using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::get_device_allocator; +using executorch::runtime::register_device_allocator; using executorch::runtime::deserialization::parseTensor; using executorch::runtime::testing::ManagedMemoryManager; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; using torch::executor::util::FileDataLoader; constexpr size_t kDefaultNonConstMemBytes = 32 * 1024U; @@ -50,8 +64,67 @@ class ProgramTestFriend final { using executorch::runtime::testing::ProgramTestFriend; +namespace { + +/** + * Mock CUDA allocator that uses host memory for testing. + * Tracks the allocated range so tests can verify tensor data_ptr + * falls within the "device" memory region. + */ +class MockCudaAllocator : public DeviceAllocator { + public: + Result allocate(size_t nbytes, DeviceIndex index) override { + allocate_count_++; + buffer_ = std::make_unique(nbytes); + buffer_size_ = nbytes; + return static_cast(buffer_.get()); + } + + void deallocate(void* ptr, DeviceIndex index) override { + deallocate_count_++; + buffer_.reset(); + buffer_size_ = 0; + } + + Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override { + return Error::Ok; + } + + Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override { + return Error::Ok; + } + + DeviceType device_type() const override { + return DeviceType::CUDA; + } + + bool is_device_ptr(const void* ptr) const { + if (buffer_ == nullptr || buffer_size_ == 0) { + return false; + } + auto* p = static_cast(ptr); + return p >= buffer_.get() && p < buffer_.get() + buffer_size_; + } + + int allocate_count_ = 0; + int deallocate_count_ = 0; + + private: + std::unique_ptr buffer_; + size_t buffer_size_ = 0; +}; + +} // namespace + +static MockCudaAllocator g_mock_cuda; + class TensorParserDeviceTest : public ::testing::Test { protected: + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + register_device_allocator(DeviceType::CUDA, &g_mock_cuda); + } + void SetUp() override { const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); ASSERT_NE(path, nullptr) @@ -59,6 +132,9 @@ class TensorParserDeviceTest : public ::testing::Test { Result loader = FileDataLoader::from(path); ASSERT_EQ(loader.error(), Error::Ok); loader_ = std::make_unique(std::move(loader.get())); + + g_mock_cuda.allocate_count_ = 0; + g_mock_cuda.deallocate_count_ = 0; } std::unique_ptr loader_; @@ -169,3 +245,97 @@ TEST_F(TensorParserDeviceTest, NonDelegatedTensorsDefaultToCPU) { << " without device annotation should have device_index=0"; } } +TEST_F(TensorParserDeviceTest, CudaTensorDataPtrPointsToDeviceMemory) { + Result program = + Program::load(loader_.get(), Program::Verification::Minimal); + ASSERT_EQ(program.error(), Error::Ok); + + Result method_meta = program->method_meta("forward"); + ASSERT_EQ(method_meta.error(), Error::Ok); + + // ModuleAddWithDevice has: + // non_const_buffer_sizes: [0, 48] (index 0 reserved, buffer 0 = 48 bytes) + // non_const_buffer_device: [{buffer_idx=1, device_type=CUDA}] + const size_t num_buffers = method_meta->num_memory_planned_buffers(); + ASSERT_EQ(num_buffers, 1); + + // Set up device-aware planned memory. + std::vector> planned_spans; + std::vector> cpu_buffers; + std::vector device_buffers; + + for (size_t i = 0; i < num_buffers; ++i) { + auto size = method_meta->memory_planned_buffer_size(i); + ASSERT_TRUE(size.ok()); + auto device = method_meta->memory_planned_buffer_device(i); + ASSERT_TRUE(device.ok()); + + if (device->is_cpu()) { + cpu_buffers.emplace_back(size.get()); + planned_spans.emplace_back( + cpu_buffers.back().data(), cpu_buffers.back().size()); + } else { + cpu_buffers.emplace_back(); // empty placeholder + auto dmb = DeviceMemoryBuffer::create( + size.get(), device->type(), device->index()); + ASSERT_TRUE(dmb.ok()) + << "DeviceMemoryBuffer::create failed for buffer " << i; + planned_spans.emplace_back(dmb->as_span()); + device_buffers.push_back(std::move(dmb.get())); + } + } + + ASSERT_EQ(g_mock_cuda.allocate_count_, 1); + + // Build HierarchicalAllocator with mixed CPU/device spans. + HierarchicalAllocator planned_memory( + {planned_spans.data(), planned_spans.size()}); + + constexpr size_t kMethodAllocBytes = 32 * 1024U; + auto method_alloc_pool = std::make_unique(kMethodAllocBytes); + MemoryAllocator method_allocator(kMethodAllocBytes, method_alloc_pool.get()); + MemoryManager memory_manager(&method_allocator, &planned_memory); + + // Parse tensors and verify CUDA tensors have device memory. + const executorch_flatbuffer::Program* internal_program = + ProgramTestFriend::GetInternalProgram(&program.get()); + auto* execution_plan = + internal_program->execution_plan()->GetMutableObject(0); + auto* flatbuffer_values = execution_plan->values(); + + int cuda_with_device_memory = 0; + + for (size_t i = 0; i < flatbuffer_values->size(); ++i) { + auto* serialization_value = flatbuffer_values->Get(i); + if (serialization_value->val_type() != + executorch_flatbuffer::KernelTypes::Tensor) { + continue; + } + + auto* s_tensor = serialization_value->val_as_Tensor(); + bool is_cuda = s_tensor->extra_tensor_info() != nullptr && + s_tensor->extra_tensor_info()->device_type() == + executorch_flatbuffer::DeviceType::CUDA; + + Result tensor = + parseTensor(&program.get(), &memory_manager, s_tensor); + ASSERT_TRUE(tensor.ok()) + << "parseTensor failed at index " << i + << " with error 0x" << std::hex + << static_cast(tensor.error()); + + Tensor t = tensor.get(); + + if (is_cuda && t.unsafeGetTensorImpl()->device_type() == DeviceType::CUDA) { + EXPECT_TRUE(g_mock_cuda.is_device_ptr(t.const_data_ptr())) + << "CUDA tensor at index " << i + << " should have data_ptr in device memory, but got CPU memory"; + cuda_with_device_memory++; + } + } + + // All 3 CUDA tensors (2 inputs + 1 output of the delegate) should have + // their data_ptr pointing to the mock device memory buffer. + EXPECT_EQ(cuda_with_device_memory, 3) + << "All 3 CUDA tensors should have data_ptr in device memory"; +} From 8bd18ab125bd1bf64482dbc80e7a72be86a3322b Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 18 May 2026 17:25:49 -0700 Subject: [PATCH 11/12] Update (base update) [ghstack-poisoned] --- runtime/executor/test/tensor_parser_device_test.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/runtime/executor/test/tensor_parser_device_test.cpp b/runtime/executor/test/tensor_parser_device_test.cpp index 918e564e093..3cd5570b42b 100644 --- a/runtime/executor/test/tensor_parser_device_test.cpp +++ b/runtime/executor/test/tensor_parser_device_test.cpp @@ -73,7 +73,12 @@ namespace { */ class MockCudaAllocator : public DeviceAllocator { public: - Result allocate(size_t nbytes, DeviceIndex index) override { + Result allocate( + size_t nbytes, + DeviceIndex index, + size_t alignement = kDefaultAlignment) override { + (void)alignement; + (void)index; allocate_count_++; buffer_ = std::make_unique(nbytes); buffer_size_ = nbytes; From b68f982d91a7a9894f39fd51ccfd196bc12bf48c Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Fri, 22 May 2026 17:54:19 -0700 Subject: [PATCH 12/12] [ET Device Support] Module: allocate device memory for planned buffers Differential Revision: D97850705 Pull Request resolved: https://github.com/pytorch/executorch/pull/18476 --- extension/module/module.cpp | 78 ++++++- extension/module/module.h | 9 + extension/module/targets.bzl | 1 + .../module/test/module_device_memory_test.cpp | 218 ++++++++++++++++++ extension/module/test/targets.bzl | 22 +- .../executorch/build/build_variables.bzl | 2 + test/models/targets.bzl | 1 + 7 files changed, 328 insertions(+), 3 deletions(-) create mode 100644 extension/module/test/module_device_memory_test.cpp diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 5422fb15b71..11fea031603 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace executorch { @@ -367,6 +368,51 @@ Module::make_planned_memory_with_shared_arenas( return planned; } +std::unique_ptr Module::make_planned_memory_with_devices( + const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta) { + auto planned = std::make_unique(); + const size_t num_buffers = method_meta.num_memory_planned_buffers(); + planned->planned_buffers.reserve(num_buffers); + planned->planned_spans.reserve(num_buffers); + planned->device_buffers.reserve(num_buffers); + planned->planned_devices.reserve(num_buffers); + + for (size_t i = 0; i < num_buffers; ++i) { + auto size = method_meta.memory_planned_buffer_size(i); + ET_CHECK_MSG(size.ok(), "Failed to get buffer size for index %zu", i); + auto device = method_meta.memory_planned_buffer_device(i); + ET_CHECK_MSG(device.ok(), "Failed to get buffer device for index %zu", i); + planned->planned_devices.push_back(device.get()); + + if (device->is_cpu()) { + planned->planned_buffers.emplace_back(size.get()); + planned->planned_spans.emplace_back( + planned->planned_buffers.back().data(), size.get()); + } else { + // Allocate device memory via DeviceAllocator and store the RAII buffer. + planned->planned_buffers.emplace_back(); // empty CPU placeholder + auto dmb = runtime::DeviceMemoryBuffer::create( + size.get(), device->type(), device->index()); + ET_CHECK_MSG( + dmb.ok(), + "Failed to allocate device memory for buffer %zu (device_type=%d)", + i, + static_cast(device->type())); + planned->planned_spans.emplace_back(dmb->as_span()); + planned->device_buffers.push_back(std::move(dmb.get())); + } + } + + // HierarchicalAllocator owns the per-buffer Device metadata so the + // MemoryManager can later expose it via planned_buffer_devices(). + planned->planned_memory = std::make_unique( + runtime::Span>( + planned->planned_spans.data(), planned->planned_spans.size()), + runtime::Span( + planned->planned_devices.data(), planned->planned_devices.size())); + return planned; +} + runtime::Result> Module::get_mem_planned_buffer_sizes( const std::string& method_name) { auto meta_res = program_->method_meta(method_name.c_str()); @@ -422,10 +468,38 @@ runtime::Error Module::load_method( MethodHolder method_holder; if (!planned_memory) { - if (!share_memory_arenas_) { + // Check if any buffers need device memory allocation. + auto meta_res = program_->method_meta(method_name.c_str()); + ET_CHECK_OK_OR_RETURN_ERROR(meta_res.error()); + auto& meta = meta_res.get(); + + bool has_device_buffers = false; + for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) { + auto dev = meta.memory_planned_buffer_device(i); + if (dev.ok() && !dev->is_cpu()) { + has_device_buffers = true; + break; + } + } + + if (has_device_buffers) { + // Device memory with shared arenas is not yet supported. + ET_CHECK_OR_RETURN_ERROR( + !share_memory_arenas_, + NotSupported, + "Device memory buffers are not yet compatible with " + "share_memory_arenas. Please disable share_memory_arenas " + "when using models with device-planned memory."); + + // Device-aware path: allocate CPU and device buffers. The device + // span is owned by the HierarchicalAllocator inside PlannedMemory. + method_holder.planned_memory = make_planned_memory_with_devices(meta); + planned_memory = method_holder.planned_memory->planned_memory.get(); + } else if (!share_memory_arenas_) { auto sizes_res = get_mem_planned_buffer_sizes(method_name); ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error()); method_holder.planned_memory = make_planned_memory(sizes_res.get()); + planned_memory = method_holder.planned_memory->planned_memory.get(); } else { auto sizes_res = get_mem_planned_buffer_sizes(method_name); ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error()); @@ -442,8 +516,8 @@ runtime::Error Module::load_method( } method_holder.planned_memory = make_planned_memory_with_shared_arenas(sizes, shared_arenas_); + planned_memory = method_holder.planned_memory->planned_memory.get(); } - planned_memory = method_holder.planned_memory->planned_memory.get(); } method_holder.memory_manager = std::make_unique( diff --git a/extension/module/module.h b/extension/module/module.h index 47ead23032e..91c7feaad9b 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -18,6 +18,8 @@ #include #include +#include + #ifdef USE_ATEN_LIB #define ET_MODULE_NAMESPACE module::aten #else // !USE_ATEN_LIB @@ -716,6 +718,11 @@ class Module { struct PlannedMemory { std::vector> planned_buffers; std::vector> planned_spans; + std::vector device_buffers; + /// Per-buffer Device (type + index) metadata used by + /// HierarchicalAllocator. Owns the storage backing the device span the + /// allocator references, so it must outlive `planned_memory`. + std::vector planned_devices; std::unique_ptr planned_memory; }; std::unique_ptr make_planned_memory( @@ -723,6 +730,8 @@ class Module { std::unique_ptr make_planned_memory_with_shared_arenas( const std::vector& buffer_sizes, std::vector>& shared_arenas); + std::unique_ptr make_planned_memory_with_devices( + const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta); runtime::Result> get_mem_planned_buffer_sizes( const std::string& method_name); runtime::Result> get_max_mem_planned_buffer_sizes(); diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl index fa80203831a..e622b138ff6 100644 --- a/extension/module/targets.bzl +++ b/extension/module/targets.bzl @@ -30,6 +30,7 @@ def define_common_targets(): "//executorch/runtime/backend:backend_options", "//executorch/runtime/backend:backend_options_map", "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix, + "//executorch/runtime/core:device_memory_buffer", ], ) diff --git a/extension/module/test/module_device_memory_test.cpp b/extension/module/test/module_device_memory_test.cpp new file mode 100644 index 00000000000..5031273ac2b --- /dev/null +++ b/extension/module/test/module_device_memory_test.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Tests that Module's device-aware memory allocation path works correctly. + * + * Uses ModuleAddWithDevice.pte which has: + * non_const_buffer_sizes: [0, 48] (1 buffer, index 0 reserved) + * non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}] + * + * Since we don't have a real CUDA backend, we test that: + * 1. CPU-only models load through Module without invoking device allocator + * 2. Device-annotated models trigger DeviceMemoryBuffer::create via a mock + */ + +#include + +#include + +#include +#include +#include + +using executorch::extension::Module; +using executorch::runtime::DeviceAllocator; +using executorch::runtime::DeviceMemoryBuffer; +using executorch::runtime::Error; +using executorch::runtime::register_device_allocator; +using executorch::runtime::Result; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +namespace { + +class MockCudaAllocator : public DeviceAllocator { + public: + Result allocate( + size_t nbytes, + DeviceIndex index, + size_t alignment = kDefaultAlignment) override { + (void)alignment; + allocate_count_++; + last_allocate_size_ = nbytes; + last_allocate_index_ = index; + buffer_ = std::make_unique(nbytes); + return static_cast(buffer_.get()); + } + + void deallocate(void* ptr, DeviceIndex index) override { + deallocate_count_++; + buffer_.reset(); + } + + Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override { + return Error::Ok; + } + + Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override { + return Error::Ok; + } + + DeviceType device_type() const override { + return DeviceType::CUDA; + } + + int allocate_count_ = 0; + int deallocate_count_ = 0; + size_t last_allocate_size_ = 0; + DeviceIndex last_allocate_index_ = -1; + + private: + std::unique_ptr buffer_; +}; + +} // namespace + +static MockCudaAllocator g_mock_cuda; + +class ModuleDeviceMemoryTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + register_device_allocator(&g_mock_cuda); + } + + void SetUp() override { + g_mock_cuda.allocate_count_ = 0; + g_mock_cuda.deallocate_count_ = 0; + g_mock_cuda.last_allocate_size_ = 0; + g_mock_cuda.last_allocate_index_ = -1; + } +}; + +TEST_F(ModuleDeviceMemoryTest, CpuOnlyModelDoesNotAllocateDeviceMemory) { + const char* path = std::getenv("ET_MODULE_ADD_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_PATH not set"; + + Module module(path); + auto err = module.load_method("forward"); + ASSERT_EQ(err, Error::Ok); + + EXPECT_EQ(g_mock_cuda.allocate_count_, 0) + << "CPU-only model should not allocate device memory"; +} + +TEST_F(ModuleDeviceMemoryTest, DeviceMemoryBufferCreateCallsAllocator) { + // Directly test DeviceMemoryBuffer::create with the registered mock. + // This verifies the RAII allocation/deallocation path that Module uses. + { + auto result = DeviceMemoryBuffer::create(48, DeviceType::CUDA, 0); + ASSERT_TRUE(result.ok()); + auto buf = std::move(result.get()); + + EXPECT_EQ(g_mock_cuda.allocate_count_, 1); + EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48); + EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0); + EXPECT_NE(buf.data(), nullptr); + EXPECT_EQ(buf.size(), 48); + + // as_span() wraps the device pointer for HierarchicalAllocator. + auto span = buf.as_span(); + EXPECT_EQ(span.data(), static_cast(buf.data())); + EXPECT_EQ(span.size(), 48); + + EXPECT_EQ(g_mock_cuda.deallocate_count_, 0); + } + // RAII deallocation on scope exit. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 1); +} + +TEST_F(ModuleDeviceMemoryTest, DeviceModelMethodMetaReportsCudaBuffer) { + // Verify MethodMeta reports the correct device for buffers in the + // device-annotated model, without needing to load the full method. + const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set"; + + Module module(path); + auto err = module.load(); + ASSERT_EQ(err, Error::Ok); + + auto meta = module.method_meta("forward"); + ASSERT_TRUE(meta.ok()); + + // ModuleAddWithDevice has 1 planned buffer (48 bytes) on CUDA. + ASSERT_EQ(meta->num_memory_planned_buffers(), 1); + + auto size = meta->memory_planned_buffer_size(0); + ASSERT_TRUE(size.ok()); + EXPECT_EQ(size.get(), 48); + + auto device = meta->memory_planned_buffer_device(0); + ASSERT_TRUE(device.ok()); + EXPECT_EQ(device->type(), DeviceType::CUDA); + EXPECT_EQ(device->index(), 0); +} + +TEST_F(ModuleDeviceMemoryTest, DeviceModelWithSharedArenasReturnsNotSupported) { + const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set"; + + // share_memory_arenas = true with a device-annotated model should fail. + Module module( + path, + Module::LoadMode::File, + /*event_tracer=*/nullptr, + /*memory_allocator=*/nullptr, + /*temp_allocator=*/nullptr, + /*share_memory_arenas=*/true); + + auto err = module.load_method("forward"); + EXPECT_EQ(err, Error::NotSupported); +} + +TEST_F( + ModuleDeviceMemoryTest, + LoadMethodAllocatesDeviceMemoryAndDeallocatesOnDestroy) { + const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set"; + + { + Module module(path); + auto err = module.load_method("forward"); + + // Regardless of whether load_method succeeds or fails (e.g. due to + // backend init issues), the device-aware memory allocation path + // (make_planned_memory_with_devices) runs BEFORE backend init. + EXPECT_EQ(g_mock_cuda.allocate_count_, 1) + << "Expected 1 device allocation for the CUDA buffer" + << " (actual: " << g_mock_cuda.allocate_count_ << ")" + << ", deallocate_count=" << g_mock_cuda.deallocate_count_ + << ", load_method returned error=" << static_cast(err); + EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48) + << "Expected 48 bytes allocated (3 CUDA tensors sharing one buffer)"; + EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0) + << "Expected device_index=0 (cuda:0)"; + + if (err == Error::Ok) { + // Success path: MethodHolder moved into methods_ map. + // DeviceMemoryBuffer is alive as long as Module is alive. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 0) + << "No deallocation while method is loaded"; + } else { + // Error path: local MethodHolder destroyed on return from load_method. + // RAII deallocation already happened. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 1) + << "RAII deallocation on error path"; + } + } + + // After Module destroyed, all device memory must be freed. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 1) + << "Expected deallocation after Module destroyed"; +} diff --git a/extension/module/test/targets.bzl b/extension/module/test/targets.bzl index f0d7e449efd..4dc3fb537f3 100644 --- a/extension/module/test/targets.bzl +++ b/extension/module/test/targets.bzl @@ -28,7 +28,7 @@ def define_common_targets(is_fbcode=False): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_test( - name = "test" + aten_suffix, + name = "module_test" + aten_suffix, srcs = [ "module_test.cpp", ], @@ -68,6 +68,26 @@ def define_common_targets(is_fbcode=False): ], ) + runtime.cxx_test( + name = "module_device_memory_test" + aten_suffix, + srcs = [ + "module_device_memory_test.cpp", + ], + deps = [ + "//executorch/kernels/portable:generated_lib" + aten_suffix, + "//executorch/extension/module:module" + aten_suffix, + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/core:device_memory_buffer", + ], + env = { + "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])", + "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])", + }, + compiler_flags = [ + "-Wno-error=deprecated-declarations", + ], + ) + runtime.filegroup( name = "resources", srcs = native.glob([ diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl index b0545b8ce18..659a128994f 100644 --- a/shim_et/xplat/executorch/build/build_variables.bzl +++ b/shim_et/xplat/executorch/build/build_variables.bzl @@ -50,6 +50,8 @@ PLATFORM_SRCS = [ EXECUTORCH_CORE_SRCS = sorted([ "runtime/backend/interface.cpp", + "runtime/core/device_allocator.cpp", + "runtime/core/device_memory_buffer.cpp", "runtime/core/evalue.cpp", "runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp", "runtime/core/exec_aten/util/tensor_util_portable.cpp", diff --git a/test/models/targets.bzl b/test/models/targets.bzl index c9fb67b7d31..a80244b1383 100644 --- a/test/models/targets.bzl +++ b/test/models/targets.bzl @@ -226,6 +226,7 @@ def define_common_targets(): default_outs = ["."], visibility = [ "//executorch/runtime/executor/test/...", + "//executorch/extension/module/test/...", ], )