diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py
index a48d88fa224..5c1f2db465d 100644
--- a/exir/emit/_emitter.py
+++ b/exir/emit/_emitter.py
@@ -2073,4 +2073,9 @@ def plan(self) -> ExecutionPlan:
                 self.module.meta["non_const_buffer_sizes"],
             ),
             container_meta_type=self.container_meta_type,
+            # non_const_buffer_device is set by apply_algo in memory_planning.py
+            # when device tensors are present. A missing key yields None (CPU-only).
+            non_const_buffer_device=self.module.meta.get(
+                "non_const_buffer_device", None
+            ),
         )
diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py
index 2de8a98a2cf..4bf97f60da4 100644
--- a/exir/emit/test/test_emit.py
+++ b/exir/emit/test/test_emit.py
@@ -2643,3 +2643,186 @@ def forward(self, a, b):
             0,
             "No tensor should have extra device info when model runs entirely on CPU",
         )
+
+    def test_emit_non_const_buffer_device_populated_for_device_tensors(self) -> None:
+        """Check that the emitted ExecutionPlan carries non_const_buffer_device when
+        enable_non_cpu_memory_planning=True and an add op is delegated to cuda:0."""
+        from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
+            generate_pattern_op_partitions,
+        )
+        from executorch.exir.backend.compile_spec_schema import CompileSpec
+        from executorch.exir.backend.partitioner import (
+            DelegationSpec,
+            Partitioner,
+            PartitionResult,
+        )
+        from executorch.exir.backend.test.backend_with_compiler_demo import (
+            BackendWithCompilerDemo,
+        )
+        from executorch.exir.passes.propagate_device_pass import (
+            TARGET_DEVICE_COMPILE_SPEC_KEY,
+        )
+        from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
+
+        class AddSupport(OperatorSupportBase):
+            def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+                return node.op == "call_function" and node.target in [
+                    exir_ops.edge.aten.add.Tensor,
+                ]
+
+        class DevicePartitioner(Partitioner):
+            def __init__(self):
+                super().__init__()
+                self.delegation_spec = DelegationSpec(
+                    BackendWithCompilerDemo.__name__,
+                    [
+                        CompileSpec("max_value", bytes([4])),
+                        CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
+                    ],
+                )
+
+            def partition(self, exported_program) -> PartitionResult:
+                partition_tags = {}  # delegation tag -> DelegationSpec
+                partition_list = generate_pattern_op_partitions(
+                    exported_program.graph_module,
+                    op_support=any_chain(AddSupport()),
+                )
+                for partition in partition_list:
+                    for node in partition.nodes:
+                        tag = f"tag{partition.id}"
+                        node.meta["delegation_tag"] = tag
+                        partition_tags[tag] = self.delegation_spec
+                return PartitionResult(
+                    tagged_exported_program=exported_program,
+                    partition_tags=partition_tags,
+                )
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        lowered = edge.to_backend(DevicePartitioner())
+        et_prog = lowered.to_executorch(
+            config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
+        )
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNotNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be set when device tensors are present "
+            "and enable_non_cpu_memory_planning is True",
+        )
+        self.assertGreater(len(plan.non_const_buffer_device), 0)
+        for entry in plan.non_const_buffer_device:
+            self.assertEqual(entry.device_type, schema.DeviceType.CUDA)
+            self.assertEqual(entry.device_index, 0)  # "cuda:0" -> index 0
+
+    def test_emit_non_const_buffer_device_none_for_cpu_only(self) -> None:
+        """With no delegated ops (all tensors on CPU), non_const_buffer_device must
+        stay None even with enable_non_cpu_memory_planning=True."""
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        et_prog = edge.to_executorch(
+            config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
+        )
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be None for CPU-only programs",
+        )
+
+    def test_emit_non_const_buffer_device_none_when_flag_disabled(self) -> None:
+        """Even with an add op delegated to cuda:0, non_const_buffer_device must be
+        None when enable_non_cpu_memory_planning is left at False (default)."""
+        from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
+            generate_pattern_op_partitions,
+        )
+        from executorch.exir.backend.compile_spec_schema import CompileSpec
+        from executorch.exir.backend.partitioner import (
+            DelegationSpec,
+            Partitioner,
+            PartitionResult,
+        )
+        from executorch.exir.backend.test.backend_with_compiler_demo import (
+            BackendWithCompilerDemo,
+        )
+        from executorch.exir.passes.propagate_device_pass import (
+            TARGET_DEVICE_COMPILE_SPEC_KEY,
+        )
+        from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
+
+        class AddSupport(OperatorSupportBase):
+            def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+                return node.op == "call_function" and node.target in [
+                    exir_ops.edge.aten.add.Tensor,
+                ]
+
+        class DevicePartitioner(Partitioner):
+            def __init__(self):
+                super().__init__()
+                self.delegation_spec = DelegationSpec(
+                    BackendWithCompilerDemo.__name__,
+                    [
+                        CompileSpec("max_value", bytes([4])),
+                        CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
+                    ],
+                )
+
+            def partition(self, exported_program) -> PartitionResult:
+                partition_tags = {}  # delegation tag -> DelegationSpec
+                partition_list = generate_pattern_op_partitions(
+                    exported_program.graph_module,
+                    op_support=any_chain(AddSupport()),
+                )
+                for partition in partition_list:
+                    for node in partition.nodes:
+                        tag = f"tag{partition.id}"
+                        node.meta["delegation_tag"] = tag
+                        partition_tags[tag] = self.delegation_spec
+                return PartitionResult(
+                    tagged_exported_program=exported_program,
+                    partition_tags=partition_tags,
+                )
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        lowered = edge.to_backend(DevicePartitioner())
+        # No config: enable_non_cpu_memory_planning stays at its default, False.
+        et_prog = lowered.to_executorch()
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be None when "
+            "enable_non_cpu_memory_planning is False",
+        )
diff --git a/runtime/core/device_memory_buffer.cpp b/runtime/core/device_memory_buffer.cpp
new file mode 100644
index 00000000000..26faba39456
--- /dev/null
+++ b/runtime/core/device_memory_buffer.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+
+namespace executorch::runtime {
+
+Result DeviceMemoryBuffer::create(
+    size_t size,
+    etensor::DeviceType type,
+    etensor::DeviceIndex index,
+    size_t alignment) {
+  DeviceAllocator* allocator = get_device_allocator(type);
+  if (allocator == nullptr) {
+    ET_LOG(
+        Error,
+        "No device allocator registered for device type %d",
+        static_cast(type));
+    return Error::NotFound;
+  }
+
+  auto result = allocator->allocate(size, index, alignment);
+  if (!result.ok()) {
+    return result.error(); // propagate the allocator's error as-is
+  }
+
+  return DeviceMemoryBuffer(result.get(), size, allocator, index);
+}
+
+} // namespace executorch::runtime
diff --git a/runtime/core/device_memory_buffer.h b/runtime/core/device_memory_buffer.h
new file mode 100644
index 00000000000..929460ce938
--- /dev/null
+++ b/runtime/core/device_memory_buffer.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+#include
+#include
+
+namespace executorch::runtime {
+
+/**
+ * RAII wrapper that owns a single device memory allocation.
+ *
+ * On destruction, calls DeviceAllocator::deallocate() to free the memory.
+ * This mirrors the role of std::vector for CPU planned buffers,
+ * but for device memory (CUDA, etc.).
+ *
+ * Move-only: cannot be copied, but can be moved to transfer ownership.
+ */
+class DeviceMemoryBuffer final {
+ public:
+  /**
+   * Creates a DeviceMemoryBuffer by allocating device memory.
+   *
+   * Looks up the DeviceAllocator for the given device type via the
+   * DeviceAllocatorRegistry. If no allocator is registered for the type,
+   * returns Error::NotFound. Allocator failures are returned unchanged.
+   *
+   * @param size Number of bytes to allocate.
+   * @param type The device type (e.g., CUDA).
+   * @param index The device index (e.g., 0 for cuda:0).
+   * @param alignment Minimum alignment of the returned pointer in bytes.
+   *     Must be a power of 2. Defaults to DeviceAllocator::kDefaultAlignment.
+   * @return A Result containing the DeviceMemoryBuffer on success, or an error.
+   */
+  static Result create(
+      size_t size,
+      etensor::DeviceType type,
+      etensor::DeviceIndex index = 0,
+      size_t alignment = DeviceAllocator::kDefaultAlignment);
+
+  DeviceMemoryBuffer() = default;
+
+  ~DeviceMemoryBuffer() { // frees the allocation; no-op if empty/moved-from
+    if (ptr_ != nullptr && allocator_ != nullptr) {
+      allocator_->deallocate(ptr_, device_index_);
+    }
+  }
+
+  // Move constructor: takes over the allocation and leaves `other` empty.
+  DeviceMemoryBuffer(DeviceMemoryBuffer&& other) noexcept
+      : ptr_(other.ptr_),
+        size_(other.size_),
+        allocator_(other.allocator_),
+        device_index_(other.device_index_) {
+    other.ptr_ = nullptr;
+    other.size_ = 0;
+    other.allocator_ = nullptr;
+  }
+
+  // Move assignment: frees any held allocation, then takes over other's.
+  DeviceMemoryBuffer& operator=(DeviceMemoryBuffer&& other) noexcept {
+    if (this != &other) {
+      if (ptr_ != nullptr && allocator_ != nullptr) {
+        allocator_->deallocate(ptr_, device_index_);
+      }
+      ptr_ = other.ptr_;
+      size_ = other.size_;
+      allocator_ = other.allocator_;
+      device_index_ = other.device_index_;
+      other.ptr_ = nullptr;
+      other.size_ = 0;
+      other.allocator_ = nullptr;
+    }
+    return *this;
+  }
+
+  // Non-copyable.
+  DeviceMemoryBuffer(const DeviceMemoryBuffer&) = delete;
+  DeviceMemoryBuffer& operator=(const DeviceMemoryBuffer&) = delete;
+
+  /// Returns the device pointer, or nullptr if empty/moved-from.
+  void* data() const {
+    return ptr_;
+  }
+
+  /// Returns the allocation size in bytes (0 if empty/moved-from).
+  size_t size() const {
+    return size_;
+  }
+
+  /**
+   * Returns a Span wrapping the device pointer.
+   *
+   * This is intended for use with HierarchicalAllocator, which only performs
+   * pointer arithmetic on the span data and never dereferences it. Device
+   * pointers are valid for pointer arithmetic from the CPU side.
+   */
+  Span as_span() const {
+    return {static_cast(ptr_), size_};
+  }
+
+ private:
+  DeviceMemoryBuffer(
+      void* ptr,
+      size_t size,
+      DeviceAllocator* allocator,
+      etensor::DeviceIndex device_index)
+      : ptr_(ptr),
+        size_(size),
+        allocator_(allocator),
+        device_index_(device_index) {}
+
+  void* ptr_ = nullptr;
+  size_t size_ = 0;
+  DeviceAllocator* allocator_ = nullptr;
+  etensor::DeviceIndex device_index_ = 0;
+};
+
+} // namespace executorch::runtime
diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl
index 78ffea3bdba..0a368bd6379 100644
--- a/runtime/core/portable_type/targets.bzl
+++ b/runtime/core/portable_type/targets.bzl
@@ -28,6 +28,7 @@ def define_common_targets():
                 "//executorch/extension/fb/dynamic_shim/...",
                 "//executorch/kernels/portable/cpu/...",
                 "//executorch/runtime/core:device_allocator",
+                "//executorch/runtime/core/...",
                 "//executorch/runtime/core/exec_aten/...",
                 "//executorch/runtime/core/portable_type/test/...",
             ],
diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl
index 1a81c5af1f6..89c4dfa08c1 100644
--- a/runtime/core/targets.bzl
+++ b/runtime/core/targets.bzl
@@ -155,6 +155,17 @@ def define_common_targets():
         visibility = ["//executorch/..."],
     )
 
+    runtime.cxx_library(
+        name = "device_memory_buffer",
+        srcs = ["device_memory_buffer.cpp"],
+        exported_headers = ["device_memory_buffer.h"],
+        exported_deps = [
+            ":core",
+            ":device_allocator",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
     runtime.cxx_library(
         name = "tag",
         srcs = ["tag.cpp"],
diff --git a/runtime/core/test/device_memory_buffer_test.cpp b/runtime/core/test/device_memory_buffer_test.cpp
new file mode 100644
index 00000000000..36ecadfc2d2
--- /dev/null
+++ b/runtime/core/test/device_memory_buffer_test.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+
+#include
+
+#include
+
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
+using executorch::runtime::Error;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+/**
+ * A mock DeviceAllocator for testing DeviceMemoryBuffer.
+ * allocate() always returns its 256-byte member buffer; calls are counted.
+ */
+class MockAllocator : public DeviceAllocator {
+ public:
+  explicit MockAllocator(DeviceType type) : type_(type) {}
+
+  Result allocate(
+      size_t nbytes,
+      DeviceIndex index,
+      size_t alignment = DeviceAllocator::kDefaultAlignment) override {
+    allocate_count_++;
+    last_allocate_size_ = nbytes;
+    last_allocate_alignment_ = alignment;
+    return static_cast(buffer_);
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {
+    deallocate_count_++;
+    last_deallocate_ptr_ = ptr;
+  }
+
+  Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return type_;
+  }
+
+  int allocate_count_ = 0;
+  int deallocate_count_ = 0;
+  size_t last_allocate_size_ = 0;
+  size_t last_allocate_alignment_ = 0;
+  void* last_deallocate_ptr_ = nullptr;
+  uint8_t buffer_[256] = {};
+
+ private:
+  DeviceType type_;
+};
+
+// Global mock, registered once in SetUpTestSuite(); SetUp() resets counters.
+static MockAllocator g_mock_cuda(DeviceType::CUDA);
+
+class DeviceMemoryBufferTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(&g_mock_cuda);
+  }
+
+  void SetUp() override {
+    // Reset call counters and last-seen values before each test.
+    g_mock_cuda.allocate_count_ = 0;
+    g_mock_cuda.deallocate_count_ = 0;
+    g_mock_cuda.last_allocate_size_ = 0;
+    g_mock_cuda.last_allocate_alignment_ = 0;
+    g_mock_cuda.last_deallocate_ptr_ = nullptr;
+  }
+};
+
+TEST_F(DeviceMemoryBufferTest, DefaultConstructedIsEmpty) {
+  DeviceMemoryBuffer buf;
+  EXPECT_EQ(buf.data(), nullptr);
+  EXPECT_EQ(buf.size(), 0);
+
+  auto span = buf.as_span();
+  EXPECT_EQ(span.data(), nullptr);
+  EXPECT_EQ(span.size(), 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateAllocatesAndDestructorDeallocates) {
+  {
+    auto result = DeviceMemoryBuffer::create(1024, DeviceType::CUDA, 0);
+    ASSERT_TRUE(result.ok());
+
+    auto buf = std::move(result.get());
+    EXPECT_NE(buf.data(), nullptr);
+    EXPECT_EQ(buf.size(), 1024);
+    EXPECT_EQ(g_mock_cuda.allocate_count_, 1);
+    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 1024);
+    EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+  }
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_deallocate_ptr_, g_mock_cuda.buffer_);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateFailsWithNoRegisteredAllocator) {
+  auto result = DeviceMemoryBuffer::create(512, DeviceType::CPU, 0);
+  EXPECT_FALSE(result.ok()); // this fixture registers only a CUDA allocator
+  EXPECT_EQ(result.error(), Error::NotFound);
+}
+
+TEST_F(DeviceMemoryBufferTest, MoveConstructorTransfersOwnership) {
+  auto result = DeviceMemoryBuffer::create(256, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto original = std::move(result.get());
+  void* original_ptr = original.data();
+
+  DeviceMemoryBuffer moved(std::move(original));
+
+  EXPECT_EQ(original.data(), nullptr); // moved-from state is well-defined
+  EXPECT_EQ(original.size(), 0);
+  EXPECT_EQ(moved.data(), original_ptr);
+  EXPECT_EQ(moved.size(), 256);
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, MoveAssignmentTransfersOwnership) {
+  auto result = DeviceMemoryBuffer::create(128, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto original = std::move(result.get());
+  void* original_ptr = original.data();
+
+  DeviceMemoryBuffer target;
+  target = std::move(original);
+
+  EXPECT_EQ(original.data(), nullptr);
+  EXPECT_EQ(target.data(), original_ptr);
+  EXPECT_EQ(target.size(), 128);
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, DestructorNoOpForDefaultConstructed) {
+  { DeviceMemoryBuffer buf; }
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, AsSpanWrapsDevicePointer) {
+  auto result = DeviceMemoryBuffer::create(2048, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto buf = std::move(result.get());
+
+  auto span = buf.as_span();
+  EXPECT_EQ(span.data(), static_cast(buf.data()));
+  EXPECT_EQ(span.size(), 2048);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateUsesDefaultAlignmentWhenUnspecified) {
+  auto result = DeviceMemoryBuffer::create(1024, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  EXPECT_EQ(
+      g_mock_cuda.last_allocate_alignment_, DeviceAllocator::kDefaultAlignment);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateForwardsCustomAlignmentToAllocator) {
+  constexpr size_t kCustomAlignment = 512;
+  auto result =
+      DeviceMemoryBuffer::create(1024, DeviceType::CUDA, 0, kCustomAlignment);
+  ASSERT_TRUE(result.ok());
+  EXPECT_EQ(g_mock_cuda.last_allocate_alignment_, kCustomAlignment);
+}
diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl
index 1adb75f6e82..87019909a9f 100644
--- a/runtime/core/test/targets.bzl
+++ b/runtime/core/test/targets.bzl
@@ -7,6 +7,14 @@ def define_common_targets():
     TARGETS and BUCK files that call this function.
     """
 
+    runtime.cxx_test(
+        name = "device_memory_buffer_test",
+        srcs = ["device_memory_buffer_test.cpp"],
+        deps = [
+            "//executorch/runtime/core:device_memory_buffer",
+        ],
+    )
+
     runtime.cxx_test(
         name = "span_test",
         srcs = ["span_test.cpp"],
diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp
index ba73828378a..0af97b3fdf3 100644
--- a/runtime/executor/method_meta.cpp
+++ b/runtime/executor/method_meta.cpp
@@ -364,6 +364,42 @@ Result MethodMeta::memory_planned_buffer_size(size_t index) const {
   return size;
 }
 
+Result MethodMeta::memory_planned_buffer_device(
+    size_t index) const {
+  auto num_buffers = this->num_memory_planned_buffers();
+  ET_CHECK_OR_RETURN_ERROR(
+      index < num_buffers,
+      InvalidArgument,
+      "index %zu out of range. num_buffers: %zu",
+      index,
+      num_buffers);
+
+  // The non_const_buffer_device field is optional and only present when the
+  // program contains non-CPU buffers. For CPU-only programs (or legacy PTE
+  // files), this field is null and all buffers default to CPU.
+  auto* buffer_devices = s_plan_->non_const_buffer_device();
+  if (buffer_devices == nullptr) {
+    return etensor::Device{etensor::DeviceType::CPU, 0};
+  }
+
+  // The sparse list only contains entries for non-CPU buffers.
+  // buffer_idx uses the same indexing as non_const_buffer_sizes (1-based,
+  // with index 0 reserved). The user-facing index is 0-based, so we
+  // compare against index + 1. The first matching entry wins.
+  const auto internal_idx = static_cast(index + 1);
+  for (size_t i = 0; i < buffer_devices->size(); ++i) {
+    auto entry = buffer_devices->Get(i);
+    if (entry->buffer_idx() == internal_idx) {
+      return etensor::Device{
+          static_cast(entry->device_type()),
+          static_cast(entry->device_index())};
+    }
+  }
+
+  // Not found in the sparse list — this buffer is on CPU.
+  return etensor::Device{etensor::DeviceType::CPU, 0};
+}
+
 bool MethodMeta::uses_backend(const char* backend_name) const {
   ET_CHECK_MSG(backend_name, "backend name is null");
   const auto delegates = s_plan_->delegates();
diff --git a/runtime/executor/method_meta.h b/runtime/executor/method_meta.h
index 79fd05c28ee..e0fa16cda22 100644
--- a/runtime/executor/method_meta.h
+++ b/runtime/executor/method_meta.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
@@ -234,6 +235,19 @@ class MethodMeta final {
    */
   Result memory_planned_buffer_size(size_t index) const;
 
+  /**
+   * Get the device placement for the specified memory-planned buffer.
+   *
+   * For CPU-only programs (no non_const_buffer_device in the PTE), all buffers
+   * default to Device{CPU, 0}. For programs with device annotations, returns
+   * the device type and index that the buffer should be allocated on.
+   *
+   * @param[in] index The index of the buffer to look up (0-based, same
+   *     indexing as memory_planned_buffer_size()).
+   * @returns The Device, or Error::InvalidArgument if index is out of range.
+   */
+  Result memory_planned_buffer_device(size_t index) const;
+
   /**
    * Check to see if a backend is used in this method.
    *
diff --git a/runtime/executor/test/method_meta_test.cpp b/runtime/executor/test/method_meta_test.cpp
index e4ef2e72a85..3e6e09cc8c3 100644
--- a/runtime/executor/test/method_meta_test.cpp
+++ b/runtime/executor/test/method_meta_test.cpp
@@ -74,6 +74,10 @@ class MethodMetaTest : public ::testing::Test {
   void SetUp() override {
     load_program(std::getenv("ET_MODULE_ADD_PATH"), "add");
     load_program(std::getenv("ET_MODULE_STATEFUL_PATH"), "stateful");
+    const char* device_path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+    if (device_path != nullptr) { // optional; only the device test uses it
+      load_program(device_path, "add_with_device");
+    }
   }
 
  private:
@@ -192,6 +196,27 @@ TEST_F(MethodMetaTest, MethodMetaAttribute) {
   ASSERT_EQ(bad_access.error(), Error::InvalidArgument);
 }
 
+TEST_F(MethodMetaTest, MemoryPlannedBufferDeviceDefaultsCpu) {
+  Result method_meta = programs_["add"]->method_meta("forward");
+  ASSERT_EQ(method_meta.error(), Error::Ok);
+
+  // CPU-only model: every planned buffer should report Device{CPU, 0}.
+  size_t num_buffers = method_meta->num_memory_planned_buffers();
+  ASSERT_GT(num_buffers, 0);
+
+  for (size_t i = 0; i < num_buffers; ++i) {
+    auto device = method_meta->memory_planned_buffer_device(i);
+    ASSERT_TRUE(device.ok());
+    EXPECT_EQ(device->type(), executorch::runtime::etensor::DeviceType::CPU);
+    EXPECT_EQ(device->index(), 0);
+  }
+
+  // Index == num_buffers is one past the end and must be rejected.
+  EXPECT_EQ(
+      method_meta->memory_planned_buffer_device(num_buffers).error(),
+      Error::InvalidArgument);
+}
+
 TEST_F(MethodMetaTest, TensorInfoSizeOverflow) {
   // Create sizes that will cause overflow when multiplied
   std::vector overflow_sizes = {
@@ -214,3 +239,30 @@ TEST_F(MethodMetaTest, TensorInfoSizeOverflow) {
           executorch::aten::string_view{nullptr, 0}),
       "");
 }
+
+TEST_F(MethodMetaTest, MethodMetaBufferDeviceReturnsCudaForDeviceBuffer) {
+  ASSERT_NE(programs_.find("add_with_device"), programs_.end())
+      << "ET_MODULE_ADD_WITH_DEVICE_PATH env var not set";
+  Result method_meta =
+      programs_["add_with_device"]->method_meta("forward");
+  ASSERT_EQ(method_meta.error(), Error::Ok);
+
+  // ModuleAddWithDevice exports with enable_non_cpu_memory_planning=True.
+  // The model delegates add(a,b) to CUDA, producing:
+  //   non_const_buffer_sizes: [0, 48]  (index 0 reserved)
+  //   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA,
+  //   device_index=0}]
+  // So there is exactly 1 planned buffer (user-facing index 0), on CUDA.
+  ASSERT_EQ(method_meta->num_memory_planned_buffers(), 1);
+
+  // User-facing buffer 0 maps to buffer_idx 1, which is the CUDA entry.
+  auto device = method_meta->memory_planned_buffer_device(0);
+  ASSERT_TRUE(device.ok());
+  EXPECT_EQ(device->type(), executorch::runtime::etensor::DeviceType::CUDA);
+  EXPECT_EQ(device->index(), 0);
+
+  // Index 1 is one past the only buffer and must be rejected.
+  EXPECT_EQ(
+      method_meta->memory_planned_buffer_device(1).error(),
+      Error::InvalidArgument);
+}
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
index f4534aefdea..74ea9a8262d 100644
--- a/runtime/executor/test/targets.bzl
+++ b/runtime/executor/test/targets.bzl
@@ -178,7 +178,12 @@ def define_common_targets(is_fbcode = False):
                 "//executorch/runtime/executor:program",
                 "//executorch/extension/data_loader:file_data_loader",
             ],
-            env = modules_env,
+            env = dict(
+                modules_env,
+                **{
+                    "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])",
+                }
+            ),
         )
 
         runtime.cxx_test(
diff --git a/test/models/export_program_with_device_info.py b/test/models/export_program_with_device_info.py
index 8666f298640..3b6af55c6e8 100644
--- a/test/models/export_program_with_device_info.py
+++ b/test/models/export_program_with_device_info.py
@@ -99,7 +99,12 @@ def main() -> None:
         compile_config=EdgeCompileConfig(_check_ir_validity=False),
     )
     lowered = edge.to_backend(_DeviceAwarePartitioner())
-    et_prog = lowered.to_executorch(ExecutorchBackendConfig(emit_stacktrace=False))
+    et_prog = lowered.to_executorch(
+        ExecutorchBackendConfig(  # type: ignore[call-arg]
+            emit_stacktrace=False,
+            enable_non_cpu_memory_planning=True,  # emits non_const_buffer_device
+        )
+    )
 
     os.makedirs(args.outdir, exist_ok=True)
     outfile = os.path.join(args.outdir, "ModuleAddWithDevice.pte")