diff --git a/runtime/core/hierarchical_allocator.h b/runtime/core/hierarchical_allocator.h index d41b98f69d5..00eb5b3a089 100644 --- a/runtime/core/hierarchical_allocator.h +++ b/runtime/core/hierarchical_allocator.h @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -34,6 +35,30 @@ class HierarchicalAllocator final { explicit HierarchicalAllocator(Span> buffers) : buffers_(buffers) {} + /** + * Constructs a new hierarchical allocator with per-buffer device metadata. + * + * @param[in] buffers Same as above. May contain a mix of CPU and device + * pointers — HierarchicalAllocator only does pointer arithmetic, so + * device pointers are valid. + * @param[in] planned_buffer_devices One entry per buffer (same count as + * `buffers`), indicating the `Device` (type + index) for each buffer. + * Different buffers can target the same device type but different + * indices (e.g., `cuda:0` vs `cuda:1`). For CPU-only programs, use the + * single-arg constructor instead. + */ + HierarchicalAllocator( + Span> buffers, + Span planned_buffer_devices) + : buffers_(buffers), planned_buffer_devices_(planned_buffer_devices) { + ET_CHECK_MSG( + planned_buffer_devices.size() == buffers.size(), + "planned_buffer_devices size (%" ET_PRIsize_t + ") must match buffers size (%" ET_PRIsize_t ")", + planned_buffer_devices.size(), + buffers.size()); + } + /** * DEPRECATED: Use spans instead. */ @@ -88,6 +113,17 @@ class HierarchicalAllocator final { return buffer.data() + offset_bytes; } + /** + * Returns per-buffer device metadata. One entry per buffer, same count as + * the `buffers` passed to the constructor. Each entry is a `Device` + * carrying both type and index, so callers can distinguish e.g. `cuda:0` + * from `cuda:1`. Empty if no device metadata was provided (CPU-only + * program). + */ + Span planned_buffer_devices() const { + return planned_buffer_devices_; + } + private: // TODO(T162089316): Remove the span array and to_spans once all users move to // spans. This array is necessary to hold the pointers and sizes that were @@ -113,6 +149,10 @@ class HierarchicalAllocator final { /// The underlying buffers. Span> buffers_; + + /// Per-buffer device metadata. Empty when no device info was provided + /// (CPU-only program). + Span planned_buffer_devices_; }; } // namespace runtime diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl index 89c4dfa08c1..05beb198a61 100644 --- a/runtime/core/targets.bzl +++ b/runtime/core/targets.bzl @@ -77,6 +77,7 @@ def define_common_targets(): ], exported_deps = [ ":core", + "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/core/portable_type/c10/c10:c10", ], visibility = ["PUBLIC"], diff --git a/runtime/core/test/hierarchical_allocator_test.cpp b/runtime/core/test/hierarchical_allocator_test.cpp index e25e1eb8335..245c4b7b0e5 100644 --- a/runtime/core/test/hierarchical_allocator_test.cpp +++ b/runtime/core/test/hierarchical_allocator_test.cpp @@ -10,8 +10,10 @@ #include #include +#include #include #include +#include #include #include @@ -22,6 +24,8 @@ using executorch::runtime::HierarchicalAllocator; using executorch::runtime::MemoryAllocator; using executorch::runtime::Result; using executorch::runtime::Span; +using executorch::runtime::etensor::Device; +using executorch::runtime::etensor::DeviceType; class HierarchicalAllocatorTest : public ::testing::Test { protected: @@ -87,6 +91,67 @@ TEST_F(HierarchicalAllocatorTest, Smoke) { } } +TEST_F(HierarchicalAllocatorTest, NoDeviceMetadataByDefault) { + Span> empty_buffers{}; + HierarchicalAllocator allocator(empty_buffers); + + EXPECT_EQ(allocator.planned_buffer_devices().size(), 0); +} + +TEST_F(HierarchicalAllocatorTest, ExposesDeviceMetadataWhenProvided) { + // Use 4 buffers so the device span size matches. + constexpr size_t n_buffers = 4; + uint8_t mem0[4]; + uint8_t mem1[4]; + uint8_t mem2[4]; + uint8_t mem3[4]; + Span buffers[n_buffers]{ + {mem0, sizeof(mem0)}, + {mem1, sizeof(mem1)}, + {mem2, sizeof(mem2)}, + {mem3, sizeof(mem3)}, + }; + + // CPU buffers come first because the runtime always sets up host-side + // planned memory before any device buffers. The two CUDA entries use + // distinct device indices to verify per-buffer index tracking. + Device devices[] = { + Device(DeviceType::CPU, 0), + Device(DeviceType::CPU, 0), + Device(DeviceType::CUDA, 0), + Device(DeviceType::CUDA, 1), + }; + Span device_span(devices, n_buffers); + + HierarchicalAllocator allocator({buffers, n_buffers}, device_span); + + ASSERT_EQ(allocator.planned_buffer_devices().size(), n_buffers); + EXPECT_EQ(allocator.planned_buffer_devices()[0], Device(DeviceType::CPU, 0)); + EXPECT_EQ(allocator.planned_buffer_devices()[1], Device(DeviceType::CPU, 0)); + EXPECT_EQ(allocator.planned_buffer_devices()[2], Device(DeviceType::CUDA, 0)); + EXPECT_EQ(allocator.planned_buffer_devices()[3], Device(DeviceType::CUDA, 1)); +} + +TEST_F(HierarchicalAllocatorTest, MismatchedDeviceCountAborts) { + constexpr size_t n_buffers = 2; + uint8_t mem0[4]; + uint8_t mem1[4]; + Span buffers[n_buffers]{ + {mem0, sizeof(mem0)}, + {mem1, sizeof(mem1)}, + }; + + // 3 device entries vs 2 buffers — should abort. + Device devices[] = { + Device(DeviceType::CPU, 0), + Device(DeviceType::CPU, 0), + Device(DeviceType::CUDA, 0), + }; + Span device_span(devices, 3); + + ET_EXPECT_DEATH(HierarchicalAllocator({buffers, n_buffers}, device_span), ""); +} + // TODO(T162089316): Tests the deprecated API. Remove this when removing the // API. TEST_F(HierarchicalAllocatorTest, DEPRECATEDSmoke) { diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl index 87019909a9f..4d865df425d 100644 --- a/runtime/core/test/targets.bzl +++ b/runtime/core/test/targets.bzl @@ -108,6 +108,7 @@ def define_common_targets(): ], deps = [ "//executorch/runtime/core:memory_allocator", + "//executorch/test/utils:utils", ], ) diff --git a/runtime/executor/memory_manager.h b/runtime/executor/memory_manager.h index 42edd9f0bea..bc89adcda79 100644 --- a/runtime/executor/memory_manager.h +++ b/runtime/executor/memory_manager.h @@ -10,6 +10,8 @@ #include #include +#include +#include namespace executorch { namespace runtime { @@ -42,7 +44,8 @@ class MemoryManager final { * must agree with the corresponding * `MethodMeta::num_memory_planned_buffers()` and * `MethodMeta::memory_planned_buffer_size(N)` values, which are embedded - * in the Program. + * in the Program. For device-aware programs, the per-buffer device + * metadata is owned by the HierarchicalAllocator as well. * @param[in] temp_allocator The allocator to use when allocating temporary * data during kernel or delegate execution. Must outlive the Method that * uses it. May be `nullptr` if the Method does not use kernels or @@ -105,6 +108,29 @@ class MemoryManager final { return temp_allocator_; } + /** + * Returns per-buffer device metadata. One entry per planned memory buffer, + * same count as planned_memory buffers. Empty if no device metadata was + * provided (CPU-only program) or if `planned_memory` is null. + * + * This is a thin wrapper around + * `HierarchicalAllocator::planned_buffer_devices()`. + */ + Span planned_buffer_devices() const { + if (planned_memory_ == nullptr) { + return {}; + } + return planned_memory_->planned_buffer_devices(); + } + + /** + * Returns true if any planned buffer has device metadata attached. + * When false, the memory setup is CPU-only. + */ + bool has_device_memory() const { + return planned_buffer_devices().size() > 0; + } + private: MemoryAllocator* method_allocator_; HierarchicalAllocator* planned_memory_; diff --git a/runtime/executor/test/memory_manager_test.cpp b/runtime/executor/test/memory_manager_test.cpp index 0e1feb47793..edbbbde343a 100644 --- a/runtime/executor/test/memory_manager_test.cpp +++ b/runtime/executor/test/memory_manager_test.cpp @@ -17,6 +17,9 @@ using namespace ::testing; using executorch::runtime::HierarchicalAllocator; using executorch::runtime::MemoryAllocator; using executorch::runtime::MemoryManager; +using executorch::runtime::Span; +using executorch::runtime::etensor::Device; +using executorch::runtime::etensor::DeviceType; TEST(MemoryManagerTest, MinimalCtor) { MemoryAllocator method_allocator(0, nullptr); @@ -93,3 +96,64 @@ TEST(MemoryManagerTest, CtorWithSameAllocator) { /*temp_allocator=*/&method_allocator), "cannot be the same"); } + +TEST(MemoryManagerTest, ThreeArgCtorHasNoDeviceMemory) { + MemoryAllocator method_allocator(0, nullptr); + HierarchicalAllocator planned_memory({}); + MemoryAllocator temp_allocator(0, nullptr); + + MemoryManager mm(&method_allocator, &planned_memory, &temp_allocator); + + EXPECT_FALSE(mm.has_device_memory()); + EXPECT_EQ(mm.planned_buffer_devices().size(), 0); +} + +TEST(MemoryManagerTest, DelegatesDeviceMetadataToHierarchicalAllocator) { + MemoryAllocator method_allocator(0, nullptr); + MemoryAllocator temp_allocator(0, nullptr); + + // 4 buffers: cpu:0, cpu:0, cuda:0, cuda:1. CPU buffers come first because + // the runtime always sets up host-side planned memory before any device + // buffers. The two CUDA entries use distinct indices to verify per-buffer + // index tracking. + constexpr size_t n_buffers = 4; + uint8_t mem0[4]; + uint8_t mem1[4]; + uint8_t mem2[4]; + uint8_t mem3[4]; + Span buffers[n_buffers]{ + {mem0, sizeof(mem0)}, + {mem1, sizeof(mem1)}, + {mem2, sizeof(mem2)}, + {mem3, sizeof(mem3)}, + }; + Device devices[] = { + Device(DeviceType::CPU, 0), + Device(DeviceType::CPU, 0), + Device(DeviceType::CUDA, 0), + Device(DeviceType::CUDA, 1), + }; + Span device_span(devices, n_buffers); + + HierarchicalAllocator planned_memory({buffers, n_buffers}, device_span); + MemoryManager mm(&method_allocator, &planned_memory, &temp_allocator); + + EXPECT_EQ(mm.method_allocator(), &method_allocator); + EXPECT_EQ(mm.planned_memory(), &planned_memory); + EXPECT_EQ(mm.temp_allocator(), &temp_allocator); + EXPECT_TRUE(mm.has_device_memory()); + EXPECT_EQ(mm.planned_buffer_devices().size(), n_buffers); + EXPECT_EQ(mm.planned_buffer_devices()[0], Device(DeviceType::CPU, 0)); + EXPECT_EQ(mm.planned_buffer_devices()[1], Device(DeviceType::CPU, 0)); + EXPECT_EQ(mm.planned_buffer_devices()[2], Device(DeviceType::CUDA, 0)); + EXPECT_EQ(mm.planned_buffer_devices()[3], Device(DeviceType::CUDA, 1)); +} + +TEST(MemoryManagerTest, MinimalCtorHasNoDeviceMemory) { + MemoryAllocator method_allocator(0, nullptr); + + MemoryManager mm(&method_allocator); + + EXPECT_FALSE(mm.has_device_memory()); + EXPECT_EQ(mm.planned_buffer_devices().size(), 0); +} diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl index 74ea9a8262d..32baa63a76b 100644 --- a/runtime/executor/test/targets.bzl +++ b/runtime/executor/test/targets.bzl @@ -19,6 +19,7 @@ def define_common_targets(is_fbcode = False): "//executorch/exir/backend/test/...", "//executorch/runtime/backend/...", "//executorch/extension/pybindings/...", + "//executorch/extension/module/test/...", "//executorch/devtools/fb/runners/...", "//executorch/test/...", "//executorch/examples/...", @@ -326,6 +327,8 @@ def define_common_targets(is_fbcode = False): deps = [ ":managed_memory_manager", "//executorch/runtime/executor:program", + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/core:device_memory_buffer", "//executorch/extension/data_loader:file_data_loader", "//executorch/schema:program", ], diff --git a/runtime/executor/test/tensor_parser_device_test.cpp b/runtime/executor/test/tensor_parser_device_test.cpp index cdcc4f3e517..3cd5570b42b 100644 --- a/runtime/executor/test/tensor_parser_device_test.cpp +++ b/runtime/executor/test/tensor_parser_device_test.cpp @@ -17,17 +17,31 @@ #include #include +#include +#include #include #include +#include #include #include using executorch::aten::Tensor; +using executorch::runtime::DeviceAllocator; +using executorch::runtime::DeviceMemoryBuffer; using executorch::runtime::Error; +using executorch::runtime::get_device_allocator; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::MethodMeta; using executorch::runtime::Program; +using executorch::runtime::register_device_allocator; using executorch::runtime::Result; +using executorch::runtime::Span; using executorch::runtime::deserialization::parseTensor; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; using executorch::runtime::testing::ManagedMemoryManager; using torch::executor::util::FileDataLoader; @@ -50,8 +64,72 @@ class ProgramTestFriend final { using executorch::runtime::testing::ProgramTestFriend; +namespace { + +/** + * Mock CUDA allocator that uses host memory for testing. + * Tracks the allocated range so tests can verify tensor data_ptr + * falls within the "device" memory region. + */ +class MockCudaAllocator : public DeviceAllocator { + public: + Result allocate( + size_t nbytes, + DeviceIndex index, + size_t alignement = kDefaultAlignment) override { + (void)alignement; + (void)index; + allocate_count_++; + buffer_ = std::make_unique(nbytes); + buffer_size_ = nbytes; + return static_cast(buffer_.get()); + } + + void deallocate(void* ptr, DeviceIndex index) override { + deallocate_count_++; + buffer_.reset(); + buffer_size_ = 0; + } + + Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override { + return Error::Ok; + } + + Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override { + return Error::Ok; + } + + DeviceType device_type() const override { + return DeviceType::CUDA; + } + + bool is_device_ptr(const void* ptr) const { + if (buffer_ == nullptr || buffer_size_ == 0) { + return false; + } + auto* p = static_cast(ptr); + return p >= buffer_.get() && p < buffer_.get() + buffer_size_; + } + + int allocate_count_ = 0; + int deallocate_count_ = 0; + + private: + std::unique_ptr buffer_; + size_t buffer_size_ = 0; +}; + +} // namespace + +static MockCudaAllocator g_mock_cuda; + class TensorParserDeviceTest : public ::testing::Test { protected: + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + register_device_allocator(&g_mock_cuda); + } + void SetUp() override { const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); ASSERT_NE(path, nullptr) @@ -59,6 +137,9 @@ class TensorParserDeviceTest : public ::testing::Test { Result loader = FileDataLoader::from(path); ASSERT_EQ(loader.error(), Error::Ok); loader_ = std::make_unique(std::move(loader.get())); + + g_mock_cuda.allocate_count_ = 0; + g_mock_cuda.deallocate_count_ = 0; } std::unique_ptr loader_; @@ -167,3 +248,96 @@ TEST_F(TensorParserDeviceTest, NonDelegatedTensorsDefaultToCPU) { << " without device annotation should have device_index=0"; } } +TEST_F(TensorParserDeviceTest, CudaTensorDataPtrPointsToDeviceMemory) { + Result program = + Program::load(loader_.get(), Program::Verification::Minimal); + ASSERT_EQ(program.error(), Error::Ok); + + Result method_meta = program->method_meta("forward"); + ASSERT_EQ(method_meta.error(), Error::Ok); + + // ModuleAddWithDevice has: + // non_const_buffer_sizes: [0, 48] (index 0 reserved, buffer 0 = 48 bytes) + // non_const_buffer_device: [{buffer_idx=1, device_type=CUDA}] + const size_t num_buffers = method_meta->num_memory_planned_buffers(); + ASSERT_EQ(num_buffers, 1); + + // Set up device-aware planned memory. + std::vector> planned_spans; + std::vector> cpu_buffers; + std::vector device_buffers; + + for (size_t i = 0; i < num_buffers; ++i) { + auto size = method_meta->memory_planned_buffer_size(i); + ASSERT_TRUE(size.ok()); + auto device = method_meta->memory_planned_buffer_device(i); + ASSERT_TRUE(device.ok()); + + if (device->is_cpu()) { + cpu_buffers.emplace_back(size.get()); + planned_spans.emplace_back( + cpu_buffers.back().data(), cpu_buffers.back().size()); + } else { + cpu_buffers.emplace_back(); // empty placeholder + auto dmb = DeviceMemoryBuffer::create( + size.get(), device->type(), device->index()); + ASSERT_TRUE(dmb.ok()) + << "DeviceMemoryBuffer::create failed for buffer " << i; + planned_spans.emplace_back(dmb->as_span()); + device_buffers.push_back(std::move(dmb.get())); + } + } + + ASSERT_EQ(g_mock_cuda.allocate_count_, 1); + + // Build HierarchicalAllocator with mixed CPU/device spans. + HierarchicalAllocator planned_memory( + {planned_spans.data(), planned_spans.size()}); + + constexpr size_t kMethodAllocBytes = 32 * 1024U; + auto method_alloc_pool = std::make_unique(kMethodAllocBytes); + MemoryAllocator method_allocator(kMethodAllocBytes, method_alloc_pool.get()); + MemoryManager memory_manager(&method_allocator, &planned_memory); + + // Parse tensors and verify CUDA tensors have device memory. + const executorch_flatbuffer::Program* internal_program = + ProgramTestFriend::GetInternalProgram(&program.get()); + auto* execution_plan = + internal_program->execution_plan()->GetMutableObject(0); + auto* flatbuffer_values = execution_plan->values(); + + int cuda_with_device_memory = 0; + + for (size_t i = 0; i < flatbuffer_values->size(); ++i) { + auto* serialization_value = flatbuffer_values->Get(i); + if (serialization_value->val_type() != + executorch_flatbuffer::KernelTypes::Tensor) { + continue; + } + + auto* s_tensor = serialization_value->val_as_Tensor(); + bool is_cuda = s_tensor->extra_tensor_info() != nullptr && + s_tensor->extra_tensor_info()->device_type() == + executorch_flatbuffer::DeviceType::CUDA; + + Result tensor = + parseTensor(&program.get(), &memory_manager, s_tensor); + ASSERT_TRUE(tensor.ok()) + << "parseTensor failed at index " << i << " with error 0x" << std::hex + << static_cast(tensor.error()); + + Tensor t = tensor.get(); + + if (is_cuda && t.unsafeGetTensorImpl()->device_type() == DeviceType::CUDA) { + EXPECT_TRUE(g_mock_cuda.is_device_ptr(t.const_data_ptr())) + << "CUDA tensor at index " << i + << " should have data_ptr in device memory, but got CPU memory"; + cuda_with_device_memory++; + } + } + + // All 3 CUDA tensors (2 inputs + 1 output of the delegate) should have + // their data_ptr pointing to the mock device memory buffer. + EXPECT_EQ(cuda_with_device_memory, 3) + << "All 3 CUDA tensors should have data_ptr in device memory"; +}