Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions runtime/core/hierarchical_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <c10/util/safe_numerics.h>

#include <executorch/runtime/core/memory_allocator.h>
#include <executorch/runtime/core/portable_type/device.h>
#include <executorch/runtime/core/result.h>
#include <executorch/runtime/core/span.h>

Expand All @@ -34,6 +35,30 @@ class HierarchicalAllocator final {
explicit HierarchicalAllocator(Span<Span<uint8_t>> buffers)
: buffers_(buffers) {}

/**
* Constructs a new hierarchical allocator with per-buffer device metadata.
*
* @param[in] buffers Same as above. May contain a mix of CPU and device
* pointers — HierarchicalAllocator only does pointer arithmetic, so
* device pointers are valid.
* @param[in] planned_buffer_devices One entry per buffer (same count as
* `buffers`), indicating the `Device` (type + index) for each buffer.
* Different buffers can target the same device type but different
* indices (e.g., `cuda:0` vs `cuda:1`). For CPU-only programs, use the
* single-arg constructor instead.
*/
HierarchicalAllocator(
Span<Span<uint8_t>> buffers,
Span<const etensor::Device> planned_buffer_devices)
: buffers_(buffers), planned_buffer_devices_(planned_buffer_devices) {
ET_CHECK_MSG(
planned_buffer_devices.size() == buffers.size(),
"planned_buffer_devices size (%" ET_PRIsize_t
") must match buffers size (%" ET_PRIsize_t ")",
planned_buffer_devices.size(),
buffers.size());
}

/**
* DEPRECATED: Use spans instead.
*/
Expand Down Expand Up @@ -88,6 +113,17 @@ class HierarchicalAllocator final {
return buffer.data() + offset_bytes;
}

/**
 * Device metadata for the planned buffers.
 *
 * @returns The span handed to the two-argument constructor: one `Device`
 *     per buffer, carrying both type and index so that callers can tell
 *     e.g. `cuda:0` from `cuda:1`. Empty when the allocator was built
 *     without device metadata (CPU-only program).
 */
Span<const etensor::Device> planned_buffer_devices() const {
  return planned_buffer_devices_;
}

private:
// TODO(T162089316): Remove the span array and to_spans once all users move to
// spans. This array is necessary to hold the pointers and sizes that were
Expand All @@ -113,6 +149,10 @@ class HierarchicalAllocator final {

/// The underlying buffers.
Span<Span<uint8_t>> buffers_;

/// Per-buffer device metadata. Empty when no device info was provided
/// (CPU-only program).
Span<const etensor::Device> planned_buffer_devices_;
};

} // namespace runtime
Expand Down
1 change: 1 addition & 0 deletions runtime/core/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def define_common_targets():
],
exported_deps = [
":core",
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/core/portable_type/c10/c10:c10",
],
visibility = ["PUBLIC"],
Expand Down
65 changes: 65 additions & 0 deletions runtime/core/test/hierarchical_allocator_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@

#include <executorch/runtime/core/hierarchical_allocator.h>
#include <executorch/runtime/core/memory_allocator.h>
#include <executorch/runtime/core/portable_type/device.h>
#include <executorch/runtime/core/span.h>
#include <executorch/runtime/platform/runtime.h>
#include <executorch/test/utils/DeathTest.h>
#include <executorch/test/utils/alignment.h>

#include <gtest/gtest.h>
Expand All @@ -22,6 +24,8 @@ using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::Result;
using executorch::runtime::Span;
using executorch::runtime::etensor::Device;
using executorch::runtime::etensor::DeviceType;

class HierarchicalAllocatorTest : public ::testing::Test {
protected:
Expand Down Expand Up @@ -87,6 +91,67 @@ TEST_F(HierarchicalAllocatorTest, Smoke) {
}
}

TEST_F(HierarchicalAllocatorTest, NoDeviceMetadataByDefault) {
  // The single-argument constructor takes no device information, so the
  // allocator is expected to report an empty device span.
  Span<Span<uint8_t>> no_buffers{};
  HierarchicalAllocator allocator(no_buffers);

  Span<const Device> reported = allocator.planned_buffer_devices();
  EXPECT_EQ(reported.size(), 0);
}

TEST_F(HierarchicalAllocatorTest, ExposesDeviceMetadataWhenProvided) {
  // Four planned buffers, matched one-to-one by four device entries.
  constexpr size_t n_buffers = 4;
  uint8_t mem0[4];
  uint8_t mem1[4];
  uint8_t mem2[4];
  uint8_t mem3[4];
  Span<uint8_t> buffers[n_buffers]{
      {mem0, sizeof(mem0)},
      {mem1, sizeof(mem1)},
      {mem2, sizeof(mem2)},
      {mem3, sizeof(mem3)},
  };

  // Two CPU entries followed by two CUDA entries. The CUDA entries use
  // distinct device indices to check that the index is tracked per buffer.
  Device devices[] = {
      Device(DeviceType::CPU, 0),
      Device(DeviceType::CPU, 0),
      Device(DeviceType::CUDA, 0),
      Device(DeviceType::CUDA, 1),
  };
  Span<const Device> device_span(devices, n_buffers);

  HierarchicalAllocator allocator({buffers, n_buffers}, device_span);

  // Expected values live in their own array so the comparison is not made
  // against the very storage the span points at.
  const Device expected[n_buffers] = {
      Device(DeviceType::CPU, 0),
      Device(DeviceType::CPU, 0),
      Device(DeviceType::CUDA, 0),
      Device(DeviceType::CUDA, 1),
  };

  Span<const Device> reported = allocator.planned_buffer_devices();
  ASSERT_EQ(reported.size(), n_buffers);
  for (size_t i = 0; i < n_buffers; ++i) {
    EXPECT_EQ(reported[i], expected[i]);
  }
}

TEST_F(HierarchicalAllocatorTest, MismatchedDeviceCountAborts) {
  // The two-argument constructor requires exactly one device entry per
  // buffer. Exercise both directions of the mismatch.
  constexpr size_t n_buffers = 2;
  uint8_t mem0[4];
  uint8_t mem1[4];
  Span<uint8_t> buffers[n_buffers]{
      {mem0, sizeof(mem0)},
      {mem1, sizeof(mem1)},
  };

  // Backing storage large enough for the "too many" case; the "too few" case
  // simply views a shorter prefix of it.
  Device devices[] = {
      Device(DeviceType::CPU, 0),
      Device(DeviceType::CPU, 0),
      Device(DeviceType::CUDA, 0),
  };

  // 3 device entries vs 2 buffers — should abort.
  Span<const Device> too_many(devices, n_buffers + 1);
  ET_EXPECT_DEATH(HierarchicalAllocator({buffers, n_buffers}, too_many), "");

  // 1 device entry vs 2 buffers — should abort as well.
  Span<const Device> too_few(devices, n_buffers - 1);
  ET_EXPECT_DEATH(HierarchicalAllocator({buffers, n_buffers}, too_few), "");
}

// TODO(T162089316): Tests the deprecated API. Remove this when removing the
// API.
TEST_F(HierarchicalAllocatorTest, DEPRECATEDSmoke) {
Expand Down
1 change: 1 addition & 0 deletions runtime/core/test/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def define_common_targets():
],
deps = [
"//executorch/runtime/core:memory_allocator",
"//executorch/test/utils:utils",
],
)

Expand Down
28 changes: 27 additions & 1 deletion runtime/executor/memory_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

#include <executorch/runtime/core/hierarchical_allocator.h>
#include <executorch/runtime/core/memory_allocator.h>
#include <executorch/runtime/core/portable_type/device.h>
#include <executorch/runtime/core/span.h>

namespace executorch {
namespace runtime {
Expand Down Expand Up @@ -42,7 +44,8 @@ class MemoryManager final {
* must agree with the corresponding
* `MethodMeta::num_memory_planned_buffers()` and
* `MethodMeta::memory_planned_buffer_size(N)` values, which are embedded
* in the Program.
* in the Program. For device-aware programs, the per-buffer device
* metadata is supplied through the HierarchicalAllocator as well, which
* holds it as a non-owning Span.
* @param[in] temp_allocator The allocator to use when allocating temporary
* data during kernel or delegate execution. Must outlive the Method that
* uses it. May be `nullptr` if the Method does not use kernels or
Expand Down Expand Up @@ -105,6 +108,29 @@ class MemoryManager final {
return temp_allocator_;
}

/**
 * Device metadata for the planned memory buffers, forwarded from
 * `HierarchicalAllocator::planned_buffer_devices()`.
 *
 * @returns One `Device` per planned memory buffer, or an empty span when
 *     `planned_memory` is null or no device metadata was provided (CPU-only
 *     program).
 */
Span<const etensor::Device> planned_buffer_devices() const {
  // With a planned-memory allocator present, hand back whatever it holds.
  if (planned_memory_ != nullptr) {
    return planned_memory_->planned_buffer_devices();
  }
  return {};
}

/**
 * Reports whether device metadata is attached to the planned buffers.
 *
 * This only checks that `planned_buffer_devices()` is non-empty; the entries
 * themselves are not inspected, so metadata consisting solely of CPU devices
 * still yields true.
 *
 * @returns true if per-buffer device metadata was provided; false means the
 *     memory setup is CPU-only.
 */
bool has_device_memory() const {
  const auto devices = planned_buffer_devices();
  return devices.size() != 0;
}

private:
MemoryAllocator* method_allocator_;
HierarchicalAllocator* planned_memory_;
Expand Down
64 changes: 64 additions & 0 deletions runtime/executor/test/memory_manager_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ using namespace ::testing;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::Span;
using executorch::runtime::etensor::Device;
using executorch::runtime::etensor::DeviceType;

TEST(MemoryManagerTest, MinimalCtor) {
MemoryAllocator method_allocator(0, nullptr);
Expand Down Expand Up @@ -93,3 +96,64 @@ TEST(MemoryManagerTest, CtorWithSameAllocator) {
/*temp_allocator=*/&method_allocator),
"cannot be the same");
}

TEST(MemoryManagerTest, ThreeArgCtorHasNoDeviceMemory) {
  // The planned-memory allocator is built without device metadata, so the
  // manager should report none either.
  MemoryAllocator method_allocator(0, nullptr);
  HierarchicalAllocator planned_memory({});
  MemoryAllocator temp_allocator(0, nullptr);

  MemoryManager mm(&method_allocator, &planned_memory, &temp_allocator);

  const bool device_memory_present = mm.has_device_memory();
  EXPECT_FALSE(device_memory_present);

  Span<const Device> reported = mm.planned_buffer_devices();
  EXPECT_EQ(reported.size(), 0);
}

TEST(MemoryManagerTest, DelegatesDeviceMetadataToHierarchicalAllocator) {
  MemoryAllocator method_allocator(0, nullptr);
  MemoryAllocator temp_allocator(0, nullptr);

  // Four planned buffers tagged cpu:0, cpu:0, cuda:0, cuda:1. The two CUDA
  // entries use distinct indices to check that the index is tracked per
  // buffer.
  constexpr size_t n_buffers = 4;
  uint8_t mem0[4];
  uint8_t mem1[4];
  uint8_t mem2[4];
  uint8_t mem3[4];
  Span<uint8_t> buffers[n_buffers]{
      {mem0, sizeof(mem0)},
      {mem1, sizeof(mem1)},
      {mem2, sizeof(mem2)},
      {mem3, sizeof(mem3)},
  };
  Device devices[] = {
      Device(DeviceType::CPU, 0),
      Device(DeviceType::CPU, 0),
      Device(DeviceType::CUDA, 0),
      Device(DeviceType::CUDA, 1),
  };
  Span<const Device> device_span(devices, n_buffers);

  HierarchicalAllocator planned_memory({buffers, n_buffers}, device_span);
  MemoryManager mm(&method_allocator, &planned_memory, &temp_allocator);

  // The plain accessors must still return exactly what was passed in.
  EXPECT_EQ(mm.method_allocator(), &method_allocator);
  EXPECT_EQ(mm.planned_memory(), &planned_memory);
  EXPECT_EQ(mm.temp_allocator(), &temp_allocator);

  // Device metadata is reported through the manager unchanged. Expected
  // values live in their own array so the comparison is not made against
  // the very storage the span points at.
  EXPECT_TRUE(mm.has_device_memory());
  EXPECT_EQ(mm.planned_buffer_devices().size(), n_buffers);
  const Device expected[n_buffers] = {
      Device(DeviceType::CPU, 0),
      Device(DeviceType::CPU, 0),
      Device(DeviceType::CUDA, 0),
      Device(DeviceType::CUDA, 1),
  };
  for (size_t i = 0; i < n_buffers; ++i) {
    EXPECT_EQ(mm.planned_buffer_devices()[i], expected[i]);
  }
}

TEST(MemoryManagerTest, MinimalCtorHasNoDeviceMemory) {
  // Only a method allocator is supplied; no planned memory is passed in, so
  // there is nothing to carry device metadata.
  MemoryAllocator method_allocator(0, nullptr);
  MemoryManager mm(&method_allocator);

  Span<const Device> reported = mm.planned_buffer_devices();
  EXPECT_EQ(reported.size(), 0);
  EXPECT_FALSE(mm.has_device_memory());
}
3 changes: 3 additions & 0 deletions runtime/executor/test/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def define_common_targets(is_fbcode = False):
"//executorch/exir/backend/test/...",
"//executorch/runtime/backend/...",
"//executorch/extension/pybindings/...",
"//executorch/extension/module/test/...",
"//executorch/devtools/fb/runners/...",
"//executorch/test/...",
"//executorch/examples/...",
Expand Down Expand Up @@ -326,6 +327,8 @@ def define_common_targets(is_fbcode = False):
deps = [
":managed_memory_manager",
"//executorch/runtime/executor:program",
"//executorch/runtime/core:device_allocator",
"//executorch/runtime/core:device_memory_buffer",
"//executorch/extension/data_loader:file_data_loader",
"//executorch/schema:program",
],
Expand Down
Loading
Loading