Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions exir/emit/_emitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2073,4 +2073,9 @@ def plan(self) -> ExecutionPlan:
self.module.meta["non_const_buffer_sizes"],
),
container_meta_type=self.container_meta_type,
# non_const_buffer_device is set by apply_algo in memory_planning.py
# when device tensors are present. None for CPU-only programs.
non_const_buffer_device=self.module.meta.get(
"non_const_buffer_device", None
),
)
183 changes: 183 additions & 0 deletions exir/emit/test/test_emit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2643,3 +2643,186 @@ def forward(self, a, b):
0,
"No tensor should have extra device info when model runs entirely on CPU",
)

def test_emit_non_const_buffer_device_populated_for_device_tensors(self) -> None:
    """Check that a cuda:0-targeted delegate produces device buffer entries.

    A single ``add`` is delegated to ``BackendWithCompilerDemo`` with a
    ``TARGET_DEVICE_COMPILE_SPEC_KEY`` compile spec of ``b"cuda:0"``. The
    program is then emitted with ``enable_non_cpu_memory_planning=True`` and
    the first execution plan must expose a non-empty
    ``non_const_buffer_device`` whose entries are all CUDA with index 0.
    """
    from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
        generate_pattern_op_partitions,
    )
    from executorch.exir.backend.compile_spec_schema import CompileSpec
    from executorch.exir.backend.partitioner import (
        DelegationSpec,
        Partitioner,
        PartitionResult,
    )
    from executorch.exir.backend.test.backend_with_compiler_demo import (
        BackendWithCompilerDemo,
    )
    from executorch.exir.passes.propagate_device_pass import (
        TARGET_DEVICE_COMPILE_SPEC_KEY,
    )
    from torch.fx.passes.operator_support import any_chain, OperatorSupportBase

    class AddSupport(OperatorSupportBase):
        # Accepts only call_function nodes whose target is edge aten.add.Tensor.
        def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
            if node.op != "call_function":
                return False
            return node.target in (exir_ops.edge.aten.add.Tensor,)

    class DevicePartitioner(Partitioner):
        # Tags every matched partition with a delegation spec that carries
        # the target-device compile spec b"cuda:0".
        def __init__(self):
            super().__init__()
            compile_specs = [
                CompileSpec("max_value", bytes([4])),
                CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
            ]
            self.delegation_spec = DelegationSpec(
                BackendWithCompilerDemo.__name__,
                compile_specs,
            )

        def partition(self, exported_program) -> PartitionResult:
            tag_to_spec = {}
            matched_partitions = generate_pattern_op_partitions(
                exported_program.graph_module,
                op_support=any_chain(AddSupport()),
            )
            for matched in matched_partitions:
                # One tag per partition, shared by all of its nodes.
                tag = f"tag{matched.id}"
                for node in matched.nodes:
                    node.meta["delegation_tag"] = tag
                    tag_to_spec[tag] = self.delegation_spec
            return PartitionResult(
                tagged_exported_program=exported_program,
                partition_tags=tag_to_spec,
            )

    class Model(torch.nn.Module):
        def forward(self, a, b):
            return torch.add(a, b)

    module = Model()
    example_inputs = (torch.randn(2, 2), torch.randn(2, 2))

    edge_manager = to_edge(
        export(module, example_inputs),
        compile_config=EdgeCompileConfig(_check_ir_validity=False),
    )
    delegated = edge_manager.to_backend(DevicePartitioner())
    backend_config = ExecutorchBackendConfig(enable_non_cpu_memory_planning=True)
    executorch_program = delegated.to_executorch(config=backend_config)

    first_plan = executorch_program._emitter_output.program.execution_plan[0]
    buffer_devices = first_plan.non_const_buffer_device
    self.assertIsNotNone(
        buffer_devices,
        "non_const_buffer_device should be set when device tensors are present "
        "and enable_non_cpu_memory_planning is True",
    )
    self.assertGreater(len(buffer_devices), 0)
    for device_entry in buffer_devices:
        self.assertEqual(device_entry.device_type, schema.DeviceType.CUDA)
        self.assertEqual(device_entry.device_index, 0)

def test_emit_non_const_buffer_device_none_for_cpu_only(self) -> None:
    """Check that an undelegated program emits no buffer device info.

    The model is lowered without any partitioner, so this test introduces no
    device-targeted delegate. Even with
    ``enable_non_cpu_memory_planning=True`` the first execution plan's
    ``non_const_buffer_device`` must be ``None``.
    """

    class Model(torch.nn.Module):
        def forward(self, a, b):
            return torch.add(a, b)

    module = Model()
    example_inputs = (torch.randn(2, 2), torch.randn(2, 2))

    edge_manager = to_edge(
        export(module, example_inputs),
        compile_config=EdgeCompileConfig(_check_ir_validity=False),
    )
    backend_config = ExecutorchBackendConfig(enable_non_cpu_memory_planning=True)
    executorch_program = edge_manager.to_executorch(config=backend_config)

    first_plan = executorch_program._emitter_output.program.execution_plan[0]
    self.assertIsNone(
        first_plan.non_const_buffer_device,
        "non_const_buffer_device should be None for CPU-only programs",
    )

def test_emit_non_const_buffer_device_none_when_flag_disabled(self) -> None:
    """Check that buffer device info is omitted when the flag is left off.

    A single ``add`` is delegated to ``BackendWithCompilerDemo`` with a
    ``TARGET_DEVICE_COMPILE_SPEC_KEY`` compile spec of ``b"cuda:0"``, but
    ``to_executorch`` is called with its default config. The first execution
    plan's ``non_const_buffer_device`` must then be ``None``.
    """
    from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
        generate_pattern_op_partitions,
    )
    from executorch.exir.backend.compile_spec_schema import CompileSpec
    from executorch.exir.backend.partitioner import (
        DelegationSpec,
        Partitioner,
        PartitionResult,
    )
    from executorch.exir.backend.test.backend_with_compiler_demo import (
        BackendWithCompilerDemo,
    )
    from executorch.exir.passes.propagate_device_pass import (
        TARGET_DEVICE_COMPILE_SPEC_KEY,
    )
    from torch.fx.passes.operator_support import any_chain, OperatorSupportBase

    class AddSupport(OperatorSupportBase):
        # Accepts only call_function nodes whose target is edge aten.add.Tensor.
        def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
            if node.op != "call_function":
                return False
            return node.target in (exir_ops.edge.aten.add.Tensor,)

    class DevicePartitioner(Partitioner):
        # Tags every matched partition with a delegation spec that carries
        # the target-device compile spec b"cuda:0".
        def __init__(self):
            super().__init__()
            compile_specs = [
                CompileSpec("max_value", bytes([4])),
                CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
            ]
            self.delegation_spec = DelegationSpec(
                BackendWithCompilerDemo.__name__,
                compile_specs,
            )

        def partition(self, exported_program) -> PartitionResult:
            tag_to_spec = {}
            matched_partitions = generate_pattern_op_partitions(
                exported_program.graph_module,
                op_support=any_chain(AddSupport()),
            )
            for matched in matched_partitions:
                # One tag per partition, shared by all of its nodes.
                tag = f"tag{matched.id}"
                for node in matched.nodes:
                    node.meta["delegation_tag"] = tag
                    tag_to_spec[tag] = self.delegation_spec
            return PartitionResult(
                tagged_exported_program=exported_program,
                partition_tags=tag_to_spec,
            )

    class Model(torch.nn.Module):
        def forward(self, a, b):
            return torch.add(a, b)

    module = Model()
    example_inputs = (torch.randn(2, 2), torch.randn(2, 2))

    edge_manager = to_edge(
        export(module, example_inputs),
        compile_config=EdgeCompileConfig(_check_ir_validity=False),
    )
    delegated = edge_manager.to_backend(DevicePartitioner())
    # No config passed: enable_non_cpu_memory_planning keeps its default (False).
    executorch_program = delegated.to_executorch()

    first_plan = executorch_program._emitter_output.program.execution_plan[0]
    self.assertIsNone(
        first_plan.non_const_buffer_device,
        "non_const_buffer_device should be None when "
        "enable_non_cpu_memory_planning is False",
    )
35 changes: 35 additions & 0 deletions runtime/core/device_memory_buffer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/runtime/core/device_memory_buffer.h>

namespace executorch::runtime {

// Allocates `size` bytes on device `index` of the given device `type` and
// wraps the pointer in an owning DeviceMemoryBuffer.
//
// Returns Error::NotFound (after logging) when get_device_allocator() yields
// no allocator for `type`; otherwise forwards any error reported by the
// allocator's allocate() call.
Result<DeviceMemoryBuffer> DeviceMemoryBuffer::create(
    size_t size,
    etensor::DeviceType type,
    etensor::DeviceIndex index,
    size_t alignment) {
  DeviceAllocator* const device_allocator = get_device_allocator(type);
  if (!device_allocator) {
    ET_LOG(
        Error,
        "No device allocator registered for device type %d",
        static_cast<int>(type));
    return Error::NotFound;
  }

  auto allocation = device_allocator->allocate(size, index, alignment);
  if (allocation.ok()) {
    // The buffer remembers the allocator and index so it can free itself.
    return DeviceMemoryBuffer(allocation.get(), size, device_allocator, index);
  }
  return allocation.error();
}

} // namespace executorch::runtime
129 changes: 129 additions & 0 deletions runtime/core/device_memory_buffer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <cstddef>
#include <cstdint>

#include <executorch/runtime/core/device_allocator.h>
#include <executorch/runtime/core/result.h>
#include <executorch/runtime/core/span.h>

namespace executorch::runtime {

/**
 * Move-only owner of a single block of device memory.
 *
 * When a non-empty buffer is destroyed, or is the target of a move
 * assignment, its pointer is returned to the stored allocator through
 * DeviceAllocator::deallocate(). It plays the role std::vector<uint8_t>
 * plays for CPU planned buffers, but for device memory (CUDA, etc.).
 *
 * Copying is disabled. Moving transfers ownership and leaves the source
 * holding a null pointer, zero size and no allocator.
 */
class DeviceMemoryBuffer final {
 public:
  /**
   * Allocates device memory and wraps it in a DeviceMemoryBuffer.
   *
   * The DeviceAllocator for the given device type is looked up via the
   * DeviceAllocatorRegistry. If no allocator is registered for the type,
   * Error::NotFound is returned.
   *
   * @param size Number of bytes to allocate.
   * @param type The device type (e.g., CUDA).
   * @param index The device index (e.g., 0 for cuda:0).
   * @param alignment Minimum alignment of the returned pointer in bytes.
   *     Must be a power of 2. Defaults to DeviceAllocator::kDefaultAlignment.
   * @return A Result holding the DeviceMemoryBuffer on success, or an error.
   */
  static Result<DeviceMemoryBuffer> create(
      size_t size,
      etensor::DeviceType type,
      etensor::DeviceIndex index = 0,
      size_t alignment = DeviceAllocator::kDefaultAlignment);

  /// Constructs an empty buffer: null pointer, zero size, no allocator.
  DeviceMemoryBuffer() = default;

  // Copying would create two owners of one allocation, so it is disabled.
  DeviceMemoryBuffer(const DeviceMemoryBuffer&) = delete;
  DeviceMemoryBuffer& operator=(const DeviceMemoryBuffer&) = delete;

  /// Takes over `other`'s allocation; `other` is left empty.
  DeviceMemoryBuffer(DeviceMemoryBuffer&& other) noexcept {
    take_from(other);
  }

  /// Frees any allocation currently held, then takes over `other`'s.
  /// Self-assignment is a no-op.
  DeviceMemoryBuffer& operator=(DeviceMemoryBuffer&& other) noexcept {
    if (this == &other) {
      return *this;
    }
    free_if_owned();
    take_from(other);
    return *this;
  }

  ~DeviceMemoryBuffer() {
    free_if_owned();
  }

  /// Returns the device pointer, or nullptr if empty/moved-from.
  void* data() const {
    return ptr_;
  }

  /// Returns the size in bytes of the allocation.
  size_t size() const {
    return size_;
  }

  /**
   * Returns a Span<uint8_t> covering the device pointer and size().
   *
   * This is intended for use with HierarchicalAllocator, which only performs
   * pointer arithmetic on the span data and never dereferences it. Device
   * pointers are valid for pointer arithmetic from the CPU side.
   */
  Span<uint8_t> as_span() const {
    uint8_t* const bytes = static_cast<uint8_t*>(ptr_);
    return {bytes, size_};
  }

 private:
  // Used by create() to wrap an allocation that has already succeeded.
  DeviceMemoryBuffer(
      void* ptr,
      size_t size,
      DeviceAllocator* allocator,
      etensor::DeviceIndex device_index)
      : ptr_(ptr),
        size_(size),
        allocator_(allocator),
        device_index_(device_index) {}

  // Hands the pointer back to the allocator. Does nothing when either the
  // pointer or the allocator is null. Fields are left untouched; callers
  // overwrite them or are being destroyed.
  void free_if_owned() {
    if (ptr_ == nullptr || allocator_ == nullptr) {
      return;
    }
    allocator_->deallocate(ptr_, device_index_);
  }

  // Copies every field from `other`, then clears `other`'s pointer, size
  // and allocator so that it no longer owns the allocation.
  void take_from(DeviceMemoryBuffer& other) {
    ptr_ = other.ptr_;
    size_ = other.size_;
    allocator_ = other.allocator_;
    device_index_ = other.device_index_;
    other.ptr_ = nullptr;
    other.size_ = 0;
    other.allocator_ = nullptr;
  }

  void* ptr_ = nullptr;
  size_t size_ = 0;
  DeviceAllocator* allocator_ = nullptr;
  etensor::DeviceIndex device_index_ = 0;
};

} // namespace executorch::runtime
1 change: 1 addition & 0 deletions runtime/core/portable_type/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def define_common_targets():
"//executorch/extension/fb/dynamic_shim/...",
"//executorch/kernels/portable/cpu/...",
"//executorch/runtime/core:device_allocator",
"//executorch/runtime/core/...",
"//executorch/runtime/core/exec_aten/...",
"//executorch/runtime/core/portable_type/test/...",
],
Expand Down
11 changes: 11 additions & 0 deletions runtime/core/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,17 @@ def define_common_targets():
visibility = ["//executorch/..."],
)

# C++ library target for DeviceMemoryBuffer, built from
# device_memory_buffer.cpp and exporting device_memory_buffer.h.
# It lists :core and :device_allocator as exported_deps and is
# visible to all targets ("PUBLIC").
runtime.cxx_library(
name = "device_memory_buffer",
srcs = ["device_memory_buffer.cpp"],
exported_headers = ["device_memory_buffer.h"],
exported_deps = [
":core",
":device_allocator",
],
visibility = ["PUBLIC"],
)

runtime.cxx_library(
name = "tag",
srcs = ["tag.cpp"],
Expand Down
Loading
Loading