pytorch · Gasoonjia · May 13, 2026 · May 13, 2026 · May 13, 2026
@@ -2073,4 +2073,9 @@ def plan(self) -> ExecutionPlan:
                 self.module.meta["non_const_buffer_sizes"],
             ),
             container_meta_type=self.container_meta_type,
+            # non_const_buffer_device is set by apply_algo in memory_planning.py
+            # when device tensors are present. It is None for CPU-only programs
+            # and when enable_non_cpu_memory_planning is disabled.
+            non_const_buffer_device=self.module.meta.get(
+                "non_const_buffer_device", None
+            ),
         )
@@ -2643,3 +2643,186 @@ def forward(self, a, b):
             0,
             "No tensor should have extra device info when model runs entirely on CPU",
         )
+
+    def test_emit_non_const_buffer_device_populated_for_device_tensors(self) -> None:
+        """Verify that non_const_buffer_device is emitted into ExecutionPlan when
+        device-aware memory planning is enabled and non-CPU tensors are present."""
+        from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
+            generate_pattern_op_partitions,
+        )
+        from executorch.exir.backend.compile_spec_schema import CompileSpec
+        from executorch.exir.backend.partitioner import (
+            DelegationSpec,
+            Partitioner,
+            PartitionResult,
+        )
+        from executorch.exir.backend.test.backend_with_compiler_demo import (
+            BackendWithCompilerDemo,
+        )
+        from executorch.exir.passes.propagate_device_pass import (
+            TARGET_DEVICE_COMPILE_SPEC_KEY,
+        )
+        from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
+
+        class AddSupport(OperatorSupportBase):
+            def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+                return node.op == "call_function" and node.target in [
+                    exir_ops.edge.aten.add.Tensor,
+                ]
+
+        class DevicePartitioner(Partitioner):
+            def __init__(self):
+                super().__init__()
+                self.delegation_spec = DelegationSpec(
+                    BackendWithCompilerDemo.__name__,
+                    [
+                        CompileSpec("max_value", bytes([4])),
+                        CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
+                    ],
+                )
+
+            def partition(self, exported_program) -> PartitionResult:
+                partition_tags = {}
+                partition_list = generate_pattern_op_partitions(
+                    exported_program.graph_module,
+                    op_support=any_chain(AddSupport()),
+                )
+                for partition in partition_list:
+                    for node in partition.nodes:
+                        tag = f"tag{partition.id}"
+                        node.meta["delegation_tag"] = tag
+                        partition_tags[tag] = self.delegation_spec
+                return PartitionResult(
+                    tagged_exported_program=exported_program,
+                    partition_tags=partition_tags,
+                )
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        lowered = edge.to_backend(DevicePartitioner())
+        et_prog = lowered.to_executorch(
+            config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
+        )
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNotNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be set when device tensors are present "
+            "and enable_non_cpu_memory_planning is True",
+        )
+        self.assertGreater(len(plan.non_const_buffer_device), 0)
+        for entry in plan.non_const_buffer_device:
+            self.assertEqual(entry.device_type, schema.DeviceType.CUDA)
+            self.assertEqual(entry.device_index, 0)
+
+    def test_emit_non_const_buffer_device_none_for_cpu_only(self) -> None:
+        """When all tensors are on CPU, non_const_buffer_device should be None
+        even with enable_non_cpu_memory_planning=True."""
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        et_prog = edge.to_executorch(
+            config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
+        )
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be None for CPU-only programs",
+        )
+
+    def test_emit_non_const_buffer_device_none_when_flag_disabled(self) -> None:
+        """Even with device tensors, non_const_buffer_device should be None when
+        enable_non_cpu_memory_planning is False (default)."""
+        from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
+            generate_pattern_op_partitions,
+        )
+        from executorch.exir.backend.compile_spec_schema import CompileSpec
+        from executorch.exir.backend.partitioner import (
+            DelegationSpec,
+            Partitioner,
+            PartitionResult,
+        )
+        from executorch.exir.backend.test.backend_with_compiler_demo import (
+            BackendWithCompilerDemo,
+        )
+        from executorch.exir.passes.propagate_device_pass import (
+            TARGET_DEVICE_COMPILE_SPEC_KEY,
+        )
+        from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
+
+        class AddSupport(OperatorSupportBase):
+            def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+                return node.op == "call_function" and node.target in [
+                    exir_ops.edge.aten.add.Tensor,
+                ]
+
+        class DevicePartitioner(Partitioner):
+            def __init__(self):
+                super().__init__()
+                self.delegation_spec = DelegationSpec(
+                    BackendWithCompilerDemo.__name__,
+                    [
+                        CompileSpec("max_value", bytes([4])),
+                        CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
+                    ],
+                )
+
+            def partition(self, exported_program) -> PartitionResult:
+                partition_tags = {}
+                partition_list = generate_pattern_op_partitions(
+                    exported_program.graph_module,
+                    op_support=any_chain(AddSupport()),
+                )
+                for partition in partition_list:
+                    for node in partition.nodes:
+                        tag = f"tag{partition.id}"
+                        node.meta["delegation_tag"] = tag
+                        partition_tags[tag] = self.delegation_spec
+                return PartitionResult(
+                    tagged_exported_program=exported_program,
+                    partition_tags=partition_tags,
+                )
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        lowered = edge.to_backend(DevicePartitioner())
+        # Default: enable_non_cpu_memory_planning=False
+        et_prog = lowered.to_executorch()
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be None when "
+            "enable_non_cpu_memory_planning is False",
+        )
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_memory_buffer.h>
+
+namespace executorch::runtime {
+
+// Allocates `size` bytes on device (`type`, `index`) through the allocator
+// returned by get_device_allocator(type) and wraps the pointer in an owning
+// DeviceMemoryBuffer. Returns Error::NotFound when no allocator is available
+// for `type`, or forwards the allocator's error when the allocation fails.
+Result<DeviceMemoryBuffer> DeviceMemoryBuffer::create(
+    size_t size,
+    etensor::DeviceType type,
+    etensor::DeviceIndex index,
+    size_t alignment) {
+  DeviceAllocator* device_allocator = get_device_allocator(type);
+  if (!device_allocator) {
+    ET_LOG(
+        Error,
+        "No device allocator registered for device type %d",
+        static_cast<int>(type));
+    return Error::NotFound;
+  }
+
+  auto allocation = device_allocator->allocate(size, index, alignment);
+  if (allocation.ok()) {
+    // Hand the raw pointer to the buffer together with the allocator and
+    // device index it needs to free the memory later.
+    return DeviceMemoryBuffer(
+        allocation.get(), size, device_allocator, index);
+  }
+  return allocation.error();
+}
+
+} // namespace executorch::runtime
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+
+namespace executorch::runtime {
+
+/**
+ * Move-only RAII owner of a single device memory allocation.
+ *
+ * The destructor hands the pointer back via DeviceAllocator::deallocate().
+ * It plays the role for device memory (CUDA, etc.) that
+ * std::vector<uint8_t> plays for CPU planned buffers.
+ *
+ * Copying is disabled; moving transfers ownership and leaves the source
+ * empty.
+ */
+class DeviceMemoryBuffer final {
+ public:
+  /**
+   * Allocates device memory and wraps it in a DeviceMemoryBuffer.
+   *
+   * The DeviceAllocator for the given device type is looked up via the
+   * DeviceAllocatorRegistry. Returns Error::NotFound if no allocator is
+   * registered for that type.
+   *
+   * @param size Number of bytes to allocate.
+   * @param type The device type (e.g., CUDA).
+   * @param index The device index (e.g., 0 for cuda:0).
+   * @param alignment Minimum alignment of the returned pointer in bytes.
+   *     Must be a power of 2. Defaults to DeviceAllocator::kDefaultAlignment.
+   * @return A Result holding the DeviceMemoryBuffer on success, or an error.
+   */
+  static Result<DeviceMemoryBuffer> create(
+      size_t size,
+      etensor::DeviceType type,
+      etensor::DeviceIndex index = 0,
+      size_t alignment = DeviceAllocator::kDefaultAlignment);
+
+  /// Constructs an empty buffer that owns nothing.
+  DeviceMemoryBuffer() = default;
+
+  /// Frees the owned allocation, if any.
+  ~DeviceMemoryBuffer() {
+    free_owned();
+  }
+
+  /// Move constructor: takes over `other`'s allocation and empties `other`.
+  DeviceMemoryBuffer(DeviceMemoryBuffer&& other) noexcept
+      : ptr_(other.ptr_),
+        size_(other.size_),
+        allocator_(other.allocator_),
+        device_index_(other.device_index_) {
+    other.disown();
+  }
+
+  /// Move assignment: frees the current allocation, then takes over
+  /// `other`'s. Self-assignment is a no-op.
+  DeviceMemoryBuffer& operator=(DeviceMemoryBuffer&& other) noexcept {
+    if (this == &other) {
+      return *this;
+    }
+    free_owned();
+    ptr_ = other.ptr_;
+    size_ = other.size_;
+    allocator_ = other.allocator_;
+    device_index_ = other.device_index_;
+    other.disown();
+    return *this;
+  }
+
+  // Copying is not allowed.
+  DeviceMemoryBuffer(const DeviceMemoryBuffer&) = delete;
+  DeviceMemoryBuffer& operator=(const DeviceMemoryBuffer&) = delete;
+
+  /// Device pointer of the allocation; nullptr when empty or moved-from.
+  void* data() const {
+    return ptr_;
+  }
+
+  /// Size of the allocation in bytes.
+  size_t size() const {
+    return size_;
+  }
+
+  /**
+   * Returns a Span<uint8_t> over the device pointer.
+   *
+   * Intended for HierarchicalAllocator, which only does pointer arithmetic
+   * on the span data and never dereferences it. Device pointers are valid
+   * for pointer arithmetic from the CPU side.
+   */
+  Span<uint8_t> as_span() const {
+    return {static_cast<uint8_t*>(ptr_), size_};
+  }
+
+ private:
+  // Used by create() to adopt an allocation that already exists.
+  DeviceMemoryBuffer(
+      void* ptr,
+      size_t size,
+      DeviceAllocator* allocator,
+      etensor::DeviceIndex device_index)
+      : ptr_(ptr),
+        size_(size),
+        allocator_(allocator),
+        device_index_(device_index) {}
+
+  // Returns the owned pointer to its allocator when both are set. The
+  // members are left as they are; callers overwrite them or are being
+  // destroyed.
+  void free_owned() {
+    if (ptr_ != nullptr && allocator_ != nullptr) {
+      allocator_->deallocate(ptr_, device_index_);
+    }
+  }
+
+  // Drops ownership without freeing; used on the source of a move.
+  // device_index_ is intentionally left unchanged.
+  void disown() {
+    ptr_ = nullptr;
+    size_ = 0;
+    allocator_ = nullptr;
+  }
+
+  void* ptr_ = nullptr;
+  size_t size_ = 0;
+  DeviceAllocator* allocator_ = nullptr;
+  etensor::DeviceIndex device_index_ = 0;
+};
+
+} // namespace executorch::runtime
@@ -28,6 +28,7 @@ def define_common_targets():
             "//executorch/extension/fb/dynamic_shim/...",
             "//executorch/kernels/portable/cpu/...",
             "//executorch/runtime/core:device_allocator",
+            "//executorch/runtime/core/...",
             "//executorch/runtime/core/exec_aten/...",
             "//executorch/runtime/core/portable_type/test/...",
         ],

@@ -155,6 +155,17 @@ def define_common_targets():
             visibility = ["//executorch/..."],
         )
 
+    # Library target for DeviceMemoryBuffer: builds device_memory_buffer.cpp
+    # and exports device_memory_buffer.h. :core and :device_allocator are
+    # listed as exported_deps, and the target is visible to every package
+    # (PUBLIC).
+    runtime.cxx_library(
+        name = "device_memory_buffer",
+        srcs = ["device_memory_buffer.cpp"],
+        exported_headers = ["device_memory_buffer.h"],
+        exported_deps = [
+            ":core",
+            ":device_allocator",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
     runtime.cxx_library(
         name = "tag",
         srcs = ["tag.cpp"],