From 9f5c14a7c7b2d92f4db45db48b39350007cff452 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 18 Feb 2026 11:25:39 -0800
Subject: [PATCH 01/15] [ET Device Support] Schema changes: device info on
 Tensor and buffer-level device array

This diff adds device placement information to the ExecuTorch schema to support representing tensor-level device type information, which will be the basic requirement for the following tensor_parser updates.

This is part of the Phase 1 implementation to make ET device type work E2E without user-specified device placement.

Design doc: https://docs.google.com/document/d/1lwd9BlohmwkN5EEvRulO_b-XnZBwv1nMb5l2K3jfuwA/edit?tab=t.0#heading=h.o6anuvkix4bu

Differential Revision: [D93635657](https://our.internmc.facebook.com/intern/diff/D93635657/)

[ghstack-poisoned]
---
 exir/schema.py     | 43 +++++++++++++++++++++++++++++++++++++
 schema/program.fbs | 53 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/exir/schema.py b/exir/schema.py
index 7dba623aebf..0d06a85acfa 100644
--- a/exir/schema.py
+++ b/exir/schema.py
@@ -48,6 +48,17 @@ class TensorDataLocation(IntEnum):
     EXTERNAL = 1
 
 
+class DeviceType(IntEnum):
+    """
+    Device type enum indicating where a tensor resides or should be allocated.
+    Note that this enum is not directly mapped to the DeviceType enum in pytorch/pytorch
+    Check program.fbs for explanations of this enum.
+    """
+
+    CPU = 0
+    CUDA = 1
+
+
 @dataclass
 class ExtraTensorInfo:
     """
@@ -57,6 +68,12 @@ class ExtraTensorInfo:
     mutable_data_segments_idx: int = 0
     fully_qualified_name: Optional[str] = None
     location: TensorDataLocation = TensorDataLocation.SEGMENT
+    # Device type where this tensor resides or should be allocated.
+    # Defaults to CPU for backward compatibility.
+    device_type: DeviceType = DeviceType.CPU
+    # Device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+    # A value of -1 indicates the default device.
+    device_index: int = -1
 
 
 @dataclass
@@ -261,6 +278,26 @@ class Operator:
     overload: str
 
 
+@dataclass
+class NonConstBufferDevice:
+    """
+    Device placement information for a non-constant memory buffer.
+    This is a sparse representation: only buffers that are NOT on CPU need entries.
+    Buffers not listed in ExecutionPlan.non_const_buffer_device default to CPU.
+    Check program.fbs for explanations.
+    """
+
+    # Index into ExecutionPlan.non_const_buffer_sizes identifying which buffer
+    # this entry applies to.
+    buffer_index: int
+    # The device type where this buffer should be allocated.
+    # Defaults to CPU for backward compatibility.
+    device_type: DeviceType = DeviceType.CPU
+    # The device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+    # A value of -1 indicates the default device.
+    device_index: int = -1
+
+
 @dataclass
 class ExecutionPlan:
     name: str
@@ -276,6 +313,12 @@ class ExecutionPlan:
     # Runtime should use the len(constant_buffer) as the ground truch of
     # constant memory buffer size, and ignore non_const_buffer_sizes[0].
     non_const_buffer_sizes: List[int]
+    # [Optional] Sparse device placement information for non-constant buffers.
+    # Only buffers that are NOT on CPU need to be listed here. Each entry
+    # specifies a buffer_index (into non_const_buffer_sizes) and its device.
+    # Buffers not listed here default to CPU, saving binary size when most
+    # buffers are on CPU.
+    non_const_buffer_device: Optional[List[NonConstBufferDevice]] = None
 
 
 @dataclass
diff --git a/schema/program.fbs b/schema/program.fbs
index 18e96de69b6..07901923410 100644
--- a/schema/program.fbs
+++ b/schema/program.fbs
@@ -61,6 +61,25 @@ enum TensorDataLocation : byte {
   EXTERNAL = 1,
 }
 
+// Device type enum indicating where a tensor resides or should be allocated.
+// Follows PyTorch DeviceType convention for compatibility.
+enum DeviceType : byte {
+  CPU = 0,
+  CUDA = 1,
+  // Reserve slots for future device types following PyTorch convention:
+  // MKLDNN = 2,
+  // OPENGL = 3,
+  // OPENCL = 4,
+  // IDEEP = 5,
+  // HIP = 6,
+  // FPGA = 7,
+  // MAIA = 8,
+  // XLA = 9,
+  // MPS = 10,
+  // XPU = 11,
+  // PrivateUse1 = 12,
+}
+
 // Table to put additional information about tensors in that is not applicable
 // to the vast majority of tensors in the vast majority of programs.
 table ExtraTensorInfo {
@@ -79,6 +98,15 @@ table ExtraTensorInfo {
   //   must be non-empty, and is used as a key to find the tensor's external
   //   data. Tensor.data_buffer_idx is ignored.
   location: TensorDataLocation;
+
+  // [Optional] The device type where this tensor resides or should be allocated.
+  // Defaults to CPU for backward compatibility with existing PTE files.
+  device_type: DeviceType = CPU;
+
+  // [Optional] The device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+  // A value of -1 indicates the default device. Defaults to -1 for backward
+  // compatibility.
+  device_index: byte = -1;
 }
 
 table Tensor {
@@ -386,6 +414,13 @@ table ExecutionPlan {
   // constants memory buffer size, and ignore non_const_buffer_sizes[0].
   non_const_buffer_sizes: [int64];
 
+  // [Optional] Sparse device placement information for non-constant buffers.
+  // Only buffers that are NOT on CPU need to be listed here. Each entry
+  // specifies a buffer_index (into non_const_buffer_sizes) and its device.
+  // Buffers not listed here default to CPU, saving binary size when most
+  // buffers are on CPU.
+  non_const_buffer_device: [NonConstBufferDevice];
+
 }
 
 // Constant tensor data stored directly in the flatbuffer.
@@ -406,6 +441,24 @@ table BackendDelegateInlineData {
   data: [ubyte] (force_align: 16);  // @executorch-delegate-alignment
 }
 
+// Device placement information for a non-constant memory buffer.
+// This is a sparse representation: only buffers that are NOT on CPU need entries.
+// Buffers not listed in ExecutionPlan.non_const_buffer_device default to CPU.
+table NonConstBufferDevice {
+  // Index into ExecutionPlan.non_const_buffer_sizes identifying which buffer
+  // this entry applies to.
+  buffer_index: uint32;
+
+  // The device type where this buffer should be allocated.
+  // Defaults to CPU for backward compatibility with existing PTE files.
+  device_type: DeviceType = CPU;
+
+  // The device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+  // A value of -1 indicates the default device. Defaults to -1 for backward
+  // compatibility.
+  device_index: byte = -1;
+}
+
 // Describes a contiguous piece of data that lives outside of the flatbuffer data,
 // typically appended afterwards in the file. The "extended header" in the file,
 // when present, points to the segment base offset.

From af472d76b6cbe487d81c4ecbd7bf5d00368d91b4 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 18 Feb 2026 11:25:45 -0800
Subject: [PATCH 02/15] [ET Device Support] TensorImpl carries device info

This diff extends `TensorImpl` to carry device information, enabling the runtime tensor to track which device its data resides on (CPU, CUDA, etc.). This is a prerequisite for parsing device info from the schema and allocating device memory.

Differential Revision: [D93635655](https://our.internmc.facebook.com/intern/diff/D93635655/)

[ghstack-poisoned]
---
 runtime/core/portable_type/tensor_impl.cpp    |   7 +-
 runtime/core/portable_type/tensor_impl.h      |  25 +++-
 .../portable_type/test/tensor_impl_test.cpp   | 112 ++++++++++++++++++
 3 files changed, 141 insertions(+), 3 deletions(-)

diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp
index ede5a3d4101..17243fca0fd 100644
--- a/runtime/core/portable_type/tensor_impl.cpp
+++ b/runtime/core/portable_type/tensor_impl.cpp
@@ -50,7 +50,9 @@ TensorImpl::TensorImpl(
     void* data,
     DimOrderType* dim_order,
     StridesType* strides,
-    TensorShapeDynamism dynamism)
+    TensorShapeDynamism dynamism,
+    DeviceType device_type,
+    DeviceIndex device_index)
     : sizes_(sizes),
       dim_order_(dim_order),
       strides_(strides),
@@ -59,7 +61,8 @@ TensorImpl::TensorImpl(
       numel_(compute_numel(sizes, dim)),
       numel_bound_(numel_),
       type_(type),
-      shape_dynamism_(dynamism) {
+      shape_dynamism_(dynamism),
+      device_(device_type, device_index) {
   ET_CHECK_MSG(
       isValid(type_), "Invalid type %" PRId8, static_cast<int8_t>(type_));
   ET_CHECK_MSG(dim_ >= 0, "Dimension must be non-negative, got %zd", dim_);
diff --git a/runtime/core/portable_type/tensor_impl.h b/runtime/core/portable_type/tensor_impl.h
index 1e2b3620ca2..767a53bffae 100644
--- a/runtime/core/portable_type/tensor_impl.h
+++ b/runtime/core/portable_type/tensor_impl.h
@@ -10,6 +10,7 @@
 
 #include <executorch/runtime/core/array_ref.h>
 #include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/portable_type/device.h>
 #include <executorch/runtime/core/portable_type/scalar_type.h>
 #include <executorch/runtime/core/tensor_shape_dynamism.h>
 
@@ -99,6 +100,8 @@ class TensorImpl {
    * @param strides Strides of the tensor at each dimension. Must contain `dim`
    *     entries.
    * @param dynamism The mutability of the shape of the tensor.
+   * @param device_type The type of device where tensor data resides.
+   * @param device_index The device index for multi-device scenarios.
    */
   TensorImpl(
       ScalarType type,
@@ -107,7 +110,9 @@ class TensorImpl {
       void* data = nullptr,
       DimOrderType* dim_order = nullptr,
       StridesType* strides = nullptr,
-      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC);
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC,
+      DeviceType device_type = DeviceType::CPU,
+      DeviceIndex device_index = -1);
 
   /**
    * Returns the size of the tensor in bytes.
@@ -176,6 +181,21 @@ class TensorImpl {
     return shape_dynamism_;
   }
 
+  /// Returns the device where tensor data resides.
+  Device device() const {
+    return device_;
+  }
+
+  /// Returns the type of device where tensor data resides.
+  DeviceType device_type() const {
+    return device_.type();
+  }
+
+  /// Returns the device index, or -1 if default/unspecified.
+  DeviceIndex device_index() const {
+    return device_.index();
+  }
+
   /// Returns a pointer of type T to the constant underlying data blob.
   template <typename T>
   inline const T* data() const {
@@ -261,6 +281,9 @@ class TensorImpl {
 
   /// Specifies the mutability of the shape of the tensor.
   const TensorShapeDynamism shape_dynamism_;
+
+  /// Device where tensor data resides (CPU, CUDA, etc.)
+  Device device_;
 };
 
 /**
diff --git a/runtime/core/portable_type/test/tensor_impl_test.cpp b/runtime/core/portable_type/test/tensor_impl_test.cpp
index 0b8ae05f4da..f51ac5374dd 100644
--- a/runtime/core/portable_type/test/tensor_impl_test.cpp
+++ b/runtime/core/portable_type/test/tensor_impl_test.cpp
@@ -21,6 +21,9 @@ using namespace ::testing;
 using executorch::runtime::ArrayRef;
 using executorch::runtime::Error;
 using executorch::runtime::TensorShapeDynamism;
+using executorch::runtime::etensor::Device;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
 using executorch::runtime::etensor::ScalarType;
 using executorch::runtime::etensor::TensorImpl;
 using SizesType = TensorImpl::SizesType;
@@ -449,3 +452,112 @@ TEST_F(TensorImplTest, TestResizingTensorToZeroAndBack) {
   EXPECT_GT(t.numel(), 0);
   EXPECT_EQ(t.data(), data);
 }
+
+// ============== Device Tests ==============
+
+TEST_F(TensorImplTest, TestDefaultDeviceIsCPU) {
+  // TensorImpl constructed without device parameters should default to CPU
+  SizesType sizes[2] = {3, 2};
+  float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  TensorImpl t(ScalarType::Float, 2, sizes, data);
+
+  EXPECT_EQ(t.device_type(), DeviceType::CPU);
+  EXPECT_EQ(t.device_index(), -1);
+  EXPECT_EQ(t.device(), Device(DeviceType::CPU, -1));
+}
+
+TEST_F(TensorImplTest, TestExplicitCPUDevice) {
+  // TensorImpl constructed with explicit CPU device
+  SizesType sizes[2] = {3, 2};
+  DimOrderType dim_order[2] = {0, 1};
+  StridesType strides[2] = {2, 1};
+  float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  TensorImpl t(
+      ScalarType::Float,
+      2,
+      sizes,
+      data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+
+  EXPECT_EQ(t.device_type(), DeviceType::CPU);
+  EXPECT_EQ(t.device_index(), 0);
+  EXPECT_EQ(t.device(), Device(DeviceType::CPU, 0));
+}
+
+TEST_F(TensorImplTest, TestCUDADevice) {
+  // TensorImpl constructed with CUDA device
+  SizesType sizes[2] = {3, 2};
+  DimOrderType dim_order[2] = {0, 1};
+  StridesType strides[2] = {2, 1};
+  float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  TensorImpl t(
+      ScalarType::Float,
+      2,
+      sizes,
+      data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+
+  EXPECT_EQ(t.device_type(), DeviceType::CUDA);
+  EXPECT_EQ(t.device_index(), 0);
+  EXPECT_EQ(t.device(), Device(DeviceType::CUDA, 0));
+}
+
+TEST_F(TensorImplTest, TestCUDADeviceMultiGPU) {
+  // TensorImpl with CUDA device index 1 (second GPU)
+  SizesType sizes[2] = {3, 2};
+  DimOrderType dim_order[2] = {0, 1};
+  StridesType strides[2] = {2, 1};
+  float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  TensorImpl t(
+      ScalarType::Float,
+      2,
+      sizes,
+      data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      1);
+
+  EXPECT_EQ(t.device_type(), DeviceType::CUDA);
+  EXPECT_EQ(t.device_index(), 1);
+  EXPECT_EQ(t.device(), Device(DeviceType::CUDA, 1));
+}
+
+TEST_F(TensorImplTest, TestDeviceWithDynamicTensor) {
+  // Device info should work correctly with dynamic tensors
+  SizesType sizes[2] = {3, 2};
+  DimOrderType dim_order[2] = {0, 1};
+  StridesType strides[2] = {2, 1};
+  float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  TensorImpl t(
+      ScalarType::Float,
+      2,
+      sizes,
+      data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::DYNAMIC_BOUND,
+      DeviceType::CUDA,
+      0);
+
+  EXPECT_EQ(t.device_type(), DeviceType::CUDA);
+  EXPECT_EQ(t.device_index(), 0);
+
+  // Resize should not affect device
+  SizesType new_sizes[2] = {2, 2};
+  Error err = resize_tensor_impl(&t, {new_sizes, 2});
+  EXPECT_EQ(err, Error::Ok);
+
+  // Device should remain unchanged after resize
+  EXPECT_EQ(t.device_type(), DeviceType::CUDA);
+  EXPECT_EQ(t.device_index(), 0);
+}

From 2b669855d081d0f16446774c78553f9326e0bfa3 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 18 Feb 2026 11:25:51 -0800
Subject: [PATCH 03/15] [ET Device Support] DeviceAllocator interface and
 DeviceAllocatorRegistry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This diff introduces the `DeviceAllocator` abstract interface and `DeviceAllocatorRegistry` for device-specific memory allocation. This is a foundational abstraction that enables the runtime to dispatch memory operations to the appropriate device backend other than CPU (CUDA, etc.).

**DeviceAllocator interface provides:**
- `init_buffer()` - Initialize memory buffer pools for memory-planned tensors
- `get_offset_address()` - Get pointer to offset within pre-allocated buffer
- `allocate()` / `deallocate()` - Dynamic device memory allocation
- `copy_host_to_device()` / `copy_device_to_host()` - Data transfer between host and device
- `device_type()` - Returns the device type this allocator handles

**DeviceAllocatorRegistry provides:**
- Singleton registry mapping DeviceType → DeviceAllocator
- `register_allocator()` / `get_allocator()` methods
- Fixed-size array indexed by device type (no dynamic allocation, embedded-friendly)

**Design notes:**
- Registry stores raw pointers (non-owning) - allocators are expected to be singletons with static lifetime
- Follows ExecuTorch's embedded-first philosophy (no std::unique_ptr, no heap allocation in registry)
- Convenience free functions `register_device_allocator()` and `get_device_allocator()` for ease of use

Differential Revision: [D93635656](https://our.internmc.facebook.com/intern/diff/D93635656/)

[ghstack-poisoned]
---
 runtime/core/device_allocator.cpp           |  58 +++++
 runtime/core/device_allocator.h             | 185 +++++++++++++
 runtime/core/test/device_allocator_test.cpp | 271 ++++++++++++++++++++
 3 files changed, 514 insertions(+)
 create mode 100644 runtime/core/device_allocator.cpp
 create mode 100644 runtime/core/device_allocator.h
 create mode 100644 runtime/core/test/device_allocator_test.cpp

diff --git a/runtime/core/device_allocator.cpp b/runtime/core/device_allocator.cpp
new file mode 100644
index 00000000000..6046445d3be
--- /dev/null
+++ b/runtime/core/device_allocator.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_allocator.h>
+
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch {
+namespace runtime {
+
+DeviceAllocatorRegistry& DeviceAllocatorRegistry::instance() {
+  static DeviceAllocatorRegistry registry;
+  return registry;
+}
+
+void DeviceAllocatorRegistry::register_allocator(
+    etensor::DeviceType type,
+    DeviceAllocator* alloc) {
+  auto index = static_cast<size_t>(type);
+  ET_CHECK_MSG(
+      index < etensor::kNumDeviceTypes,
+      "Invalid device type: %d",
+      static_cast<int>(type));
+  ET_CHECK_MSG(
+      allocators_[index] == nullptr,
+      "Allocator already registered for device type: %d",
+      static_cast<int>(type));
+  allocators_[index] = alloc;
+}
+
+DeviceAllocator* DeviceAllocatorRegistry::get_allocator(
+    etensor::DeviceType type) {
+  auto index = static_cast<size_t>(type);
+  if (index >= etensor::kNumDeviceTypes) {
+    return nullptr;
+  }
+  return allocators_[index];
+}
+
+// Convenience free functions
+
+void register_device_allocator(
+    etensor::DeviceType type,
+    DeviceAllocator* alloc) {
+  DeviceAllocatorRegistry::instance().register_allocator(type, alloc);
+}
+
+DeviceAllocator* get_device_allocator(etensor::DeviceType type) {
+  return DeviceAllocatorRegistry::instance().get_allocator(type);
+}
+
+} // namespace runtime
+} // namespace executorch
diff --git a/runtime/core/device_allocator.h b/runtime/core/device_allocator.h
new file mode 100644
index 00000000000..52f9710902d
--- /dev/null
+++ b/runtime/core/device_allocator.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/portable_type/device.h>
+#include <executorch/runtime/core/result.h>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Abstract interface for device-specific memory allocation.
+ *
+ * Each device type (CUDA, etc.) provides a concrete implementation
+ * that handles memory allocation on that device. Implementations are
+ * expected to be singletons with static lifetime, registered via
+ * DeviceAllocatorRegistry.
+
+ */
+class DeviceAllocator {
+ public:
+  virtual ~DeviceAllocator() = default;
+
+  /**
+   * Initialize a memory buffer pool for memory-planned tensors.
+   *
+   * @param memory_id The ID of the memory buffer (index into
+   *     ExecutionPlan.non_const_buffer_sizes).
+   * @param size The size in bytes to allocate for this buffer.
+   * @param index The device index (e.g., GPU 0 vs GPU 1).
+   * @return Error::Ok on success, or an appropriate error code on failure.
+   */
+  virtual Error
+  init_buffer(uint32_t memory_id, size_t size, etensor::DeviceIndex index) = 0;
+
+  /**
+   * Get a pointer to a specific offset within a pre-allocated buffer pool.
+   *
+   * @param memory_id The ID of the memory buffer.
+   * @param offset_bytes Offset in bytes from the start of the buffer.
+   * @param size_bytes Size of the requested region in bytes.
+   * @param index The device index.
+   * @return A Result containing the device pointer on success, or an error.
+   */
+  virtual Result<void*> get_offset_address(
+      uint32_t memory_id,
+      size_t offset_bytes,
+      size_t size_bytes,
+      etensor::DeviceIndex index) = 0;
+
+  /**
+   * Allocate device memory.
+   *
+   * @param nbytes Number of bytes to allocate.
+   * @param index The device index.
+   * @return A Result containing the device pointer on success, or an error.
+   */
+  virtual Result<void*> allocate(size_t nbytes, etensor::DeviceIndex index) = 0;
+
+  /**
+   * Deallocate device memory previously allocated via allocate().
+   *
+   * @param ptr Pointer to the memory to deallocate.
+   * @param index The device index.
+   */
+  virtual void deallocate(void* ptr, etensor::DeviceIndex index) = 0;
+
+  /**
+   * Copy data from host memory to device memory.
+   *
+   * @param dst Destination pointer (device memory).
+   * @param src Source pointer (host memory).
+   * @param nbytes Number of bytes to copy.
+   * @param index The device index.
+   * @return Error::Ok on success, or an appropriate error code on failure.
+   */
+  virtual Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      etensor::DeviceIndex index) = 0;
+
+  /**
+   * Copy data from device memory to host memory.
+   *
+   * @param dst Destination pointer (host memory).
+   * @param src Source pointer (device memory).
+   * @param nbytes Number of bytes to copy.
+   * @param index The device index.
+   * @return Error::Ok on success, or an appropriate error code on failure.
+   */
+  virtual Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      etensor::DeviceIndex index) = 0;
+
+  /**
+   * Returns the device type this allocator handles.
+   */
+  virtual etensor::DeviceType device_type() const = 0;
+};
+
+/**
+ * Registry for device allocators.
+ *
+ * Provides a global mapping from DeviceType to DeviceAllocator instances.
+ * Device allocators register themselves at static initialization time,
+ * and the runtime queries the registry to find the appropriate allocator
+ * for a given device type.
+ */
+class DeviceAllocatorRegistry {
+ public:
+  /**
+   * Returns the singleton instance of the registry.
+   */
+  static DeviceAllocatorRegistry& instance();
+
+  /**
+   * Register an allocator for a specific device type.
+   *
+   * @param type The device type this allocator handles.
+   * @param alloc Pointer to the allocator (must have static lifetime).
+   */
+  void register_allocator(etensor::DeviceType type, DeviceAllocator* alloc);
+
+  /**
+   * Get the allocator for a specific device type.
+   *
+   * @param type The device type.
+   * @return Pointer to the allocator, or nullptr if not registered.
+   */
+  DeviceAllocator* get_allocator(etensor::DeviceType type);
+
+ private:
+  DeviceAllocatorRegistry() = default;
+
+  // Fixed-size array indexed by device type. This avoids dynamic allocation
+  // and is suitable for embedded environments.
+  DeviceAllocator* allocators_[etensor::kNumDeviceTypes] = {};
+};
+
+// Convenience free functions
+
+/**
+ * Register a device allocator for a specific device type.
+ *
+ * @param type The device type this allocator handles.
+ * @param alloc Pointer to the allocator (must have static lifetime).
+ */
+void register_device_allocator(
+    etensor::DeviceType type,
+    DeviceAllocator* alloc);
+
+/**
+ * Get the device allocator for a specific device type.
+ *
+ * @param type The device type.
+ * @return Pointer to the allocator, or nullptr if not registered.
+ */
+DeviceAllocator* get_device_allocator(etensor::DeviceType type);
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::DeviceAllocator;
+using ::executorch::runtime::DeviceAllocatorRegistry;
+using ::executorch::runtime::get_device_allocator;
+using ::executorch::runtime::register_device_allocator;
+} // namespace executor
+} // namespace torch
diff --git a/runtime/core/test/device_allocator_test.cpp b/runtime/core/test/device_allocator_test.cpp
new file mode 100644
index 00000000000..3bf0f5ad583
--- /dev/null
+++ b/runtime/core/test/device_allocator_test.cpp
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_allocator.h>
+
+#include <gtest/gtest.h>
+
+#include <executorch/runtime/platform/runtime.h>
+
+using namespace ::testing;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceAllocatorRegistry;
+using executorch::runtime::Error;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+using executorch::runtime::etensor::kNumDeviceTypes;
+
+/**
+ * A mock DeviceAllocator implementation for testing purposes.
+ * Tracks calls to verify the registry dispatches correctly.
+ */
+class MockDeviceAllocator : public DeviceAllocator {
+ public:
+  explicit MockDeviceAllocator(DeviceType type) : type_(type) {}
+
+  Error init_buffer(uint32_t memory_id, size_t size, DeviceIndex index)
+      override {
+    last_init_buffer_memory_id_ = memory_id;
+    last_init_buffer_size_ = size;
+    last_init_buffer_index_ = index;
+    init_buffer_call_count_++;
+    return Error::Ok;
+  }
+
+  Result<void*> get_offset_address(
+      uint32_t memory_id,
+      size_t offset_bytes,
+      size_t size_bytes,
+      DeviceIndex index) override {
+    last_get_offset_memory_id_ = memory_id;
+    last_get_offset_offset_ = offset_bytes;
+    last_get_offset_size_ = size_bytes;
+    last_get_offset_index_ = index;
+    get_offset_address_call_count_++;
+    return &dummy_buffer_;
+  }
+
+  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+    last_allocate_size_ = nbytes;
+    last_allocate_index_ = index;
+    allocate_call_count_++;
+    return &dummy_buffer_;
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {
+    last_deallocate_ptr_ = ptr;
+    last_deallocate_index_ = index;
+    deallocate_call_count_++;
+  }
+
+  Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    last_h2d_dst_ = dst;
+    last_h2d_src_ = src;
+    last_h2d_size_ = nbytes;
+    last_h2d_index_ = index;
+    copy_h2d_call_count_++;
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    last_d2h_dst_ = dst;
+    last_d2h_src_ = src;
+    last_d2h_size_ = nbytes;
+    last_d2h_index_ = index;
+    copy_d2h_call_count_++;
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return type_;
+  }
+
+  // Tracking variables for verification
+  uint32_t last_init_buffer_memory_id_ = 0;
+  size_t last_init_buffer_size_ = 0;
+  DeviceIndex last_init_buffer_index_ = -1;
+  int init_buffer_call_count_ = 0;
+
+  uint32_t last_get_offset_memory_id_ = 0;
+  size_t last_get_offset_offset_ = 0;
+  size_t last_get_offset_size_ = 0;
+  DeviceIndex last_get_offset_index_ = -1;
+  int get_offset_address_call_count_ = 0;
+
+  size_t last_allocate_size_ = 0;
+  DeviceIndex last_allocate_index_ = -1;
+  int allocate_call_count_ = 0;
+
+  void* last_deallocate_ptr_ = nullptr;
+  DeviceIndex last_deallocate_index_ = -1;
+  int deallocate_call_count_ = 0;
+
+  void* last_h2d_dst_ = nullptr;
+  const void* last_h2d_src_ = nullptr;
+  size_t last_h2d_size_ = 0;
+  DeviceIndex last_h2d_index_ = -1;
+  int copy_h2d_call_count_ = 0;
+
+  void* last_d2h_dst_ = nullptr;
+  const void* last_d2h_src_ = nullptr;
+  size_t last_d2h_size_ = 0;
+  DeviceIndex last_d2h_index_ = -1;
+  int copy_d2h_call_count_ = 0;
+
+ private:
+  DeviceType type_;
+  uint8_t dummy_buffer_[64] = {};
+};
+
+class DeviceAllocatorTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    executorch::runtime::runtime_init();
+  }
+};
+
+TEST_F(DeviceAllocatorTest, MockAllocatorDeviceType) {
+  MockDeviceAllocator cpu_allocator(DeviceType::CPU);
+  MockDeviceAllocator cuda_allocator(DeviceType::CUDA);
+
+  EXPECT_EQ(cpu_allocator.device_type(), DeviceType::CPU);
+  EXPECT_EQ(cuda_allocator.device_type(), DeviceType::CUDA);
+}
+
+TEST_F(DeviceAllocatorTest, MockAllocatorInitBuffer) {
+  MockDeviceAllocator allocator(DeviceType::CUDA);
+
+  Error err =
+      allocator.init_buffer(/*memory_id=*/1, /*size=*/1024, /*index=*/0);
+
+  EXPECT_EQ(err, Error::Ok);
+  EXPECT_EQ(allocator.init_buffer_call_count_, 1);
+  EXPECT_EQ(allocator.last_init_buffer_memory_id_, 1);
+  EXPECT_EQ(allocator.last_init_buffer_size_, 1024);
+  EXPECT_EQ(allocator.last_init_buffer_index_, 0);
+}
+
+TEST_F(DeviceAllocatorTest, MockAllocatorGetOffsetAddress) {
+  MockDeviceAllocator allocator(DeviceType::CUDA);
+
+  Result<void*> result = allocator.get_offset_address(
+      /*memory_id=*/2, /*offset_bytes=*/128, /*size_bytes=*/256, /*index=*/1);
+
+  EXPECT_TRUE(result.ok());
+  EXPECT_NE(result.get(), nullptr);
+  EXPECT_EQ(allocator.get_offset_address_call_count_, 1);
+  EXPECT_EQ(allocator.last_get_offset_memory_id_, 2);
+  EXPECT_EQ(allocator.last_get_offset_offset_, 128);
+  EXPECT_EQ(allocator.last_get_offset_size_, 256);
+  EXPECT_EQ(allocator.last_get_offset_index_, 1);
+}
+
+TEST_F(DeviceAllocatorTest, MockAllocatorAllocateAndDeallocate) {
+  MockDeviceAllocator allocator(DeviceType::CUDA);
+
+  Result<void*> result = allocator.allocate(/*nbytes=*/512, /*index=*/0);
+  EXPECT_TRUE(result.ok());
+  void* ptr = result.get();
+  EXPECT_NE(ptr, nullptr);
+  EXPECT_EQ(allocator.allocate_call_count_, 1);
+  EXPECT_EQ(allocator.last_allocate_size_, 512);
+  EXPECT_EQ(allocator.last_allocate_index_, 0);
+
+  allocator.deallocate(ptr, /*index=*/0);
+  EXPECT_EQ(allocator.deallocate_call_count_, 1);
+  EXPECT_EQ(allocator.last_deallocate_ptr_, ptr);
+  EXPECT_EQ(allocator.last_deallocate_index_, 0);
+}
+
+TEST_F(DeviceAllocatorTest, MockAllocatorCopyHostToDevice) {
+  MockDeviceAllocator allocator(DeviceType::CUDA);
+  uint8_t host_data[64] = {1, 2, 3, 4};
+  uint8_t device_data[64] = {};
+
+  Error err = allocator.copy_host_to_device(
+      device_data, host_data, sizeof(host_data), /*index=*/0);
+
+  EXPECT_EQ(err, Error::Ok);
+  EXPECT_EQ(allocator.copy_h2d_call_count_, 1);
+  EXPECT_EQ(allocator.last_h2d_dst_, device_data);
+  EXPECT_EQ(allocator.last_h2d_src_, host_data);
+  EXPECT_EQ(allocator.last_h2d_size_, sizeof(host_data));
+  EXPECT_EQ(allocator.last_h2d_index_, 0);
+}
+
+TEST_F(DeviceAllocatorTest, MockAllocatorCopyDeviceToHost) {
+  MockDeviceAllocator allocator(DeviceType::CUDA);
+  uint8_t device_data[64] = {5, 6, 7, 8};
+  uint8_t host_data[64] = {};
+
+  Error err = allocator.copy_device_to_host(
+      host_data, device_data, sizeof(device_data), /*index=*/1);
+
+  EXPECT_EQ(err, Error::Ok);
+  EXPECT_EQ(allocator.copy_d2h_call_count_, 1);
+  EXPECT_EQ(allocator.last_d2h_dst_, host_data);
+  EXPECT_EQ(allocator.last_d2h_src_, device_data);
+  EXPECT_EQ(allocator.last_d2h_size_, sizeof(device_data));
+  EXPECT_EQ(allocator.last_d2h_index_, 1);
+}
+
+TEST_F(DeviceAllocatorTest, RegistryGetUnregisteredReturnsNullptr) {
+  // Getting an allocator for an unregistered device type should return nullptr
+  // Note that there shouldn't be any regsitered allocators for CPU backend.
+  DeviceAllocator* alloc = get_device_allocator(DeviceType::CPU);
+  (void)alloc;
+}
+
+TEST_F(DeviceAllocatorTest, RegistrySingletonInstance) {
+  // Verify that instance() returns the same object each time
+  DeviceAllocatorRegistry& instance1 = DeviceAllocatorRegistry::instance();
+  DeviceAllocatorRegistry& instance2 = DeviceAllocatorRegistry::instance();
+
+  EXPECT_EQ(&instance1, &instance2);
+}
+
+TEST_F(DeviceAllocatorTest, RegisterAndGetDeviceAllocator) {
+  // Register a mock allocator for CUDA and retrieve it via the free function.
+  MockDeviceAllocator cuda_allocator(DeviceType::CUDA);
+  register_device_allocator(DeviceType::CUDA, &cuda_allocator);
+
+  DeviceAllocator* retrieved = get_device_allocator(DeviceType::CUDA);
+  EXPECT_EQ(retrieved, &cuda_allocator);
+  EXPECT_EQ(retrieved->device_type(), DeviceType::CUDA);
+
+  // Registering the same device type twice should abort.
+  MockDeviceAllocator another_allocator(DeviceType::CUDA);
+  EXPECT_DEATH(
+      register_device_allocator(DeviceType::CUDA, &another_allocator),
+      "Allocator already registered");
+}
+
+TEST_F(DeviceAllocatorTest, RegisterAndDispatchThroughRegistry) {
+  // Verify that after registration, calls dispatch to the registered allocator.
+  DeviceAllocator* alloc = get_device_allocator(DeviceType::CUDA);
+  ASSERT_NE(alloc, nullptr);
+
+  // Use the allocator through the registry and verify it reaches the mock.
+  Error err = alloc->init_buffer(/*memory_id=*/5, /*size=*/2048, /*index=*/0);
+  EXPECT_EQ(err, Error::Ok);
+
+  Result<void*> result = alloc->allocate(/*nbytes=*/256, /*index=*/1);
+  EXPECT_TRUE(result.ok());
+  EXPECT_NE(result.get(), nullptr);
+}

From d630230274e98f87ac6b089d964f2a2a04cb273a Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 18 Feb 2026 11:28:57 -0800
Subject: [PATCH 04/15] Update base for Update on "[ET Device Support]
 DeviceAllocator interface and DeviceAllocatorRegistry"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This diff introduces the `DeviceAllocator` abstract interface and `DeviceAllocatorRegistry` for device-specific memory allocation. This is a foundational abstraction that enables the runtime to dispatch memory operations to the appropriate device backend other than CPU (CUDA, etc.).

**DeviceAllocator interface provides:**
- `init_buffer()` - Initialize memory buffer pools for memory-planned tensors
- `get_offset_address()` - Get pointer to offset within pre-allocated buffer
- `allocate()` / `deallocate()` - Dynamic device memory allocation
- `copy_host_to_device()` / `copy_device_to_host()` - Data transfer between host and device
- `device_type()` - Returns the device type this allocator handles

**DeviceAllocatorRegistry provides:**
- Singleton registry mapping DeviceType → DeviceAllocator
- `register_allocator()` / `get_allocator()` methods
- Fixed-size array indexed by device type (no dynamic allocation, embedded-friendly)

**Design notes:**
- Registry stores raw pointers (non-owning) - allocators are expected to be singletons with static lifetime
- Follows ExecuTorch's embedded-first philosophy (no std::unique_ptr, no heap allocation in registry)
- Convenience free functions `register_device_allocator()` and `get_device_allocator()` for ease of use

Differential Revision: [D93635656](https://our.internmc.facebook.com/intern/diff/D93635656/)

[ghstack-poisoned]
---
 schema/program.fbs | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/schema/program.fbs b/schema/program.fbs
index 07901923410..5b81afa905d 100644
--- a/schema/program.fbs
+++ b/schema/program.fbs
@@ -62,22 +62,9 @@ enum TensorDataLocation : byte {
 }
 
 // Device type enum indicating where a tensor resides or should be allocated.
-// Follows PyTorch DeviceType convention for compatibility.
 enum DeviceType : byte {
   CPU = 0,
   CUDA = 1,
-  // Reserve slots for future device types following PyTorch convention:
-  // MKLDNN = 2,
-  // OPENGL = 3,
-  // OPENCL = 4,
-  // IDEEP = 5,
-  // HIP = 6,
-  // FPGA = 7,
-  // MAIA = 8,
-  // XLA = 9,
-  // MPS = 10,
-  // XPU = 11,
-  // PrivateUse1 = 12,
 }
 
 // Table to put additional information about tensors in that is not applicable

From 228bdeb7c9511024c769f4fe08b44153c3af0201 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Thu, 19 Mar 2026 11:15:20 -0700
Subject: [PATCH 05/15] [ET Device Support] Add NonConstBufferDevice schema for
 per-buffer device mapping

Adds the NonConstBufferDevice table to the FlatBuffer schema (program.fbs) and the
corresponding Python dataclass to schema.py. This enables mapping each non-constant
planned memory buffer to a specific device type (CPU, CUDA, etc.).

The field is optional and absent for CPU-only programs, ensuring zero binary size regression.

Differential Revision: [D97335597](https://our.internmc.facebook.com/intern/diff/D97335597/)

[ghstack-poisoned]
---
 .../executorch_flatbuffer/ExecutionPlan.py    |  62 ++++++++-
 .../NonConstBufferDevice.py                   | 130 ++++++++++++++++++
 .../executorch_flatbuffer/__init__.py         |   2 +
 exir/_serialize/test/test_program.py          |  28 ++++
 exir/schema.py                                |  15 ++
 schema/program.fbs                            |  21 +++
 6 files changed, 256 insertions(+), 2 deletions(-)
 create mode 100644 exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py

diff --git a/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py b/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py
index b8ed496b8a8..340a0ad69aa 100644
--- a/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py
+++ b/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py
@@ -10,6 +10,7 @@
 from executorch.exir._serialize.generated.executorch_flatbuffer.Chain import Chain
 from executorch.exir._serialize.generated.executorch_flatbuffer.ContainerMetadata import ContainerMetadata
 from executorch.exir._serialize.generated.executorch_flatbuffer.EValue import EValue
+from executorch.exir._serialize.generated.executorch_flatbuffer.NonConstBufferDevice import NonConstBufferDevice
 from executorch.exir._serialize.generated.executorch_flatbuffer.Operator import Operator
 from typing import Optional
 np = import_numpy()
@@ -230,8 +231,32 @@ def NonConstBufferSizesIsNone(self) -> bool:
         o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20))
         return o == 0
 
+    # ExecutionPlan
+    def NonConstBufferDevice(self, j: int) -> Optional[NonConstBufferDevice]:
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(22))
+        if o != 0:
+            x = self._tab.Vector(o)
+            x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+            x = self._tab.Indirect(x)
+            obj = NonConstBufferDevice()
+            obj.Init(self._tab.Bytes, x)
+            return obj
+        return None
+
+    # ExecutionPlan
+    def NonConstBufferDeviceLength(self) -> int:
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(22))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # ExecutionPlan
+    def NonConstBufferDeviceIsNone(self) -> bool:
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(22))
+        return o == 0
+
 def ExecutionPlanStart(builder: flatbuffers.Builder):
-    builder.StartObject(9)
+    builder.StartObject(10)
 
 def Start(builder: flatbuffers.Builder):
     ExecutionPlanStart(builder)
@@ -332,6 +357,18 @@ def ExecutionPlanStartNonConstBufferSizesVector(builder, numElems: int) -> int:
 def StartNonConstBufferSizesVector(builder, numElems: int) -> int:
     return ExecutionPlanStartNonConstBufferSizesVector(builder, numElems)
 
+def ExecutionPlanAddNonConstBufferDevice(builder: flatbuffers.Builder, nonConstBufferDevice: int):
+    builder.PrependUOffsetTRelativeSlot(9, flatbuffers.number_types.UOffsetTFlags.py_type(nonConstBufferDevice), 0)
+
+def AddNonConstBufferDevice(builder: flatbuffers.Builder, nonConstBufferDevice: int):
+    ExecutionPlanAddNonConstBufferDevice(builder, nonConstBufferDevice)
+
+def ExecutionPlanStartNonConstBufferDeviceVector(builder, numElems: int) -> int:
+    return builder.StartVector(4, numElems, 4)
+
+def StartNonConstBufferDeviceVector(builder, numElems: int) -> int:
+    return ExecutionPlanStartNonConstBufferDeviceVector(builder, numElems)
+
 def ExecutionPlanEnd(builder: flatbuffers.Builder) -> int:
     return builder.EndObject()
 
@@ -342,6 +379,7 @@ def End(builder: flatbuffers.Builder) -> int:
 from executorch.exir._serialize.generated.executorch_flatbuffer import Chain
 from executorch.exir._serialize.generated.executorch_flatbuffer import ContainerMetadata
 from executorch.exir._serialize.generated.executorch_flatbuffer import EValue
+from executorch.exir._serialize.generated.executorch_flatbuffer import NonConstBufferDevice
 from executorch.exir._serialize.generated.executorch_flatbuffer import Operator
 try:
     from typing import List, Optional
@@ -361,6 +399,7 @@ def __init__(self):
         self.operators = None  # type: List[executorch_flatbuffer.Operator.OperatorT]
         self.delegates = None  # type: List[executorch_flatbuffer.BackendDelegate.BackendDelegateT]
         self.nonConstBufferSizes = None  # type: List[int]
+        self.nonConstBufferDevice = None  # type: List[executorch_flatbuffer.NonConstBufferDevice.NonConstBufferDeviceT]
 
     @classmethod
     def InitFromBuf(cls, buf, pos):
@@ -389,7 +428,8 @@ def __eq__(self, other):
             self.chains == other.chains and \
             self.operators == other.operators and \
             self.delegates == other.delegates and \
-            self.nonConstBufferSizes == other.nonConstBufferSizes
+            self.nonConstBufferSizes == other.nonConstBufferSizes and \
+            self.nonConstBufferDevice == other.nonConstBufferDevice
 
     # ExecutionPlanT
     def _UnPack(self, executionPlan):
@@ -451,6 +491,14 @@ def _UnPack(self, executionPlan):
                     self.nonConstBufferSizes.append(executionPlan.NonConstBufferSizes(i))
             else:
                 self.nonConstBufferSizes = executionPlan.NonConstBufferSizesAsNumpy()
+        if not executionPlan.NonConstBufferDeviceIsNone():
+            self.nonConstBufferDevice = []
+            for i in range(executionPlan.NonConstBufferDeviceLength()):
+                if executionPlan.NonConstBufferDevice(i) is None:
+                    self.nonConstBufferDevice.append(None)
+                else:
+                    nonConstBufferDevice_ = executorch_flatbuffer.NonConstBufferDevice.NonConstBufferDeviceT.InitFromObj(executionPlan.NonConstBufferDevice(i))
+                    self.nonConstBufferDevice.append(nonConstBufferDevice_)
 
     # ExecutionPlanT
     def Pack(self, builder):
@@ -514,6 +562,14 @@ def Pack(self, builder):
                 for i in reversed(range(len(self.nonConstBufferSizes))):
                     builder.PrependInt64(self.nonConstBufferSizes[i])
                 nonConstBufferSizes = builder.EndVector()
+        if self.nonConstBufferDevice is not None:
+            nonConstBufferDevicelist = []
+            for i in range(len(self.nonConstBufferDevice)):
+                nonConstBufferDevicelist.append(self.nonConstBufferDevice[i].Pack(builder))
+            ExecutionPlanStartNonConstBufferDeviceVector(builder, len(self.nonConstBufferDevice))
+            for i in reversed(range(len(self.nonConstBufferDevice))):
+                builder.PrependUOffsetTRelative(nonConstBufferDevicelist[i])
+            nonConstBufferDevice = builder.EndVector()
         ExecutionPlanStart(builder)
         if self.name is not None:
             ExecutionPlanAddName(builder, name)
@@ -533,5 +589,7 @@ def Pack(self, builder):
             ExecutionPlanAddDelegates(builder, delegates)
         if self.nonConstBufferSizes is not None:
             ExecutionPlanAddNonConstBufferSizes(builder, nonConstBufferSizes)
+        if self.nonConstBufferDevice is not None:
+            ExecutionPlanAddNonConstBufferDevice(builder, nonConstBufferDevice)
         executionPlan = ExecutionPlanEnd(builder)
         return executionPlan
diff --git a/exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py b/exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py
new file mode 100644
index 00000000000..d82df37d29b
--- /dev/null
+++ b/exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py
@@ -0,0 +1,130 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: executorch_flatbuffer
+
+import flatbuffers
+from flatbuffers.compat import import_numpy
+from typing import Any
+np = import_numpy()
+
+class NonConstBufferDevice(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAs(cls, buf, offset: int = 0):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = NonConstBufferDevice()
+        x.Init(buf, n + offset)
+        return x
+
+    @classmethod
+    def GetRootAsNonConstBufferDevice(cls, buf, offset=0):
+        """This method is deprecated. Please switch to GetRootAs."""
+        return cls.GetRootAs(buf, offset)
+    @classmethod
+    def NonConstBufferDeviceBufferHasIdentifier(cls, buf, offset, size_prefixed=False):
+        return flatbuffers.util.BufferHasIdentifier(buf, offset, b"\x45\x54\x31\x32", size_prefixed=size_prefixed)
+
+    # NonConstBufferDevice
+    def Init(self, buf: bytes, pos: int):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # NonConstBufferDevice
+    def BufferIdx(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # NonConstBufferDevice
+    def DeviceType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # NonConstBufferDevice
+    def DeviceIndex(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def NonConstBufferDeviceStart(builder: flatbuffers.Builder):
+    builder.StartObject(3)
+
+def Start(builder: flatbuffers.Builder):
+    NonConstBufferDeviceStart(builder)
+
+def NonConstBufferDeviceAddBufferIdx(builder: flatbuffers.Builder, bufferIdx: int):
+    builder.PrependInt32Slot(0, bufferIdx, 0)
+
+def AddBufferIdx(builder: flatbuffers.Builder, bufferIdx: int):
+    NonConstBufferDeviceAddBufferIdx(builder, bufferIdx)
+
+def NonConstBufferDeviceAddDeviceType(builder: flatbuffers.Builder, deviceType: int):
+    builder.PrependInt8Slot(1, deviceType, 0)
+
+def AddDeviceType(builder: flatbuffers.Builder, deviceType: int):
+    NonConstBufferDeviceAddDeviceType(builder, deviceType)
+
+def NonConstBufferDeviceAddDeviceIndex(builder: flatbuffers.Builder, deviceIndex: int):
+    builder.PrependInt8Slot(2, deviceIndex, 0)
+
+def AddDeviceIndex(builder: flatbuffers.Builder, deviceIndex: int):
+    NonConstBufferDeviceAddDeviceIndex(builder, deviceIndex)
+
+def NonConstBufferDeviceEnd(builder: flatbuffers.Builder) -> int:
+    return builder.EndObject()
+
+def End(builder: flatbuffers.Builder) -> int:
+    return NonConstBufferDeviceEnd(builder)
+
+
+class NonConstBufferDeviceT(object):
+
+    # NonConstBufferDeviceT
+    def __init__(self):
+        self.bufferIdx = 0  # type: int
+        self.deviceType = 0  # type: int
+        self.deviceIndex = 0  # type: int
+
+    @classmethod
+    def InitFromBuf(cls, buf, pos):
+        nonConstBufferDevice = NonConstBufferDevice()
+        nonConstBufferDevice.Init(buf, pos)
+        return cls.InitFromObj(nonConstBufferDevice)
+
+    @classmethod
+    def InitFromPackedBuf(cls, buf, pos=0):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, pos)
+        return cls.InitFromBuf(buf, pos+n)
+
+    @classmethod
+    def InitFromObj(cls, nonConstBufferDevice):
+        x = NonConstBufferDeviceT()
+        x._UnPack(nonConstBufferDevice)
+        return x
+
+    def __eq__(self, other):
+        return type(self) == type(other) and \
+            self.bufferIdx == other.bufferIdx and \
+            self.deviceType == other.deviceType and \
+            self.deviceIndex == other.deviceIndex
+
+    # NonConstBufferDeviceT
+    def _UnPack(self, nonConstBufferDevice):
+        if nonConstBufferDevice is None:
+            return
+        self.bufferIdx = nonConstBufferDevice.BufferIdx()
+        self.deviceType = nonConstBufferDevice.DeviceType()
+        self.deviceIndex = nonConstBufferDevice.DeviceIndex()
+
+    # NonConstBufferDeviceT
+    def Pack(self, builder):
+        NonConstBufferDeviceStart(builder)
+        NonConstBufferDeviceAddBufferIdx(builder, self.bufferIdx)
+        NonConstBufferDeviceAddDeviceType(builder, self.deviceType)
+        NonConstBufferDeviceAddDeviceIndex(builder, self.deviceIndex)
+        nonConstBufferDevice = NonConstBufferDeviceEnd(builder)
+        return nonConstBufferDevice
diff --git a/exir/_serialize/generated/executorch_flatbuffer/__init__.py b/exir/_serialize/generated/executorch_flatbuffer/__init__.py
index df59751e724..7cc3b482376 100644
--- a/exir/_serialize/generated/executorch_flatbuffer/__init__.py
+++ b/exir/_serialize/generated/executorch_flatbuffer/__init__.py
@@ -31,6 +31,7 @@
 from . import KernelTypes
 from . import MoveCall
 from . import NamedData
+from . import NonConstBufferDevice
 from . import Null
 from . import Operator
 from . import OptionalTensorList
@@ -75,6 +76,7 @@
     "KernelTypes",
     "MoveCall",
     "NamedData",
+    "NonConstBufferDevice",
     "Null",
     "Operator",
     "OptionalTensorList",
diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py
index 46e8f020a0b..1b6aab94af3 100644
--- a/exir/_serialize/test/test_program.py
+++ b/exir/_serialize/test/test_program.py
@@ -38,7 +38,9 @@
     ContainerMetadata,
     DataLocation,
     DataSegment,
+    DeviceType,
     ExecutionPlan,
+    NonConstBufferDevice,
     Program,
     SubsegmentOffsets,
 )
@@ -477,6 +479,32 @@ def test_round_trip_large_buffer_sizes(self) -> None:
             program, deserialize_pte_binary(flatbuffer_from_py).program
         )
 
+    def test_round_trip_with_non_const_buffer_device(self) -> None:
+        """Tests that non_const_buffer_device survives round-trip
+        serialization/deserialization. This verifies the schema extension
+        for per-buffer device mapping works correctly.
+        """
+        program = get_test_program()
+        program.execution_plan[0].non_const_buffer_device = [
+            NonConstBufferDevice(buffer_idx=0, device_type=DeviceType.CPU, device_index=0),
+            NonConstBufferDevice(buffer_idx=1, device_type=DeviceType.CUDA, device_index=0),
+        ]
+        flatbuffer_from_py = bytes(serialize_pte_binary(pte_file=PTEFile(program)))
+        self.assert_programs_equal(
+            program, deserialize_pte_binary(flatbuffer_from_py).program
+        )
+
+    def test_round_trip_without_non_const_buffer_device(self) -> None:
+        """Tests backward compatibility: a program without non_const_buffer_device
+        (the default) round-trips correctly and the field remains None.
+        """
+        program = get_test_program()
+        self.assertIsNone(program.execution_plan[0].non_const_buffer_device)
+        flatbuffer_from_py = bytes(serialize_pte_binary(pte_file=PTEFile(program)))
+        deserialized = deserialize_pte_binary(flatbuffer_from_py).program
+        self.assert_programs_equal(program, deserialized)
+        self.assertIsNone(deserialized.execution_plan[0].non_const_buffer_device)
+
     def test_round_trip_no_segments_and_no_header(self) -> None:
         """Tests that a Program serialized with extract_delegate_segments=True
         when there are no segments does not contain an extended header,
diff --git a/exir/schema.py b/exir/schema.py
index 993a473dabb..add90dec45c 100644
--- a/exir/schema.py
+++ b/exir/schema.py
@@ -268,6 +268,18 @@ class Operator:
     overload: str
 
 
+@dataclass
+class NonConstBufferDevice:
+    """Maps a non-constant buffer to the device where it should be allocated."""
+
+    # Index into the non_const_buffer_sizes list.
+    buffer_idx: int = 0
+    # The device type for this buffer (CPU, CUDA, etc.).
+    device_type: DeviceType = DeviceType.CPU
+    # The device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+    device_index: int = 0
+
+
 @dataclass
 class ExecutionPlan:
     name: str
@@ -283,6 +295,9 @@ class ExecutionPlan:
     # Runtime should use the len(constant_buffer) as the ground truch of
     # constant memory buffer size, and ignore non_const_buffer_sizes[0].
     non_const_buffer_sizes: List[int]
+    # Per-buffer device mapping. Each entry maps a non-constant buffer to the
+    # device where it should be allocated. For CPU-only programs, this is empty.
+    non_const_buffer_device: Optional[List[NonConstBufferDevice]] = None
 
 
 @dataclass
diff --git a/schema/program.fbs b/schema/program.fbs
index f5872633ac8..c6e6edc790f 100644
--- a/schema/program.fbs
+++ b/schema/program.fbs
@@ -401,6 +401,27 @@ table ExecutionPlan {
   // constants memory buffer size, and ignore non_const_buffer_sizes[0].
   non_const_buffer_sizes: [int64];
 
+  // [Optional] Per-buffer device mapping, parallel to non_const_buffer_sizes.
+  // Each entry maps a non-constant buffer to the device where it should be
+  // allocated. For CPU-only programs, this field is absent and all buffers
+  // default to CPU, ensuring zero regression.
+  non_const_buffer_device: [NonConstBufferDevice];
+
+}
+
+// Maps a non-constant buffer to the device where it should be allocated.
+// When present as part of ExecutionPlan.non_const_buffer_device, each entry
+// describes the device placement for the corresponding planned memory buffer.
+// For CPU-only programs, this table is absent (all buffers default to CPU).
+table NonConstBufferDevice {
+  // Index into the non_const_buffer_sizes list.
+  buffer_idx: int;
+
+  // The device type for this buffer (CPU, CUDA, etc.).
+  device_type: DeviceType = CPU;
+
+  // The device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+  device_index: byte = 0;
 }
 
 // Constant tensor data stored directly in the flatbuffer.

From 747dbaaa4c49ea659009445ad634e0dcedecc1ee Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Fri, 20 Mar 2026 11:21:56 -0700
Subject: [PATCH 06/15] [ET Device Support] Device-aware memory planning:
 separate buffers per device type

Extends memory planning to separate device tensors from CPU tensors into distinct
memory buffers. Non-CPU TensorSpecs (e.g., CUDA) are pre-assigned device-specific
mem_ids before the greedy/naive algorithm runs, ensuring they get planned into
independent memory buffers that never share space with CPU tensors.

Differential Revision: [D97447105](https://our.internmc.facebook.com/intern/diff/D97447105/)

[ghstack-poisoned]
---
 exir/capture/_config.py             |   8 +-
 exir/memory_planning.py             | 116 ++++++++++++++-----
 exir/passes/memory_planning_pass.py |   3 +
 exir/program/_program.py            |   6 +
 exir/tests/test_memory_planning.py  | 169 ++++++++++++++++++++++++++++
 5 files changed, 273 insertions(+), 29 deletions(-)

diff --git a/exir/capture/_config.py b/exir/capture/_config.py
index 3fbc8ae7ef3..f8c3be6e7c8 100644
--- a/exir/capture/_config.py
+++ b/exir/capture/_config.py
@@ -115,5 +115,11 @@ class ExecutorchBackendConfig:
     # If set to true, we run quant fusion and constant propagation passes
     do_quant_fusion_and_const_prop: bool = False
 
-    # Experimental: If set to true, we run a pass to reinplace ops in the graph.
+    # If set to true, we run a pass to reinplace ops in the graph.
     run_reinplace_pass: bool = False
+
+    # When True, memory planning partitions specs by device and runs the
+    # algorithm independently per device, producing separate buffers for CPU
+    # vs. accelerator memory.  Default False preserves the legacy behavior
+    # where all tensors are planned into CPU memory regardless of device.
+    enable_non_cpu_memory_planning: bool = False
diff --git a/exir/memory_planning.py b/exir/memory_planning.py
index c5d3441bcde..f6e3234fce5 100644
--- a/exir/memory_planning.py
+++ b/exir/memory_planning.py
@@ -28,6 +28,7 @@
 import torch
 from executorch.exir import memory
 from executorch.exir.control_flow import while_loop as exir_while
+from executorch.exir.schema import DeviceType, NonConstBufferDevice
 from executorch.exir.delegate import executorch_call_delegate
 from executorch.exir.error import internal_assert, InternalError
 from executorch.exir.operator.convert import is_inplace_variant, is_out_variant
@@ -1211,10 +1212,19 @@ def apply_algo(
     alloc_graph_input: bool = True,
     alloc_graph_output: bool = True,
     alloc_mutable_buffers: bool = True,
+    enable_non_cpu_memory_planning: bool = False,
 ) -> list[int]:
     """
     Recursively apply algo to graph_module and its submodules for control flow.
 
+    Partitions specs by device type and device idx, and runs the memory planning
+    algorithm independently per device, then merges results into separate buffers.
+    This ensures device memory and CPU memory are never mixed.
+
+    When enable_non_cpu_memory_planning is False (default), all specs are planned
+    into a single CPU memory pool regardless of their device attribute. This
+    preserves the legacy behavior. Set to True to enable per-device partitioning.
+
     Algo implementation should handle one of two meta entries for submodules:
     1. input_mem_buffer_sizes: List of int offset bytes. Memory allocated by
        `algo` should start at the offset specified by this list;
@@ -1229,18 +1239,19 @@ def apply_algo(
     `operand` arg. The memory for operands is unused.
     """
     # Extract the nodes and their lifespans from the graph_module
-    # Difficult to just filter the list of specs returned by this due to
-    # how we flag trainable weights.
     _ = update_all_tensors_lifetime(graph_module, graph_signature)
 
-    # Filter specs based on alloc_graph_input and alloc_graph_output
-    specs = collect_specs_from_nodes(
-        graph_module.graph.nodes,
-        graph_signature,
-        do_assertion=False,
-        ignore_graph_input=not alloc_graph_input,
-        ignore_graph_output=not alloc_graph_output,
-        ignore_mutable_buffers=not alloc_mutable_buffers,
+    # Collect and materialize specs into a set so we can iterate multiple
+    # times and partition by device.
+    all_specs: set[TensorSpec] = set(
+        collect_specs_from_nodes(
+            graph_module.graph.nodes,
+            graph_signature,
+            do_assertion=False,
+            ignore_graph_input=not alloc_graph_input,
+            ignore_graph_output=not alloc_graph_output,
+            ignore_mutable_buffers=not alloc_mutable_buffers,
+        )
     )
 
     # Get temporary specs for submodules to set aside space during execution
@@ -1249,29 +1260,78 @@ def apply_algo(
         algo, graph_module, alignment, graph_signature
     )
 
-    # Update `input_mem_buffer_sizes` in graph_module. This will allow existing
-    # algos to work using `input_mem_buffer_sizes` or use
-    # `non_const_buffer_sizes` directly.
-    # pyre-ignore[16]: `torch.fx.GraphModule` has no attribute `input_mem_buffer_sizes`.
-    graph_module.input_mem_buffer_sizes = submodule_bufsizes
-
     # Get extra padding for XNNPACK if needed
     extra_padding = 0
     if _contains_xnnpack_delegate(graph_module):
         extra_padding = 64
 
-    # Pass the filtered specs to the algorithm
-    bufsizes: list[int] = algo(
-        alignment,
-        specs,
-        graph_module,
-        graph_signature,
-        extra_padding,
+    # 1. Partition specs by device
+    specs_by_device: dict[DeviceType, set[TensorSpec]] = defaultdict(set)
+    if enable_non_cpu_memory_planning:
+        for spec in all_specs:
+            specs_by_device[spec.device].add(spec)
+    else:
+        # Legacy behavior: all specs planned into CPU memory regardless of device
+        specs_by_device[DeviceType.CPU] = all_specs
+
+    # 2. Plan each device independently
+    global_bufsizes: list[int] = [0]  # index 0 reserved for constants
+    buffer_device_types: list[DeviceType] = [DeviceType.CPU]
+
+    # Process CPU first (if present), then other devices sorted by enum value
+    device_order = sorted(
+        specs_by_device.keys(),
+        key=lambda d: (d != DeviceType.CPU, d.value),
     )
 
-    # pyre-ignore[6]: Incompatible parameter type [6]
-    # In call `insert_calls_to_free`, for 2nd positional argument, expected `Set[TensorSpec]` but got `Iterable[TensorSpec]`
-    insert_calls_to_free(graph_module, specs)
+    for device_type in device_order:
+        device_specs = specs_by_device[device_type]
 
-    graph_module.meta.update({"non_const_buffer_sizes": bufsizes})
-    return bufsizes
+        # Only apply submodule pre-allocation for CPU specs; device buffers
+        # do not share memory space with CPU submodule arenas.
+        # pyre-ignore[16]: `torch.fx.GraphModule` has no attribute `input_mem_buffer_sizes`.
+        graph_module.input_mem_buffer_sizes = (
+            submodule_bufsizes if device_type == DeviceType.CPU else []
+        )
+
+        # Run algorithm independently on this device's specs
+        device_bufsizes = algo(
+            alignment, device_specs, graph_module, graph_signature, extra_padding
+        )
+
+        # Calculate base mem_id in global space
+        base_mem_id = len(global_bufsizes)
+
+        # Append buffer sizes (skip index 0 which is constants placeholder)
+        global_bufsizes.extend(device_bufsizes[1:])
+
+        # Track device type for each new buffer slot
+        for _ in device_bufsizes[1:]:
+            buffer_device_types.append(device_type)
+
+        # Remap spec mem_ids from algo-local to global.
+        # The algorithm assigns mem_id starting from 1; remap to global position.
+        for spec in device_specs:
+            if spec.mem_id is not None:
+                spec.mem_id = (spec.mem_id - 1) + base_mem_id
+
+    # Ensure backward compatibility: at least [0, 0] when no specs exist
+    if len(global_bufsizes) < 2:
+        global_bufsizes.append(0)
+        buffer_device_types.append(DeviceType.CPU)
+
+    # 3. Insert free calls and build device buffer mapping
+    insert_calls_to_free(graph_module, all_specs)
+
+    has_device_buffers = any(dt != DeviceType.CPU for dt in buffer_device_types)
+    non_const_buffer_device: Optional[list[NonConstBufferDevice]] = None
+    if has_device_buffers:
+        non_const_buffer_device = [
+            NonConstBufferDevice(buffer_idx=i, device_type=dt, device_index=0)
+            for i, dt in enumerate(buffer_device_types)
+        ]
+
+    graph_module.meta["non_const_buffer_sizes"] = global_bufsizes
+    if non_const_buffer_device is not None:
+        graph_module.meta["non_const_buffer_device"] = non_const_buffer_device
+    return global_bufsizes
diff --git a/exir/passes/memory_planning_pass.py b/exir/passes/memory_planning_pass.py
index f3970f13b56..32c343a4607 100644
--- a/exir/passes/memory_planning_pass.py
+++ b/exir/passes/memory_planning_pass.py
@@ -153,6 +153,7 @@ def __init__(
         alloc_mutable_buffers: bool = True,
         share_mutable_buffers: bool = False,
         alignment: int = ALIGNMENT,
+        enable_non_cpu_memory_planning: bool = False,
     ) -> None:
         r"""
         alloc_graph_input/alloc_graph_output will have 4 different combinations
@@ -173,6 +174,7 @@ def __init__(
         self.alloc_mutable_buffers = alloc_mutable_buffers
         self.share_mutable_buffers = share_mutable_buffers
         self.alignment = alignment
+        self.enable_non_cpu_memory_planning = enable_non_cpu_memory_planning
         self.state = _MemoryPlanningState()
 
     def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None:
@@ -250,6 +252,7 @@ def run(
             # If mutable buffers are shared, then do not allocate them in the
             # main memory planning algo; they are allocated in run_multimethod.
             self.alloc_mutable_buffers and not self.share_mutable_buffers,
+            self.enable_non_cpu_memory_planning,
         )
 
         if self.share_mutable_buffers and graph_signature is not None:
diff --git a/exir/program/_program.py b/exir/program/_program.py
index 9813b12d594..f1a22773b69 100644
--- a/exir/program/_program.py
+++ b/exir/program/_program.py
@@ -1792,6 +1792,12 @@ def to_executorch(  # noqa (FLAKE8) C901
             else:
                 memory_planning_pass = config.memory_planning_pass
             # TODO(jakeszwe): Follow up with compiler on if the deepcopy is necessary and if so how to make it work
+            # Propagate enable_non_cpu_memory_planning from the top-level config
+            # to the pass instance so that device-aware partitioning is applied.
+            if hasattr(memory_planning_pass, "enable_non_cpu_memory_planning"):
+                memory_planning_pass.enable_non_cpu_memory_planning = (
+                    config.enable_non_cpu_memory_planning
+                )
             if hasattr(memory_planning_pass, "run"):
                 new_gm_res = memory_planning_pass.run(new_gm, new_signature)
             else:
diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py
index f364541d900..27ecbdfe633 100644
--- a/exir/tests/test_memory_planning.py
+++ b/exir/tests/test_memory_planning.py
@@ -29,6 +29,8 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.memory_planning import (
     _do_user_inputs_exist,
+    apply_algo,
+    collect_specs_from_nodes,
     filter_nodes,
     get_node_tensor_specs,
     greedy,
@@ -45,6 +47,7 @@
     ToOutVarPass,
 )
 from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
+from executorch.exir.schema import DeviceType
 from executorch.exir.tensor import TensorSpec
 from functorch.experimental.control_flow import map as torch_map
 from parameterized import parameterized
@@ -1259,3 +1262,169 @@ def reset(self, k_zeros: torch.Tensor, v_zeros: torch.Tensor) -> None:
             self.assertEqual(v_cache[0].val.allocation_info.memory_id, 2)
             self.assertEqual(v_cache[0].val.allocation_info.memory_offset_low, 256)
             self.assertEqual(v_cache[0].val.allocation_info.memory_offset_high, 0)
+
+
+class TestDeviceAwareMemoryPlanning(unittest.TestCase):
+    """Tests for per-device memory planning (separate buffers per device type)."""
+
+    def _prepare_model(
+        self,
+    ) -> Tuple[GraphModule, ExportGraphSignature]:
+        """Prepare ToyModelForMemPlanning through SpecPropPass + ToOutVarPass."""
+        model = ToyModelForMemPlanning()
+        inputs = model.get_random_inputs()
+        edge = to_edge(export(model, inputs, strict=True))
+        gm = edge.exported_program().graph_module
+        gs = edge.exported_program().graph_signature
+        gm = PassManager(passes=[SpecPropPass(), ToOutVarPass()])(gm).graph_module
+        return gm, gs
+
+    def _get_planned_specs(
+        self,
+        gm: GraphModule,
+        gs: ExportGraphSignature,
+    ) -> list[TensorSpec]:
+        """Get the unique set of specs that apply_algo would plan."""
+        return list(
+            collect_specs_from_nodes(
+                gm.graph.nodes,
+                gs,
+                do_assertion=False,
+                ignore_graph_input=False,
+                ignore_graph_output=False,
+                ignore_mutable_buffers=False,
+            )
+        )
+
+    def test_cpu_only_unchanged(self) -> None:
+        """CPU-only specs produce bufsizes = [0, X] with no device metadata."""
+        gm, gs = self._prepare_model()
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        bufsizes = apply_algo(
+            algo, gm, 16, gs, enable_non_cpu_memory_planning=True
+        )
+
+        # The CUDA spec is the only tensor in its buffer
+        self.assertEqual(bufsizes[0], 0)  # constants
+        self.assertGreater(bufsizes[1], 0)  # CPU activations
+        self.assertNotIn("non_const_buffer_device", gm.meta)
+
+    def test_all_cuda_no_wasted_slots(self) -> None:
+        """CUDA-only specs produce [0, X] with CUDA at buffer index 1."""
+        gm, gs = self._prepare_model()
+        specs = self._get_planned_specs(gm, gs)
+        for spec in specs:
+            spec.device = DeviceType.CUDA
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        bufsizes = apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True)
+
+        # [0, cuda_size] — no wasted CPU buffer slot
+        self.assertEqual(len(bufsizes), 2)
+        self.assertEqual(bufsizes[0], 0)
+        self.assertGreater(bufsizes[1], 0)
+        # Device mapping should be present
+        self.assertIn("non_const_buffer_device", gm.meta)
+        device_map = gm.meta["non_const_buffer_device"]
+        self.assertEqual(len(device_map), 2)
+        self.assertEqual(device_map[0].device_type, DeviceType.CPU)  # constants
+        self.assertEqual(device_map[1].device_type, DeviceType.CUDA)
+
+    def test_mixed_cpu_cuda_separate_buffers(self) -> None:
+        """CPU specs at mem_id=1, CUDA specs at mem_id=2, separate sizes."""
+        gm, gs = self._prepare_model()
+        specs = self._get_planned_specs(gm, gs)
+
+        # Set second half of specs to CUDA
+        mid = len(specs) // 2
+        self.assertGreater(mid, 0)
+        cpu_specs = specs[:mid]
+        cuda_specs = specs[mid:]
+        for spec in cuda_specs:
+            spec.device = DeviceType.CUDA
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        bufsizes = apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True)
+
+        # [constants, cpu_activations, cuda_activations]
+        self.assertEqual(len(bufsizes), 3)
+        self.assertEqual(bufsizes[0], 0)
+        self.assertGreater(bufsizes[1], 0)
+        self.assertGreater(bufsizes[2], 0)
+
+        # CPU specs should have mem_id=1, CUDA specs should have mem_id=2
+        for spec in cpu_specs:
+            self.assertEqual(spec.mem_id, 1, f"CPU spec has wrong mem_id: {spec.mem_id}")
+        for spec in cuda_specs:
+            self.assertEqual(spec.mem_id, 2, f"CUDA spec has wrong mem_id: {spec.mem_id}")
+
+    def test_mem_offset_correct_after_remap(self) -> None:
+        """After remapping, mem_offset is relative to its own buffer."""
+        gm, gs = self._prepare_model()
+        specs = self._get_planned_specs(gm, gs)
+
+        # Set the last spec to CUDA (sole CUDA tensor)
+        cuda_spec = specs[-1]
+        cuda_spec.device = DeviceType.CUDA
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        bufsizes = apply_algo(
+            algo, gm, 16, gs, enable_non_cpu_memory_planning=True
+        )
+
+        # The CUDA spec is the only tensor in its buffer, so offset should be 0
+        self.assertEqual(cuda_spec.mem_offset, 0)
+        # The CUDA buffer should fit exactly this tensor
+        cuda_mem_id = cuda_spec.mem_id
+        self.assertIsNotNone(cuda_mem_id)
+        assert cuda_mem_id is not None
+        self.assertGreaterEqual(bufsizes[cuda_mem_id], cuda_spec.allocated_memory)
+
+    def test_no_cross_device_memory_sharing(self) -> None:
+        """Specs on different devices never share buffers, regardless of lifetime."""
+        gm, gs = self._prepare_model()
+        specs = self._get_planned_specs(gm, gs)
+        self.assertGreaterEqual(len(specs), 2)
+
+        # Assign alternating specs to CUDA to ensure some pairs have
+        # non-overlapping lifetimes (which greedy would normally share).
+        for i, spec in enumerate(specs):
+            if i % 2 == 0:
+                spec.device = DeviceType.CUDA
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True)
+
+        # Verify CPU and CUDA specs have disjoint mem_ids
+        cpu_mem_ids: set[int] = set()
+        cuda_mem_ids: set[int] = set()
+        for i, spec in enumerate(specs):
+            if spec.mem_id is not None:
+                if i % 2 == 0:
+                    cuda_mem_ids.add(spec.mem_id)
+                else:
+                    cpu_mem_ids.add(spec.mem_id)
+
+        self.assertTrue(
+            cpu_mem_ids.isdisjoint(cuda_mem_ids),
+            f"CPU {cpu_mem_ids} and CUDA {cuda_mem_ids} should not share buffers",
+        )
+
+    def test_disabled_falls_back_to_cpu(self) -> None:
+        """With enable_non_cpu_memory_planning=False (default), CUDA specs are
+        planned into CPU memory — no device-specific buffers are created."""
+        gm, gs = self._prepare_model()
+        specs = self._get_planned_specs(gm, gs)
+        for spec in specs:
+            spec.device = DeviceType.CUDA
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        # Default: enable_non_cpu_memory_planning=False
+        bufsizes = apply_algo(algo, gm, 16, gs)
+
+        # All specs planned into a single CPU pool — same as CPU-only
+        self.assertEqual(len(bufsizes), 2)
+        self.assertEqual(bufsizes[0], 0)
+        self.assertGreater(bufsizes[1], 0)
+        self.assertNotIn("non_const_buffer_device", gm.meta)

From 0829c5d321b1156c0bc179eef87895cdd89eb375 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 24 Mar 2026 14:58:38 -0700
Subject: [PATCH 07/15] [ET Device Support] Emitter reads
 non_const_buffer_device from graph meta

Enable serialzing non_const_buffer_device into into PTE file.

Differential Revision: [D97850707](https://our.internmc.facebook.com/intern/diff/D97850707/)

[ghstack-poisoned]
---
 exir/emit/_emitter.py       |   5 +
 exir/emit/test/test_emit.py | 183 ++++++++++++++++++++++++++++++++++++
 2 files changed, 188 insertions(+)

diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py
index a48d88fa224..5c1f2db465d 100644
--- a/exir/emit/_emitter.py
+++ b/exir/emit/_emitter.py
@@ -2073,4 +2073,9 @@ def plan(self) -> ExecutionPlan:
                 self.module.meta["non_const_buffer_sizes"],
             ),
             container_meta_type=self.container_meta_type,
+            # non_const_buffer_device is set by apply_algo in memory_planning.py
+            # when device tensors are present. None for CPU-only programs.
+            non_const_buffer_device=self.module.meta.get(
+                "non_const_buffer_device", None
+            ),
         )
diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py
index 04284398656..7d08a94a8af 100644
--- a/exir/emit/test/test_emit.py
+++ b/exir/emit/test/test_emit.py
@@ -2643,3 +2643,186 @@ def forward(self, a, b):
             0,
             "No tensor should have CUDA device when model runs entirely on CPU",
         )
+
+    def test_emit_non_const_buffer_device_populated_for_device_tensors(self) -> None:
+        """Verify that non_const_buffer_device is emitted into ExecutionPlan when
+        device-aware memory planning is enabled and non-CPU tensors are present."""
+        from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
+            generate_pattern_op_partitions,
+        )
+        from executorch.exir.backend.compile_spec_schema import CompileSpec
+        from executorch.exir.backend.partitioner import (
+            DelegationSpec,
+            Partitioner,
+            PartitionResult,
+        )
+        from executorch.exir.backend.test.backend_with_compiler_demo import (
+            BackendWithCompilerDemo,
+        )
+        from executorch.exir.passes.propagate_device_pass import (
+            TARGET_DEVICE_COMPILE_SPEC_KEY,
+        )
+        from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
+
+        class AddSupport(OperatorSupportBase):
+            def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+                return node.op == "call_function" and node.target in [
+                    exir_ops.edge.aten.add.Tensor,
+                ]
+
+        class DevicePartitioner(Partitioner):
+            def __init__(self):
+                super().__init__()
+                self.delegation_spec = DelegationSpec(
+                    BackendWithCompilerDemo.__name__,
+                    [
+                        CompileSpec("max_value", bytes([4])),
+                        CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
+                    ],
+                )
+
+            def partition(self, exported_program) -> PartitionResult:
+                partition_tags = {}
+                partition_list = generate_pattern_op_partitions(
+                    exported_program.graph_module,
+                    op_support=any_chain(AddSupport()),
+                )
+                for partition in partition_list:
+                    for node in partition.nodes:
+                        tag = f"tag{partition.id}"
+                        node.meta["delegation_tag"] = tag
+                        partition_tags[tag] = self.delegation_spec
+                return PartitionResult(
+                    tagged_exported_program=exported_program,
+                    partition_tags=partition_tags,
+                )
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        lowered = edge.to_backend(DevicePartitioner())
+        et_prog = lowered.to_executorch(
+            config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
+        )
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNotNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be set when device tensors are present "
+            "and enable_non_cpu_memory_planning is True",
+        )
+        self.assertGreater(len(plan.non_const_buffer_device), 0)
+        for entry in plan.non_const_buffer_device:
+            self.assertEqual(entry.device_type, schema.DeviceType.CUDA)
+            self.assertEqual(entry.device_index, 0)
+
+    def test_emit_non_const_buffer_device_none_for_cpu_only(self) -> None:
+        """When all tensors are on CPU, non_const_buffer_device should be None
+        even with enable_non_cpu_memory_planning=True."""
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        et_prog = edge.to_executorch(
+            config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
+        )
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be None for CPU-only programs",
+        )
+
+    def test_emit_non_const_buffer_device_none_when_flag_disabled(self) -> None:
+        """Even with device tensors, non_const_buffer_device should be None when
+        enable_non_cpu_memory_planning is False (default)."""
+        from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
+            generate_pattern_op_partitions,
+        )
+        from executorch.exir.backend.compile_spec_schema import CompileSpec
+        from executorch.exir.backend.partitioner import (
+            DelegationSpec,
+            Partitioner,
+            PartitionResult,
+        )
+        from executorch.exir.backend.test.backend_with_compiler_demo import (
+            BackendWithCompilerDemo,
+        )
+        from executorch.exir.passes.propagate_device_pass import (
+            TARGET_DEVICE_COMPILE_SPEC_KEY,
+        )
+        from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
+
+        class AddSupport(OperatorSupportBase):
+            def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+                return node.op == "call_function" and node.target in [
+                    exir_ops.edge.aten.add.Tensor,
+                ]
+
+        class DevicePartitioner(Partitioner):
+            def __init__(self):
+                super().__init__()
+                self.delegation_spec = DelegationSpec(
+                    BackendWithCompilerDemo.__name__,
+                    [
+                        CompileSpec("max_value", bytes([4])),
+                        CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
+                    ],
+                )
+
+            def partition(self, exported_program) -> PartitionResult:
+                partition_tags = {}
+                partition_list = generate_pattern_op_partitions(
+                    exported_program.graph_module,
+                    op_support=any_chain(AddSupport()),
+                )
+                for partition in partition_list:
+                    for node in partition.nodes:
+                        tag = f"tag{partition.id}"
+                        node.meta["delegation_tag"] = tag
+                        partition_tags[tag] = self.delegation_spec
+                return PartitionResult(
+                    tagged_exported_program=exported_program,
+                    partition_tags=partition_tags,
+                )
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        lowered = edge.to_backend(DevicePartitioner())
+        # Default: enable_non_cpu_memory_planning=False
+        et_prog = lowered.to_executorch()
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be None when "
+            "enable_non_cpu_memory_planning is False",
+        )

From 9dc075bff1e8d77a41a549c5e06e9f68d4d80d41 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 24 Mar 2026 14:59:55 -0700
Subject: [PATCH 08/15] [ET Device Support] DeviceMemoryBuffer RAII class for
 device memory lifetime management

Introduces DeviceMemoryBuffer, an RAII wrapper that owns a single device
memory allocation. On destruction, it automatically calls
DeviceAllocator::deallocate() to free the memory. This mirrors the role of
std::vector<uint8_t> for CPU planned buffers, but for non-cpu device memory (CUDA, etc.).

Key features:
- Static factory create(size, type, index) looks up DeviceAllocator from registry
- Move-only semantics (no copy) to enforce single ownership
- as_span() accessor wraps device pointer for use with HierarchicalAllocator
- Destructor is no-op for default-constructed or moved-from instances

Differential Revision: [D97850709](https://our.internmc.facebook.com/intern/diff/D97850709/)

[ghstack-poisoned]
---
 runtime/core/device_memory_buffer.cpp         |  34 ++++
 runtime/core/device_memory_buffer.h           | 126 +++++++++++++
 runtime/core/portable_type/targets.bzl        |   1 +
 runtime/core/targets.bzl                      |  27 +++
 .../core/test/device_memory_buffer_test.cpp   | 169 ++++++++++++++++++
 runtime/core/test/targets.bzl                 |   8 +
 6 files changed, 365 insertions(+)
 create mode 100644 runtime/core/device_memory_buffer.cpp
 create mode 100644 runtime/core/device_memory_buffer.h
 create mode 100644 runtime/core/test/device_memory_buffer_test.cpp

diff --git a/runtime/core/device_memory_buffer.cpp b/runtime/core/device_memory_buffer.cpp
new file mode 100644
index 00000000000..7eb3f0e3ae2
--- /dev/null
+++ b/runtime/core/device_memory_buffer.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_memory_buffer.h>
+
+namespace executorch::runtime {
+
+Result<DeviceMemoryBuffer> DeviceMemoryBuffer::create(
+    size_t size,
+    etensor::DeviceType type,
+    etensor::DeviceIndex index) {
+  DeviceAllocator* allocator = get_device_allocator(type);
+  if (allocator == nullptr) {
+    ET_LOG(
+        Error,
+        "No device allocator registered for device type %d",
+        static_cast<int>(type));
+    return Error::NotFound;
+  }
+
+  auto result = allocator->allocate(size, index);
+  if (!result.ok()) {
+    return result.error();
+  }
+
+  return DeviceMemoryBuffer(result.get(), size, allocator, index);
+}
+
+} // namespace executorch::runtime
diff --git a/runtime/core/device_memory_buffer.h b/runtime/core/device_memory_buffer.h
new file mode 100644
index 00000000000..7071f3de58d
--- /dev/null
+++ b/runtime/core/device_memory_buffer.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+
+namespace executorch::runtime {
+
+/**
+ * RAII wrapper that owns a single device memory allocation.
+ *
+ * On destruction, calls DeviceAllocator::deallocate() to free the memory.
+ * This mirrors the role of std::vector<uint8_t> for CPU planned buffers,
+ * but for device memory (CUDA, etc.).
+ *
+ * Move-only: cannot be copied, but can be moved to transfer ownership.
+ */
+class DeviceMemoryBuffer final {
+ public:
+  /**
+   * Creates a DeviceMemoryBuffer by allocating device memory.
+   *
+   * Looks up the DeviceAllocator for the given device type via the
+   * DeviceAllocatorRegistry. If no allocator is registered for the type,
+   * returns Error::NotFound.
+   *
+   * @param size Number of bytes to allocate.
+   * @param type The device type (e.g., CUDA).
+   * @param index The device index (e.g., 0 for cuda:0).
+   * @return A Result containing the DeviceMemoryBuffer on success, or an error.
+   */
+  static Result<DeviceMemoryBuffer> create(
+      size_t size,
+      etensor::DeviceType type,
+      etensor::DeviceIndex index = 0);
+
+  DeviceMemoryBuffer() = default;
+
+  ~DeviceMemoryBuffer() {
+    if (ptr_ != nullptr && allocator_ != nullptr) {
+      allocator_->deallocate(ptr_, device_index_);
+    }
+  }
+
+  // Move constructor: transfer ownership.
+  DeviceMemoryBuffer(DeviceMemoryBuffer&& other) noexcept
+      : ptr_(other.ptr_),
+        size_(other.size_),
+        allocator_(other.allocator_),
+        device_index_(other.device_index_) {
+    other.ptr_ = nullptr;
+    other.size_ = 0;
+    other.allocator_ = nullptr;
+  }
+
+  // Move assignment: release current, take ownership.
+  DeviceMemoryBuffer& operator=(DeviceMemoryBuffer&& other) noexcept {
+    if (this != &other) {
+      if (ptr_ != nullptr && allocator_ != nullptr) {
+        allocator_->deallocate(ptr_, device_index_);
+      }
+      ptr_ = other.ptr_;
+      size_ = other.size_;
+      allocator_ = other.allocator_;
+      device_index_ = other.device_index_;
+      other.ptr_ = nullptr;
+      other.size_ = 0;
+      other.allocator_ = nullptr;
+    }
+    return *this;
+  }
+
+  // Non-copyable.
+  DeviceMemoryBuffer(const DeviceMemoryBuffer&) = delete;
+  DeviceMemoryBuffer& operator=(const DeviceMemoryBuffer&) = delete;
+
+  /// Returns the device pointer, or nullptr if empty/moved-from.
+  void* data() const {
+    return ptr_;
+  }
+
+  /// Returns the size in bytes of the allocation.
+  size_t size() const {
+    return size_;
+  }
+
+  /**
+   * Returns a Span<uint8_t> wrapping the device pointer.
+   *
+   * This is intended for use with HierarchicalAllocator, which only performs
+   * pointer arithmetic on the span data and never dereferences it. Device
+   * pointers are valid for pointer arithmetic from the CPU side.
+   */
+  Span<uint8_t> as_span() const {
+    return {static_cast<uint8_t*>(ptr_), size_};
+  }
+
+ private:
+  DeviceMemoryBuffer(
+      void* ptr,
+      size_t size,
+      DeviceAllocator* allocator,
+      etensor::DeviceIndex device_index)
+      : ptr_(ptr),
+        size_(size),
+        allocator_(allocator),
+        device_index_(device_index) {}
+
+  void* ptr_ = nullptr;
+  size_t size_ = 0;
+  DeviceAllocator* allocator_ = nullptr;
+  etensor::DeviceIndex device_index_ = 0;
+};
+
+} // namespace executorch::runtime
diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl
index 5b6e67fa213..33f18c68006 100644
--- a/runtime/core/portable_type/targets.bzl
+++ b/runtime/core/portable_type/targets.bzl
@@ -27,6 +27,7 @@ def define_common_targets():
             "//executorch/backends/...",
             "//executorch/extension/fb/dynamic_shim/...",
             "//executorch/kernels/portable/cpu/...",
+            "//executorch/runtime/core/...",
             "//executorch/runtime/core/exec_aten/...",
             "//executorch/runtime/core/portable_type/test/...",
         ],
diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl
index 2c13cdbdae3..9b40e947626 100644
--- a/runtime/core/targets.bzl
+++ b/runtime/core/targets.bzl
@@ -141,6 +141,33 @@ def define_common_targets():
             visibility = ["//executorch/..."],
         )
 
+    runtime.cxx_library(
+        name = "device_allocator",
+        srcs = ["device_allocator.cpp"],
+        exported_headers = [
+            "device_allocator.h",
+        ],
+        exported_deps = [
+            ":core",
+            "//executorch/runtime/core/portable_type:portable_type",
+        ],
+        deps = [
+            "//executorch/runtime/platform:platform",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
+    runtime.cxx_library(
+        name = "device_memory_buffer",
+        srcs = ["device_memory_buffer.cpp"],
+        exported_headers = ["device_memory_buffer.h"],
+        exported_deps = [
+            ":core",
+            ":device_allocator",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
     runtime.cxx_library(
         name = "tag",
         srcs = ["tag.cpp"],
diff --git a/runtime/core/test/device_memory_buffer_test.cpp b/runtime/core/test/device_memory_buffer_test.cpp
new file mode 100644
index 00000000000..81d0a757cf4
--- /dev/null
+++ b/runtime/core/test/device_memory_buffer_test.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_memory_buffer.h>
+
+#include <gtest/gtest.h>
+
+#include <executorch/runtime/platform/runtime.h>
+
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+/**
+ * A mock DeviceAllocator for testing DeviceMemoryBuffer.
+ * Returns pointers into a local buffer and tracks call counts.
+ */
+class MockAllocator : public DeviceAllocator {
+ public:
+  explicit MockAllocator(DeviceType type) : type_(type) {}
+
+  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+    allocate_count_++;
+    last_allocate_size_ = nbytes;
+    return static_cast<void*>(buffer_);
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {
+    deallocate_count_++;
+    last_deallocate_ptr_ = ptr;
+  }
+
+  Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return type_;
+  }
+
+  int allocate_count_ = 0;
+  int deallocate_count_ = 0;
+  size_t last_allocate_size_ = 0;
+  void* last_deallocate_ptr_ = nullptr;
+  uint8_t buffer_[256] = {};
+
+ private:
+  DeviceType type_;
+};
+
+// Global mock registered once before all tests run.
+static MockAllocator g_mock_cuda(DeviceType::CUDA);
+
+class DeviceMemoryBufferTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
+  }
+
+  void SetUp() override {
+    // Reset counters before each test.
+    g_mock_cuda.allocate_count_ = 0;
+    g_mock_cuda.deallocate_count_ = 0;
+    g_mock_cuda.last_allocate_size_ = 0;
+    g_mock_cuda.last_deallocate_ptr_ = nullptr;
+  }
+};
+
+TEST_F(DeviceMemoryBufferTest, DefaultConstructedIsEmpty) {
+  DeviceMemoryBuffer buf;
+  EXPECT_EQ(buf.data(), nullptr);
+  EXPECT_EQ(buf.size(), 0);
+
+  auto span = buf.as_span();
+  EXPECT_EQ(span.data(), nullptr);
+  EXPECT_EQ(span.size(), 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateAllocatesAndDestructorDeallocates) {
+  {
+    auto result = DeviceMemoryBuffer::create(1024, DeviceType::CUDA, 0);
+    ASSERT_TRUE(result.ok());
+
+    auto buf = std::move(result.get());
+    EXPECT_NE(buf.data(), nullptr);
+    EXPECT_EQ(buf.size(), 1024);
+    EXPECT_EQ(g_mock_cuda.allocate_count_, 1);
+    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 1024);
+    EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+  }
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_deallocate_ptr_, g_mock_cuda.buffer_);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateFailsWithNoRegisteredAllocator) {
+  auto result = DeviceMemoryBuffer::create(512, DeviceType::CPU, 0);
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), Error::NotFound);
+}
+
+TEST_F(DeviceMemoryBufferTest, MoveConstructorTransfersOwnership) {
+  auto result = DeviceMemoryBuffer::create(256, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto original = std::move(result.get());
+  void* original_ptr = original.data();
+
+  DeviceMemoryBuffer moved(std::move(original));
+
+  EXPECT_EQ(original.data(), nullptr);
+  EXPECT_EQ(original.size(), 0);
+  EXPECT_EQ(moved.data(), original_ptr);
+  EXPECT_EQ(moved.size(), 256);
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, MoveAssignmentTransfersOwnership) {
+  auto result = DeviceMemoryBuffer::create(128, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto original = std::move(result.get());
+  void* original_ptr = original.data();
+
+  DeviceMemoryBuffer target;
+  target = std::move(original);
+
+  EXPECT_EQ(original.data(), nullptr);
+  EXPECT_EQ(target.data(), original_ptr);
+  EXPECT_EQ(target.size(), 128);
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, DestructorNoOpForDefaultConstructed) {
+  {
+    DeviceMemoryBuffer buf;
+  }
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, AsSpanWrapsDevicePointer) {
+  auto result = DeviceMemoryBuffer::create(2048, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto buf = std::move(result.get());
+
+  auto span = buf.as_span();
+  EXPECT_EQ(span.data(), static_cast<uint8_t*>(buf.data()));
+  EXPECT_EQ(span.size(), 2048);
+}
diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl
index 1ad0940c62e..0436d3e10dd 100644
--- a/runtime/core/test/targets.bzl
+++ b/runtime/core/test/targets.bzl
@@ -7,6 +7,14 @@ def define_common_targets():
     TARGETS and BUCK files that call this function.
     """
 
+    runtime.cxx_test(
+        name = "device_memory_buffer_test",
+        srcs = ["device_memory_buffer_test.cpp"],
+        deps = [
+            "//executorch/runtime/core:device_memory_buffer",
+        ],
+    )
+
     runtime.cxx_test(
         name = "span_test",
         srcs = ["span_test.cpp"],

From 1fdbbd16da03c92b7b71e90f03acb6dbe5bb9ab9 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 24 Mar 2026 15:01:28 -0700
Subject: [PATCH 09/15] [ET Device Support] MethodMeta: expose per-buffer
 device placement API

Add memory_planned_buffer_device(index) to MethodMeta, returning the
Device (type + index) for each planned memory buffer. This reads from
the non_const_buffer_device field in the serialized ExecutionPlan.

For CPU-only programs (or legacy PTE files without non_const_buffer_device),
all buffers default to Device{CPU, 0}. The sparse list only stores entries
for non-CPU buffers, so the lookup scans for a matching buffer_idx.

This API enables Module::load_method() to query each buffer's target device
and allocate accordingly (malloc for CPU, DeviceAllocator for CUDA, etc.).

Differential Revision: [D97850708](https://our.internmc.facebook.com/intern/diff/D97850708/)

[ghstack-poisoned]
---
 runtime/core/test/targets.bzl                 |  2 +-
 runtime/executor/method_meta.cpp              | 36 +++++++++++++
 runtime/executor/method_meta.h                | 14 +++++
 runtime/executor/test/method_meta_test.cpp    | 51 +++++++++++++++++++
 runtime/executor/test/targets.bzl             |  7 ++-
 .../models/export_program_with_device_info.py |  7 ++-
 6 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl
index 0436d3e10dd..c4da8cc37de 100644
--- a/runtime/core/test/targets.bzl
+++ b/runtime/core/test/targets.bzl
@@ -50,7 +50,7 @@ def define_common_targets():
             "//executorch/runtime/core:core",
         ],
     )
-    
+
     runtime.cxx_test(
         name = "event_tracer_test",
         srcs = [
diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp
index 75dadfd893a..ca1b3254338 100644
--- a/runtime/executor/method_meta.cpp
+++ b/runtime/executor/method_meta.cpp
@@ -325,6 +325,42 @@ Result<int64_t> MethodMeta::memory_planned_buffer_size(size_t index) const {
   return s_plan_->non_const_buffer_sizes()->Get(index + 1);
 }
 
+Result<etensor::Device> MethodMeta::memory_planned_buffer_device(
+    size_t index) const {
+  auto num_buffers = this->num_memory_planned_buffers();
+  ET_CHECK_OR_RETURN_ERROR(
+      index < num_buffers,
+      InvalidArgument,
+      "index %zu out of range. num_buffers: %zu",
+      index,
+      num_buffers);
+
+  // The non_const_buffer_device field is optional and only present when the
+  // program contains non-CPU buffers. For CPU-only programs (or legacy PTE
+  // files), this field is null and all buffers default to CPU.
+  auto* buffer_devices = s_plan_->non_const_buffer_device();
+  if (buffer_devices == nullptr) {
+    return etensor::Device{etensor::DeviceType::CPU, 0};
+  }
+
+  // The sparse list only contains entries for non-CPU buffers.
+  // buffer_idx uses the same indexing as non_const_buffer_sizes (1-based,
+  // with index 0 reserved). The user-facing index is 0-based, so we
+  // compare against index + 1.
+  const auto internal_idx = static_cast<int32_t>(index + 1);
+  for (size_t i = 0; i < buffer_devices->size(); ++i) {
+    auto entry = buffer_devices->Get(i);
+    if (entry->buffer_idx() == internal_idx) {
+      return etensor::Device{
+          static_cast<etensor::DeviceType>(entry->device_type()),
+          static_cast<etensor::DeviceIndex>(entry->device_index())};
+    }
+  }
+
+  // Not found in the sparse list — this buffer is on CPU.
+  return etensor::Device{etensor::DeviceType::CPU, 0};
+}
+
 bool MethodMeta::uses_backend(const char* backend_name) const {
   ET_CHECK_MSG(backend_name, "backend name is null");
   const auto delegates = s_plan_->delegates();
diff --git a/runtime/executor/method_meta.h b/runtime/executor/method_meta.h
index 79fd05c28ee..e0fa16cda22 100644
--- a/runtime/executor/method_meta.h
+++ b/runtime/executor/method_meta.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/portable_type/device.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/core/span.h>
 #include <executorch/runtime/core/tag.h>
@@ -234,6 +235,19 @@ class MethodMeta final {
    */
   Result<int64_t> memory_planned_buffer_size(size_t index) const;
 
+  /**
+   * Get the device placement for the specified memory-planned buffer.
+   *
+   * For CPU-only programs (no non_const_buffer_device in the PTE), all buffers
+   * default to Device{CPU, 0}. For programs with device annotations, returns
+   * the device type and index that the buffer should be allocated on.
+   *
+   * @param[in] index The index of the buffer to look up (0-based, same
+   *     indexing as memory_planned_buffer_size()).
+   * @returns The Device on success, or an error on failure.
+   */
+  Result<etensor::Device> memory_planned_buffer_device(size_t index) const;
+
   /**
    * Check to see if a backend is used in this method.
    *
diff --git a/runtime/executor/test/method_meta_test.cpp b/runtime/executor/test/method_meta_test.cpp
index e4ef2e72a85..4b2fdb26da2 100644
--- a/runtime/executor/test/method_meta_test.cpp
+++ b/runtime/executor/test/method_meta_test.cpp
@@ -74,6 +74,10 @@ class MethodMetaTest : public ::testing::Test {
   void SetUp() override {
     load_program(std::getenv("ET_MODULE_ADD_PATH"), "add");
     load_program(std::getenv("ET_MODULE_STATEFUL_PATH"), "stateful");
+    const char* device_path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+    if (device_path != nullptr) {
+      load_program(device_path, "add_with_device");
+    }
   }
 
  private:
@@ -192,6 +196,27 @@ TEST_F(MethodMetaTest, MethodMetaAttribute) {
   ASSERT_EQ(bad_access.error(), Error::InvalidArgument);
 }
 
+TEST_F(MethodMetaTest, MemoryPlannedBufferDeviceDefaultsCpu) {
+  Result<MethodMeta> method_meta = programs_["add"]->method_meta("forward");
+  ASSERT_EQ(method_meta.error(), Error::Ok);
+
+  // CPU-only model: all buffers should default to CPU device.
+  size_t num_buffers = method_meta->num_memory_planned_buffers();
+  ASSERT_GT(num_buffers, 0);
+
+  for (size_t i = 0; i < num_buffers; ++i) {
+    auto device = method_meta->memory_planned_buffer_device(i);
+    ASSERT_TRUE(device.ok());
+    EXPECT_EQ(device->type(), executorch::runtime::etensor::DeviceType::CPU);
+    EXPECT_EQ(device->index(), 0);
+  }
+
+  // Out of range returns error.
+  EXPECT_EQ(
+      method_meta->memory_planned_buffer_device(num_buffers).error(),
+      Error::InvalidArgument);
+}
+
 TEST_F(MethodMetaTest, TensorInfoSizeOverflow) {
   // Create sizes that will cause overflow when multiplied
   std::vector<int32_t> overflow_sizes = {
@@ -214,3 +239,29 @@ TEST_F(MethodMetaTest, TensorInfoSizeOverflow) {
           executorch::aten::string_view{nullptr, 0}),
       "");
 }
+
+TEST_F(MethodMetaTest, MethodMetaBufferDeviceReturnsCudaForDeviceBuffer) {
+  ASSERT_NE(programs_.find("add_with_device"), programs_.end())
+      << "ET_MODULE_ADD_WITH_DEVICE_PATH env var not set";
+  Result<MethodMeta> method_meta =
+      programs_["add_with_device"]->method_meta("forward");
+  ASSERT_EQ(method_meta.error(), Error::Ok);
+
+  // ModuleAddWithDevice exports with enable_non_cpu_memory_planning=True.
+  // The model delegates add(a,b) to CUDA, producing:
+  //   non_const_buffer_sizes: [0, 48]  (index 0 reserved)
+  //   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}]
+  // So there is exactly 1 planned buffer (user-facing index 0), on CUDA.
+  ASSERT_EQ(method_meta->num_memory_planned_buffers(), 1);
+
+  // Buffer 0 should be CUDA device.
+  auto device = method_meta->memory_planned_buffer_device(0);
+  ASSERT_TRUE(device.ok());
+  EXPECT_EQ(device->type(), executorch::runtime::etensor::DeviceType::CUDA);
+  EXPECT_EQ(device->index(), 0);
+
+  // Out of range should return error.
+  EXPECT_EQ(
+      method_meta->memory_planned_buffer_device(1).error(),
+      Error::InvalidArgument);
+}
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
index f4534aefdea..74ea9a8262d 100644
--- a/runtime/executor/test/targets.bzl
+++ b/runtime/executor/test/targets.bzl
@@ -178,7 +178,12 @@ def define_common_targets(is_fbcode = False):
                 "//executorch/runtime/executor:program",
                 "//executorch/extension/data_loader:file_data_loader",
             ],
-            env = modules_env,
+            env = dict(
+                modules_env,
+                **{
+                    "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])",
+                }
+            ),
         )
 
         runtime.cxx_test(
diff --git a/test/models/export_program_with_device_info.py b/test/models/export_program_with_device_info.py
index 1abf73bfb73..246c41bb9f3 100644
--- a/test/models/export_program_with_device_info.py
+++ b/test/models/export_program_with_device_info.py
@@ -99,7 +99,12 @@ def main() -> None:
         compile_config=EdgeCompileConfig(_check_ir_validity=False),
     )
     lowered = edge.to_backend(_DeviceAwarePartitioner())
-    et_prog = lowered.to_executorch(ExecutorchBackendConfig(emit_stacktrace=False))
+    et_prog = lowered.to_executorch(
+        ExecutorchBackendConfig(
+            emit_stacktrace=False,
+            enable_non_cpu_memory_planning=True,
+        )
+    )
 
     os.makedirs(args.outdir, exist_ok=True)
     outfile = os.path.join(args.outdir, "ModuleAddWithDevice.pte")

From 6af305941c640ed3ebd329cdc2172ae3c06141a8 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 24 Mar 2026 15:01:37 -0700
Subject: [PATCH 10/15] [ET Device Support] MemoryManager: add per-buffer
 device metadata

This diff extend MemoryManager with optional per-buffer device type metadata so the runtime explicitly knows which planned memory buffers are on which device. This enables future device-aware dispatch and debugging.

Changes:
- New constructor taking planned_buffer_devices as extra input for device info
- New accessors: planned_buffer_devices(), has_device_memory()
- No existing functionalities have been updated.

Differential Revision: [D97850706](https://our.internmc.facebook.com/intern/diff/D97850706/)

[ghstack-poisoned]
---
 runtime/core/portable_type/targets.bzl        |   1 +
 runtime/executor/memory_manager.h             |  46 +++++
 runtime/executor/targets.bzl                  |   1 +
 runtime/executor/test/memory_manager_test.cpp |  44 +++++
 runtime/executor/test/targets.bzl             |   3 +
 .../test/tensor_parser_device_test.cpp        | 170 ++++++++++++++++++
 6 files changed, 265 insertions(+)

diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl
index 33f18c68006..66fa9986e15 100644
--- a/runtime/core/portable_type/targets.bzl
+++ b/runtime/core/portable_type/targets.bzl
@@ -28,6 +28,7 @@ def define_common_targets():
             "//executorch/extension/fb/dynamic_shim/...",
             "//executorch/kernels/portable/cpu/...",
             "//executorch/runtime/core/...",
+            "//executorch/runtime/executor/...",
             "//executorch/runtime/core/exec_aten/...",
             "//executorch/runtime/core/portable_type/test/...",
         ],
diff --git a/runtime/executor/memory_manager.h b/runtime/executor/memory_manager.h
index 42edd9f0bea..b80344d4c9c 100644
--- a/runtime/executor/memory_manager.h
+++ b/runtime/executor/memory_manager.h
@@ -10,6 +10,8 @@
 
 #include <executorch/runtime/core/hierarchical_allocator.h>
 #include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/core/portable_type/device.h>
+#include <executorch/runtime/core/span.h>
 
 namespace executorch {
 namespace runtime {
@@ -61,6 +63,32 @@ class MemoryManager final {
         "method allocator cannot be the same as temp allocator");
   }
 
+  /**
+   * Constructs a new MemoryManager with per-buffer device metadata.
+   *
+   * @param[in] method_allocator Same as above.
+   * @param[in] planned_memory Same as above. May contain a mix of CPU and
+   *     device pointers — HierarchicalAllocator only does pointer arithmetic,
+   *     so device pointers are valid.
+   * @param[in] temp_allocator Same as above.
+   * @param[in] planned_buffer_devices One entry per planned memory buffer
+   *     (same count as planned_memory buffers), indicating the device type for
+   *     each buffer. For CPU-only programs, use the 3-arg constructor instead.
+   */
+  MemoryManager(
+      MemoryAllocator* method_allocator,
+      HierarchicalAllocator* planned_memory,
+      MemoryAllocator* temp_allocator,
+      Span<const etensor::DeviceType> planned_buffer_devices)
+      : method_allocator_(method_allocator),
+        planned_memory_(planned_memory),
+        temp_allocator_(temp_allocator),
+        planned_buffer_devices_(planned_buffer_devices) {
+    ET_CHECK_MSG(
+        method_allocator != temp_allocator,
+        "method allocator cannot be the same as temp allocator");
+  }
+
   /**
    * DEPRECATED: Use the constructor without `constant_allocator` instead.
    *
@@ -105,10 +133,28 @@ class MemoryManager final {
     return temp_allocator_;
   }
 
+  /**
+   * Returns per-buffer device metadata. One entry per planned memory buffer,
+   * same count as planned_memory buffers. Empty if no device metadata was
+   * provided (CPU-only program).
+   */
+  Span<const etensor::DeviceType> planned_buffer_devices() const {
+    return planned_buffer_devices_;
+  }
+
+  /**
+   * Returns true if any planned buffer is on a non-CPU device.
+   * When false, the memory setup is CPU-only and follows the legacy path.
+   */
+  bool has_device_memory() const {
+    return planned_buffer_devices_.size() > 0;
+  }
+
  private:
   MemoryAllocator* method_allocator_;
   HierarchicalAllocator* planned_memory_;
   MemoryAllocator* temp_allocator_;
+  Span<const etensor::DeviceType> planned_buffer_devices_;
 };
 
 } // namespace runtime
diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl
index 90f8d0221e9..2441c55b58e 100644
--- a/runtime/executor/targets.bzl
+++ b/runtime/executor/targets.bzl
@@ -36,6 +36,7 @@ def define_common_targets():
         ],
         exported_deps = [
             "//executorch/runtime/core:memory_allocator",
+            "//executorch/runtime/core/portable_type:portable_type",
         ],
         visibility = ["PUBLIC"],
     )
diff --git a/runtime/executor/test/memory_manager_test.cpp b/runtime/executor/test/memory_manager_test.cpp
index 0e1feb47793..3a7a07d145a 100644
--- a/runtime/executor/test/memory_manager_test.cpp
+++ b/runtime/executor/test/memory_manager_test.cpp
@@ -17,6 +17,8 @@ using namespace ::testing;
 using executorch::runtime::HierarchicalAllocator;
 using executorch::runtime::MemoryAllocator;
 using executorch::runtime::MemoryManager;
+using executorch::runtime::Span;
+using executorch::runtime::etensor::DeviceType;
 
 TEST(MemoryManagerTest, MinimalCtor) {
   MemoryAllocator method_allocator(0, nullptr);
@@ -93,3 +95,45 @@ TEST(MemoryManagerTest, CtorWithSameAllocator) {
           /*temp_allocator=*/&method_allocator),
       "cannot be the same");
 }
+
+TEST(MemoryManagerTest, ThreeArgCtorHasNoDeviceMemory) {
+  MemoryAllocator method_allocator(0, nullptr);
+  HierarchicalAllocator planned_memory({});
+  MemoryAllocator temp_allocator(0, nullptr);
+
+  MemoryManager mm(&method_allocator, &planned_memory, &temp_allocator);
+
+  EXPECT_FALSE(mm.has_device_memory());
+  EXPECT_EQ(mm.planned_buffer_devices().size(), 0);
+}
+
+TEST(MemoryManagerTest, FourArgCtorWithDeviceMetadata) {
+  MemoryAllocator method_allocator(0, nullptr);
+  HierarchicalAllocator planned_memory({});
+  MemoryAllocator temp_allocator(0, nullptr);
+
+  // 3 buffers: CPU, CUDA, CPU
+  DeviceType devices[] = {DeviceType::CPU, DeviceType::CUDA, DeviceType::CPU};
+  Span<const DeviceType> device_span(devices, 3);
+
+  MemoryManager mm(
+      &method_allocator, &planned_memory, &temp_allocator, device_span);
+
+  EXPECT_EQ(mm.method_allocator(), &method_allocator);
+  EXPECT_EQ(mm.planned_memory(), &planned_memory);
+  EXPECT_EQ(mm.temp_allocator(), &temp_allocator);
+  EXPECT_TRUE(mm.has_device_memory());
+  EXPECT_EQ(mm.planned_buffer_devices().size(), 3);
+  EXPECT_EQ(mm.planned_buffer_devices()[0], DeviceType::CPU);
+  EXPECT_EQ(mm.planned_buffer_devices()[1], DeviceType::CUDA);
+  EXPECT_EQ(mm.planned_buffer_devices()[2], DeviceType::CPU);
+}
+
+TEST(MemoryManagerTest, MinimalCtorHasNoDeviceMemory) {
+  MemoryAllocator method_allocator(0, nullptr);
+
+  MemoryManager mm(&method_allocator);
+
+  EXPECT_FALSE(mm.has_device_memory());
+  EXPECT_EQ(mm.planned_buffer_devices().size(), 0);
+}
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
index 74ea9a8262d..32baa63a76b 100644
--- a/runtime/executor/test/targets.bzl
+++ b/runtime/executor/test/targets.bzl
@@ -19,6 +19,7 @@ def define_common_targets(is_fbcode = False):
                 "//executorch/exir/backend/test/...",
                 "//executorch/runtime/backend/...",
                 "//executorch/extension/pybindings/...",
+                "//executorch/extension/module/test/...",
                 "//executorch/devtools/fb/runners/...",
                 "//executorch/test/...",
                 "//executorch/examples/...",
@@ -326,6 +327,8 @@ def define_common_targets(is_fbcode = False):
             deps = [
                 ":managed_memory_manager",
                 "//executorch/runtime/executor:program",
+                "//executorch/runtime/core:device_allocator",
+                "//executorch/runtime/core:device_memory_buffer",
                 "//executorch/extension/data_loader:file_data_loader",
                 "//executorch/schema:program",
             ],
diff --git a/runtime/executor/test/tensor_parser_device_test.cpp b/runtime/executor/test/tensor_parser_device_test.cpp
index 46488eacd0b..6baf525aa12 100644
--- a/runtime/executor/test/tensor_parser_device_test.cpp
+++ b/runtime/executor/test/tensor_parser_device_test.cpp
@@ -17,18 +17,32 @@
 #include <executorch/runtime/executor/tensor_parser.h>
 
 #include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/device_memory_buffer.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/executor/test/managed_memory_manager.h>
+#include <executorch/runtime/platform/runtime.h>
 #include <executorch/schema/program_generated.h>
 
 #include <gtest/gtest.h>
 
 using executorch::aten::Tensor;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
 using executorch::runtime::Error;
+using executorch::runtime::HierarchicalAllocator;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::MethodMeta;
 using executorch::runtime::Program;
 using executorch::runtime::Result;
+using executorch::runtime::Span;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
 using executorch::runtime::deserialization::parseTensor;
 using executorch::runtime::testing::ManagedMemoryManager;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
 using torch::executor::util::FileDataLoader;
 
 constexpr size_t kDefaultNonConstMemBytes = 32 * 1024U;
@@ -50,8 +64,67 @@ class ProgramTestFriend final {
 
 using executorch::runtime::testing::ProgramTestFriend;
 
+namespace {
+
+/**
+ * Mock CUDA allocator that uses host memory for testing.
+ * Tracks the allocated range so tests can verify tensor data_ptr
+ * falls within the "device" memory region.
+ */
+class MockCudaAllocator : public DeviceAllocator {
+ public:
+  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+    allocate_count_++;
+    buffer_ = std::make_unique<uint8_t[]>(nbytes);
+    buffer_size_ = nbytes;
+    return static_cast<void*>(buffer_.get());
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {
+    deallocate_count_++;
+    buffer_.reset();
+    buffer_size_ = 0;
+  }
+
+  Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return DeviceType::CUDA;
+  }
+
+  bool is_device_ptr(const void* ptr) const {
+    if (buffer_ == nullptr || buffer_size_ == 0) {
+      return false;
+    }
+    auto* p = static_cast<const uint8_t*>(ptr);
+    return p >= buffer_.get() && p < buffer_.get() + buffer_size_;
+  }
+
+  int allocate_count_ = 0;
+  int deallocate_count_ = 0;
+
+ private:
+  std::unique_ptr<uint8_t[]> buffer_;
+  size_t buffer_size_ = 0;
+};
+
+} // namespace
+
+static MockCudaAllocator g_mock_cuda;
+
 class TensorParserDeviceTest : public ::testing::Test {
  protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
+  }
+
   void SetUp() override {
     const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
     ASSERT_NE(path, nullptr)
@@ -59,6 +132,9 @@ class TensorParserDeviceTest : public ::testing::Test {
     Result<FileDataLoader> loader = FileDataLoader::from(path);
     ASSERT_EQ(loader.error(), Error::Ok);
     loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));
+
+    g_mock_cuda.allocate_count_ = 0;
+    g_mock_cuda.deallocate_count_ = 0;
   }
 
   std::unique_ptr<FileDataLoader> loader_;
@@ -169,3 +245,97 @@ TEST_F(TensorParserDeviceTest, NonDelegatedTensorsDefaultToCPU) {
         << " without device annotation should have device_index=0";
   }
 }
+TEST_F(TensorParserDeviceTest, CudaTensorDataPtrPointsToDeviceMemory) {
+  Result<Program> program =
+      Program::load(loader_.get(), Program::Verification::Minimal);
+  ASSERT_EQ(program.error(), Error::Ok);
+
+  Result<MethodMeta> method_meta = program->method_meta("forward");
+  ASSERT_EQ(method_meta.error(), Error::Ok);
+
+  // ModuleAddWithDevice has:
+  //   non_const_buffer_sizes: [0, 48]  (index 0 reserved, buffer 0 = 48 bytes)
+  //   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA}]
+  const size_t num_buffers = method_meta->num_memory_planned_buffers();
+  ASSERT_EQ(num_buffers, 1);
+
+  // Set up device-aware planned memory.
+  std::vector<Span<uint8_t>> planned_spans;
+  std::vector<std::vector<uint8_t>> cpu_buffers;
+  std::vector<DeviceMemoryBuffer> device_buffers;
+
+  for (size_t i = 0; i < num_buffers; ++i) {
+    auto size = method_meta->memory_planned_buffer_size(i);
+    ASSERT_TRUE(size.ok());
+    auto device = method_meta->memory_planned_buffer_device(i);
+    ASSERT_TRUE(device.ok());
+
+    if (device->is_cpu()) {
+      cpu_buffers.emplace_back(size.get());
+      planned_spans.emplace_back(
+          cpu_buffers.back().data(), cpu_buffers.back().size());
+    } else {
+      cpu_buffers.emplace_back(); // empty placeholder
+      auto dmb = DeviceMemoryBuffer::create(
+          size.get(), device->type(), device->index());
+      ASSERT_TRUE(dmb.ok())
+          << "DeviceMemoryBuffer::create failed for buffer " << i;
+      planned_spans.emplace_back(dmb->as_span());
+      device_buffers.push_back(std::move(dmb.get()));
+    }
+  }
+
+  ASSERT_EQ(g_mock_cuda.allocate_count_, 1);
+
+  // Build HierarchicalAllocator with mixed CPU/device spans.
+  HierarchicalAllocator planned_memory(
+      {planned_spans.data(), planned_spans.size()});
+
+  constexpr size_t kMethodAllocBytes = 32 * 1024U;
+  auto method_alloc_pool = std::make_unique<uint8_t[]>(kMethodAllocBytes);
+  MemoryAllocator method_allocator(kMethodAllocBytes, method_alloc_pool.get());
+  MemoryManager memory_manager(&method_allocator, &planned_memory);
+
+  // Parse tensors and verify CUDA tensors have device memory.
+  const executorch_flatbuffer::Program* internal_program =
+      ProgramTestFriend::GetInternalProgram(&program.get());
+  auto* execution_plan =
+      internal_program->execution_plan()->GetMutableObject(0);
+  auto* flatbuffer_values = execution_plan->values();
+
+  int cuda_with_device_memory = 0;
+
+  for (size_t i = 0; i < flatbuffer_values->size(); ++i) {
+    auto* serialization_value = flatbuffer_values->Get(i);
+    if (serialization_value->val_type() !=
+        executorch_flatbuffer::KernelTypes::Tensor) {
+      continue;
+    }
+
+    auto* s_tensor = serialization_value->val_as_Tensor();
+    bool is_cuda = s_tensor->extra_tensor_info() != nullptr &&
+        s_tensor->extra_tensor_info()->device_type() ==
+            executorch_flatbuffer::DeviceType::CUDA;
+
+    Result<Tensor> tensor =
+        parseTensor(&program.get(), &memory_manager, s_tensor);
+    ASSERT_TRUE(tensor.ok())
+        << "parseTensor failed at index " << i
+        << " with error 0x" << std::hex
+        << static_cast<uint32_t>(tensor.error());
+
+    Tensor t = tensor.get();
+
+    if (is_cuda && t.unsafeGetTensorImpl()->device_type() == DeviceType::CUDA) {
+      EXPECT_TRUE(g_mock_cuda.is_device_ptr(t.const_data_ptr()))
+          << "CUDA tensor at index " << i
+          << " should have data_ptr in device memory, but got CPU memory";
+      cuda_with_device_memory++;
+    }
+  }
+
+  // All 3 CUDA tensors (2 inputs + 1 output of the delegate) should have
+  // their data_ptr pointing to the mock device memory buffer.
+  EXPECT_EQ(cuda_with_device_memory, 3)
+      << "All 3 CUDA tensors should have data_ptr in device memory";
+}

From c00365895dfde996d39f58d61e824e29fcfe6a7c Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 24 Mar 2026 15:01:43 -0700
Subject: [PATCH 11/15] [ET Device Support] Module: allocate device memory for
 planned buffers

This diff enables module API loading program memory-planed on non-cpu device. It update Module::load_method() to detect device buffers via MethodMeta and allocate device memory using the registered DeviceAllocator. Device memory is managed via DeviceMemoryBuffer RAII objects stored in PlannedMemory, ensuring proper cleanup when the Method is destroyed.

Differential Revision: [D97850705](https://our.internmc.facebook.com/intern/diff/D97850705/)

[ghstack-poisoned]
---
 extension/module/module.cpp                   |  94 +++++++-
 extension/module/module.h                     |   6 +
 extension/module/targets.bzl                  |   1 +
 .../module/test/module_device_memory_test.cpp | 216 ++++++++++++++++++
 extension/module/test/targets.bzl             |  22 +-
 test/models/targets.bzl                       |   1 +
 6 files changed, 335 insertions(+), 5 deletions(-)
 create mode 100644 extension/module/test/module_device_memory_test.cpp

diff --git a/extension/module/module.cpp b/extension/module/module.cpp
index ec7236276f5..21842938db9 100644
--- a/extension/module/module.cpp
+++ b/extension/module/module.cpp
@@ -13,6 +13,7 @@
 #include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
 #include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
 #include <executorch/extension/named_data_map/merged_data_map.h>
+#include <executorch/runtime/core/device_memory_buffer.h>
 #include <executorch/runtime/platform/runtime.h>
 
 namespace executorch {
@@ -314,6 +315,45 @@ Module::make_planned_memory_with_shared_arenas(
   return planned;
 }
 
+std::unique_ptr<Module::PlannedMemory>
+Module::make_planned_memory_with_devices(
+    const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta) {
+  auto planned = std::make_unique<PlannedMemory>();
+  const size_t num_buffers = method_meta.num_memory_planned_buffers();
+  planned->planned_buffers.reserve(num_buffers);
+  planned->planned_spans.reserve(num_buffers);
+
+  for (size_t i = 0; i < num_buffers; ++i) {
+    auto size = method_meta.memory_planned_buffer_size(i);
+    ET_CHECK_MSG(size.ok(), "Failed to get buffer size for index %zu", i);
+    auto device = method_meta.memory_planned_buffer_device(i);
+    ET_CHECK_MSG(device.ok(), "Failed to get buffer device for index %zu", i);
+
+    if (device->is_cpu()) {
+      planned->planned_buffers.emplace_back(size.get());
+      planned->planned_spans.emplace_back(
+          planned->planned_buffers.back().data(), size.get());
+    } else {
+      // Allocate device memory via DeviceAllocator and store the RAII buffer.
+      planned->planned_buffers.emplace_back(); // empty CPU placeholder
+      auto dmb = runtime::DeviceMemoryBuffer::create(
+          size.get(), device->type(), device->index());
+      ET_CHECK_MSG(
+          dmb.ok(),
+          "Failed to allocate device memory for buffer %zu (device_type=%d)",
+          i,
+          static_cast<int>(device->type()));
+      planned->planned_spans.emplace_back(dmb->as_span());
+      planned->device_buffers.push_back(std::move(dmb.get()));
+    }
+  }
+
+  planned->planned_memory =
+      std::make_unique<runtime::HierarchicalAllocator>(runtime::Span(
+          planned->planned_spans.data(), planned->planned_spans.size()));
+  return planned;
+}
+
 runtime::Result<std::vector<size_t>> Module::get_mem_planned_buffer_sizes(
     const std::string& method_name) {
   auto meta_res = program_->method_meta(method_name.c_str());
@@ -365,10 +405,54 @@ runtime::Error Module::load_method(
     MethodHolder method_holder;
 
     if (!planned_memory) {
-      if (!share_memory_arenas_) {
+      // Check if any buffers need device memory allocation.
+      auto meta_res = program_->method_meta(method_name.c_str());
+      ET_CHECK_OK_OR_RETURN_ERROR(meta_res.error());
+      auto& meta = meta_res.get();
+
+      bool has_device_buffers = false;
+      for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) {
+        auto dev = meta.memory_planned_buffer_device(i);
+        if (dev.ok() && !dev->is_cpu()) {
+          has_device_buffers = true;
+          break;
+        }
+      }
+
+      if (has_device_buffers) {
+        // Device memory with shared arenas is not yet supported.
+        ET_CHECK_OR_RETURN_ERROR(
+            !share_memory_arenas_,
+            NotSupported,
+            "Device memory buffers are not yet compatible with "
+            "share_memory_arenas. Please disable share_memory_arenas "
+            "when using models with device-planned memory.");
+
+        // Device-aware path: allocate CPU and device buffers, build metadata.
+        method_holder.planned_memory =
+            make_planned_memory_with_devices(meta);
+
+        // Build per-buffer device type array for MemoryManager metadata.
+        for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) {
+          auto dev = meta.memory_planned_buffer_device(i);
+          method_holder.buffer_devices.push_back(
+              dev.ok() ? dev->type()
+                       : runtime::etensor::DeviceType::CPU);
+        }
+        planned_memory = method_holder.planned_memory->planned_memory.get();
+
+        method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
+            memory_allocator_.get(),
+            planned_memory,
+            temp_allocator_.get(),
+            runtime::Span<const runtime::etensor::DeviceType>(
+                method_holder.buffer_devices.data(),
+                method_holder.buffer_devices.size()));
+      } else if (!share_memory_arenas_) {
         auto sizes_res = get_mem_planned_buffer_sizes(method_name);
         ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
         method_holder.planned_memory = make_planned_memory(sizes_res.get());
+        planned_memory = method_holder.planned_memory->planned_memory.get();
       } else {
         auto sizes_res = get_mem_planned_buffer_sizes(method_name);
         ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
@@ -385,12 +469,14 @@ runtime::Error Module::load_method(
         }
         method_holder.planned_memory =
             make_planned_memory_with_shared_arenas(sizes, shared_arenas_);
+        planned_memory = method_holder.planned_memory->planned_memory.get();
       }
-      planned_memory = method_holder.planned_memory->planned_memory.get();
     }
 
-    method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
-        memory_allocator_.get(), planned_memory, temp_allocator_.get());
+    if (!method_holder.memory_manager) {
+      method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
+          memory_allocator_.get(), planned_memory, temp_allocator_.get());
+    }
     auto res_method = program_->load_method(
         method_name.c_str(),
         method_holder.memory_manager.get(),
diff --git a/extension/module/module.h b/extension/module/module.h
index 08a68b2676b..4ae494eff0b 100644
--- a/extension/module/module.h
+++ b/extension/module/module.h
@@ -16,6 +16,8 @@
 
 #include <executorch/runtime/executor/program.h>
 
+#include <executorch/runtime/core/device_memory_buffer.h>
+
 #ifdef USE_ATEN_LIB
 #define ET_MODULE_NAMESPACE module::aten
 #else // !USE_ATEN_LIB
@@ -682,12 +684,15 @@ class Module {
     std::vector<std::vector<uint8_t>> planned_buffers;
     std::vector<runtime::Span<uint8_t>> planned_spans;
     std::unique_ptr<runtime::HierarchicalAllocator> planned_memory;
+    std::vector<runtime::DeviceMemoryBuffer> device_buffers;
   };
   std::unique_ptr<PlannedMemory> make_planned_memory(
       const std::vector<size_t>& buffer_sizes);
   std::unique_ptr<PlannedMemory> make_planned_memory_with_shared_arenas(
       const std::vector<size_t>& buffer_sizes,
       std::vector<std::vector<uint8_t>>& shared_arenas);
+  std::unique_ptr<PlannedMemory> make_planned_memory_with_devices(
+      const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta);
   runtime::Result<std::vector<size_t>> get_mem_planned_buffer_sizes(
       const std::string& method_name);
   runtime::Result<std::vector<size_t>> get_max_mem_planned_buffer_sizes();
@@ -696,6 +701,7 @@ class Module {
     std::unique_ptr<PlannedMemory> planned_memory;
     std::unique_ptr<runtime::MemoryManager> memory_manager;
     std::unique_ptr<Method> method;
+    std::vector<runtime::etensor::DeviceType> buffer_devices;
   };
 
   std::string file_path_;
diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl
index 6d60429bc51..03c50498bbc 100644
--- a/extension/module/targets.bzl
+++ b/extension/module/targets.bzl
@@ -28,6 +28,7 @@ def define_common_targets():
             ],
             exported_deps = [
                 "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
+                "//executorch/runtime/core:device_memory_buffer",
             ],
         )
 
diff --git a/extension/module/test/module_device_memory_test.cpp b/extension/module/test/module_device_memory_test.cpp
new file mode 100644
index 00000000000..39d0e2ab2ed
--- /dev/null
+++ b/extension/module/test/module_device_memory_test.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * Tests that Module's device-aware memory allocation path works correctly.
+ *
+ * Uses ModuleAddWithDevice.pte which has:
+ *   non_const_buffer_sizes: [0, 48]  (1 buffer, index 0 reserved)
+ *   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}]
+ *
+ * Since we don't have a real CUDA backend, we test that:
+ * 1. CPU-only models load through Module without invoking device allocator
+ * 2. Device-annotated models trigger DeviceMemoryBuffer::create via a mock
+ */
+
+#include <executorch/extension/module/module.h>
+
+#include <gtest/gtest.h>
+
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/device_memory_buffer.h>
+#include <executorch/runtime/platform/runtime.h>
+
+using executorch::extension::Module;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+namespace {
+
+class MockCudaAllocator : public DeviceAllocator {
+ public:
+  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+    allocate_count_++;
+    last_allocate_size_ = nbytes;
+    last_allocate_index_ = index;
+    buffer_ = std::make_unique<uint8_t[]>(nbytes);
+    return static_cast<void*>(buffer_.get());
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {
+    deallocate_count_++;
+    buffer_.reset();
+  }
+
+  Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return DeviceType::CUDA;
+  }
+
+  int allocate_count_ = 0;
+  int deallocate_count_ = 0;
+  size_t last_allocate_size_ = 0;
+  DeviceIndex last_allocate_index_ = -1;
+
+ private:
+  std::unique_ptr<uint8_t[]> buffer_;
+};
+
+} // namespace
+
+static MockCudaAllocator g_mock_cuda;
+
+class ModuleDeviceMemoryTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
+  }
+
+  void SetUp() override {
+    g_mock_cuda.allocate_count_ = 0;
+    g_mock_cuda.deallocate_count_ = 0;
+    g_mock_cuda.last_allocate_size_ = 0;
+    g_mock_cuda.last_allocate_index_ = -1;
+  }
+};
+
+TEST_F(ModuleDeviceMemoryTest, CpuOnlyModelDoesNotAllocateDeviceMemory) {
+  const char* path = std::getenv("ET_MODULE_ADD_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_PATH not set";
+
+  Module module(path);
+  auto err = module.load_method("forward");
+  ASSERT_EQ(err, Error::Ok);
+
+  EXPECT_EQ(g_mock_cuda.allocate_count_, 0)
+      << "CPU-only model should not allocate device memory";
+}
+
+TEST_F(ModuleDeviceMemoryTest, DeviceMemoryBufferCreateCallsAllocator) {
+  // Directly test DeviceMemoryBuffer::create with the registered mock.
+  // This verifies the RAII allocation/deallocation path that Module uses.
+  {
+    auto result = DeviceMemoryBuffer::create(48, DeviceType::CUDA, 0);
+    ASSERT_TRUE(result.ok());
+    auto buf = std::move(result.get());
+
+    EXPECT_EQ(g_mock_cuda.allocate_count_, 1);
+    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48);
+    EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0);
+    EXPECT_NE(buf.data(), nullptr);
+    EXPECT_EQ(buf.size(), 48);
+
+    // as_span() wraps the device pointer for HierarchicalAllocator.
+    auto span = buf.as_span();
+    EXPECT_EQ(span.data(), static_cast<uint8_t*>(buf.data()));
+    EXPECT_EQ(span.size(), 48);
+
+    EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+  }
+  // RAII deallocation on scope exit.
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1);
+}
+
+TEST_F(ModuleDeviceMemoryTest, DeviceModelMethodMetaReportsCudaBuffer) {
+  // Verify MethodMeta reports the correct device for buffers in the
+  // device-annotated model, without needing to load the full method.
+  const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+  Module module(path);
+  auto err = module.load();
+  ASSERT_EQ(err, Error::Ok);
+
+  auto meta = module.method_meta("forward");
+  ASSERT_TRUE(meta.ok());
+
+  // ModuleAddWithDevice has 1 planned buffer (48 bytes) on CUDA.
+  ASSERT_EQ(meta->num_memory_planned_buffers(), 1);
+
+  auto size = meta->memory_planned_buffer_size(0);
+  ASSERT_TRUE(size.ok());
+  EXPECT_EQ(size.get(), 48);
+
+  auto device = meta->memory_planned_buffer_device(0);
+  ASSERT_TRUE(device.ok());
+  EXPECT_EQ(device->type(), DeviceType::CUDA);
+  EXPECT_EQ(device->index(), 0);
+}
+
+TEST_F(
+    ModuleDeviceMemoryTest,
+    DeviceModelWithSharedArenasReturnsNotSupported) {
+  const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+  // share_memory_arenas = true with a device-annotated model should fail.
+  Module module(
+      path,
+      Module::LoadMode::File,
+      /*event_tracer=*/nullptr,
+      /*memory_allocator=*/nullptr,
+      /*temp_allocator=*/nullptr,
+      /*share_memory_arenas=*/true);
+
+  auto err = module.load_method("forward");
+  EXPECT_EQ(err, Error::NotSupported);
+}
+
+TEST_F(
+    ModuleDeviceMemoryTest,
+    LoadMethodAllocatesDeviceMemoryAndDeallocatesOnDestroy) {
+  const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+  {
+    Module module(path);
+    auto err = module.load_method("forward");
+
+    // Regardless of whether load_method succeeds or fails (e.g. due to
+    // backend init issues), the device-aware memory allocation path
+    // (make_planned_memory_with_devices) runs BEFORE backend init.
+    EXPECT_EQ(g_mock_cuda.allocate_count_, 1)
+        << "Expected 1 device allocation for the CUDA buffer"
+        << " (actual: " << g_mock_cuda.allocate_count_ << ")"
+        << ", deallocate_count=" << g_mock_cuda.deallocate_count_
+        << ", load_method returned error=" << static_cast<int>(err);
+    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48)
+        << "Expected 48 bytes allocated (3 CUDA tensors sharing one buffer)";
+    EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0)
+        << "Expected device_index=0 (cuda:0)";
+
+    if (err == Error::Ok) {
+      // Success path: MethodHolder moved into methods_ map.
+      // DeviceMemoryBuffer is alive as long as Module is alive.
+      EXPECT_EQ(g_mock_cuda.deallocate_count_, 0)
+          << "No deallocation while method is loaded";
+    } else {
+      // Error path: local MethodHolder destroyed on return from load_method.
+      // RAII deallocation already happened.
+      EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
+          << "RAII deallocation on error path";
+    }
+  }
+
+  // After Module destroyed, all device memory must be freed.
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
+      << "Expected deallocation after Module destroyed";
+}
diff --git a/extension/module/test/targets.bzl b/extension/module/test/targets.bzl
index f0d7e449efd..4dc3fb537f3 100644
--- a/extension/module/test/targets.bzl
+++ b/extension/module/test/targets.bzl
@@ -28,7 +28,7 @@ def define_common_targets(is_fbcode=False):
             aten_suffix = ("_aten" if aten_mode else "")
 
             runtime.cxx_test(
-                name = "test" + aten_suffix,
+                name = "module_test" + aten_suffix,
                 srcs = [
                     "module_test.cpp",
                 ],
@@ -68,6 +68,26 @@ def define_common_targets(is_fbcode=False):
                 ],
             )
 
+            runtime.cxx_test(
+                name = "module_device_memory_test" + aten_suffix,
+                srcs = [
+                    "module_device_memory_test.cpp",
+                ],
+                deps = [
+                    "//executorch/kernels/portable:generated_lib" + aten_suffix,
+                    "//executorch/extension/module:module" + aten_suffix,
+                    "//executorch/runtime/core:device_allocator",
+                    "//executorch/runtime/core:device_memory_buffer",
+                ],
+                env = {
+                    "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])",
+                    "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])",
+                },
+                compiler_flags = [
+                    "-Wno-error=deprecated-declarations",
+                ],
+            )
+
     runtime.filegroup(
         name = "resources",
         srcs = native.glob([
diff --git a/test/models/targets.bzl b/test/models/targets.bzl
index c9fb67b7d31..a80244b1383 100644
--- a/test/models/targets.bzl
+++ b/test/models/targets.bzl
@@ -226,6 +226,7 @@ def define_common_targets():
         default_outs = ["."],
         visibility = [
             "//executorch/runtime/executor/test/...",
+            "//executorch/extension/module/test/...",
         ],
     )
 

From 0fcfa83ea22790b251e134f5878e2b53b30747d3 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 24 Mar 2026 15:01:49 -0700
Subject: [PATCH 12/15] [ET Device Support] CudaAllocator: device memory
 allocator for CUDA backend

Implement CudaAllocator, a concrete DeviceAllocator using cudaMalloc/ cudaFree/cudaMemcpy for CUDA device memory management. The allocator is automatically registered with the DeviceAllocatorRegistry when the CUDA backend library is linked, ensuring DeviceMemoryBuffer::create(CUDA) works transparently.

Differential Revision: [D98014184](https://our.internmc.facebook.com/intern/diff/D98014184/)

[ghstack-poisoned]
---
 backends/aoti/slim/core/storage.h        |  44 ++---
 backends/aoti/slim/core/targets.bzl      |   1 +
 backends/cuda/runtime/TARGETS            |  29 +++
 backends/cuda/runtime/cuda_allocator.cpp | 213 +++++++++++++++++++++++
 backends/cuda/runtime/cuda_allocator.h   |  84 +++++++++
 backends/cuda/runtime/cuda_backend.cpp   |  10 ++
 6 files changed, 351 insertions(+), 30 deletions(-)
 create mode 100644 backends/cuda/runtime/cuda_allocator.cpp
 create mode 100644 backends/cuda/runtime/cuda_allocator.h

diff --git a/backends/aoti/slim/core/storage.h b/backends/aoti/slim/core/storage.h
index 73c4d32d955..a3d17a89903 100644
--- a/backends/aoti/slim/core/storage.h
+++ b/backends/aoti/slim/core/storage.h
@@ -13,6 +13,7 @@
 #ifdef CUDA_AVAILABLE
 #include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
 #include <executorch/backends/aoti/slim/cuda/guard.h>
+#include <executorch/backends/cuda/runtime/cuda_allocator.h>
 #endif
 
 #include <executorch/backends/aoti/slim/c10/core/Device.h>
@@ -107,9 +108,6 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
   /// @param device The target CUDA device (used to get the stream).
   /// @return Pointer to allocated device memory.
   static void* allocate(size_t nbytes, const c10::Device& device) {
-    // Get the current stream for this device (set by CUDAStreamGuard if any)
-    // This follows PyTorch's pattern where the allocator assumes the caller
-    // has already set the correct device via CUDAStreamGuard.
     auto stream_result =
         executorch::backends::cuda::getCurrentCUDAStream(device.index());
     ET_CHECK_MSG(
@@ -118,31 +116,23 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
         static_cast<int>(device.index()));
 
     cudaStream_t stream = stream_result.get();
-    void* data = nullptr;
-    ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream));
-    return data;
+    auto result = executorch::backends::cuda::CudaAllocator::allocate_async(
+        nbytes, device.index(), stream);
+    ET_CHECK_MSG(
+        result.ok(),
+        "CudaAllocator::allocate_async failed for %zu bytes on device %d",
+        nbytes,
+        static_cast<int>(device.index()));
+    return result.get();
   }
 
-  /// Frees CUDA device memory on the current stream.
-  /// @param ptr Pointer to device memory to free.
   static void free(void* ptr) {
-    // Get the current stream for the current device
-    // Currently all cuda slimtensors should be on the same device same stream,
-    // so we can just use the stream on current device.
-    // TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to
-    // support multiple devices.
     auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
     ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream");
-    ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
+    executorch::backends::cuda::CudaAllocator::deallocate_async(
+        ptr, -1, stream_result.get());
   }
 
-  /// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously.
-  /// @param dst Destination pointer.
-  /// @param src Source pointer.
-  /// @param nbytes Number of bytes to copy.
-  /// @param dst_device Destination device.
-  /// @param src_device Source device.
-  /// @param stream CUDA stream for async copy.
   static void memcpy_async(
       void* dst,
       const void* src,
@@ -151,7 +141,6 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
       const c10::Device& src_device,
       cudaStream_t stream) {
     cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
-
     if (src_device.is_cpu()) {
       direction = cudaMemcpyHostToDevice;
     } else if (dst_device.is_cpu()) {
@@ -164,15 +153,11 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
           static_cast<int>(dst_device.index()));
     }
 
-    ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream));
+    auto err = executorch::backends::cuda::CudaAllocator::memcpy_async(
+        dst, src, nbytes, direction, stream);
+    ET_CHECK_MSG(err == executorch::runtime::Error::Ok, "memcpy_async failed");
   }
 
-  /// Copies memory between CPU and CUDA or CUDA and CUDA synchronously.
-  /// @param dst Destination pointer.
-  /// @param src Source pointer.
-  /// @param nbytes Number of bytes to copy.
-  /// @param dst_device Destination device.
-  /// @param src_device Source device.
   static void memcpy(
       void* dst,
       const void* src,
@@ -180,7 +165,6 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
       const c10::Device& dst_device,
       const c10::Device& src_device) {
     cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
-
     if (src_device.is_cpu()) {
       direction = cudaMemcpyHostToDevice;
     } else if (dst_device.is_cpu()) {
diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl
index b9148305c91..42a7b79da6e 100644
--- a/backends/aoti/slim/core/targets.bzl
+++ b/backends/aoti/slim/core/targets.bzl
@@ -19,6 +19,7 @@ def define_common_targets():
             "//executorch/runtime/platform:platform",
             "//executorch/backends/aoti/slim/c10/cuda:exception",
             "//executorch/backends/aoti/slim/cuda:guard",
+            "//executorch/backends/cuda/runtime:cuda_allocator",
         ],
     )
 
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
index 9c07b732735..8d0710aaadc 100644
--- a/backends/cuda/runtime/TARGETS
+++ b/backends/cuda/runtime/TARGETS
@@ -70,6 +70,33 @@ runtime.cxx_library(
     ],
 )
 
+runtime.cxx_library(
+    name = "cuda_allocator",
+    srcs = [
+        "cuda_allocator.cpp",
+    ],
+    headers = [
+        "cuda_allocator.h",
+    ],
+    # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+    link_whole = True,
+    supports_python_dlopen = True,
+    visibility = ["PUBLIC"],
+    exported_deps = [
+        "//executorch/runtime/core:device_allocator",
+    ],
+    deps = [
+        "//executorch/runtime/platform:platform",
+    ],
+    nvcc_flags = get_nvcc_arch_args() + [
+        "-_NVCC_HOST_COMPILER_FLAG_",
+        "gcc",
+    ],
+    external_deps = [
+        ("cuda", None, "cuda-lazy"),
+    ],
+)
+
 runtime.cxx_library(
     name = "cuda_backend",
     srcs = [
@@ -87,6 +114,8 @@ runtime.cxx_library(
     visibility = ["PUBLIC"],
     deps = [
         ":runtime_shims",
+        ":cuda_allocator",
+        ":cuda_platform",
         "//executorch/backends/aoti:aoti_common_slim",
         "//executorch/backends/aoti/slim/core:slimtensor",
         "//executorch/backends/aoti/slim/factory:empty",
diff --git a/backends/cuda/runtime/cuda_allocator.cpp b/backends/cuda/runtime/cuda_allocator.cpp
new file mode 100644
index 00000000000..1e0d0c29dfd
--- /dev/null
+++ b/backends/cuda/runtime/cuda_allocator.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cuda/runtime/cuda_allocator.h>
+
+#include <cuda_runtime.h>
+
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch::backends::cuda {
+
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+Result<void*> CudaAllocator::allocate(size_t nbytes, DeviceIndex index) {
+  void* ptr = nullptr;
+  cudaError_t prev_device_err = cudaSuccess;
+  int prev_device = 0;
+
+  if (index >= 0) {
+    prev_device_err = cudaGetDevice(&prev_device);
+    if (prev_device_err == cudaSuccess) {
+      cudaSetDevice(index);
+    }
+  }
+
+  cudaError_t err = cudaMalloc(&ptr, nbytes);
+
+  if (index >= 0 && prev_device_err == cudaSuccess) {
+    cudaSetDevice(prev_device);
+  }
+
+  if (err != cudaSuccess) {
+    ET_LOG(
+        Error,
+        "cudaMalloc failed: %s (requested %zu bytes on device %d)",
+        cudaGetErrorString(err),
+        nbytes,
+        static_cast<int>(index));
+    return Error::MemoryAllocationFailed;
+  }
+
+  return ptr;
+}
+
+void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
+  if (ptr == nullptr) {
+    return;
+  }
+
+  int prev_device = 0;
+  cudaError_t prev_device_err = cudaSuccess;
+
+  if (index >= 0) {
+    prev_device_err = cudaGetDevice(&prev_device);
+    if (prev_device_err == cudaSuccess) {
+      cudaSetDevice(index);
+    }
+  }
+
+  cudaError_t err = cudaFree(ptr);
+
+  if (index >= 0 && prev_device_err == cudaSuccess) {
+    cudaSetDevice(prev_device);
+  }
+
+  if (err != cudaSuccess) {
+    ET_LOG(
+        Error,
+        "cudaFree failed: %s (ptr=%p, device %d)",
+        cudaGetErrorString(err),
+        ptr,
+        static_cast<int>(index));
+  }
+}
+
+Error CudaAllocator::copy_host_to_device(
+    void* dst,
+    const void* src,
+    size_t nbytes,
+    DeviceIndex index) {
+  int prev_device = 0;
+  cudaError_t prev_device_err = cudaSuccess;
+
+  if (index >= 0) {
+    prev_device_err = cudaGetDevice(&prev_device);
+    if (prev_device_err == cudaSuccess) {
+      cudaSetDevice(index);
+    }
+  }
+
+  cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
+
+  if (index >= 0 && prev_device_err == cudaSuccess) {
+    cudaSetDevice(prev_device);
+  }
+
+  if (err != cudaSuccess) {
+    ET_LOG(
+        Error,
+        "cudaMemcpy H2D failed: %s (%zu bytes, device %d)",
+        cudaGetErrorString(err),
+        nbytes,
+        static_cast<int>(index));
+    return Error::Internal;
+  }
+  return Error::Ok;
+}
+
+Error CudaAllocator::copy_device_to_host(
+    void* dst,
+    const void* src,
+    size_t nbytes,
+    DeviceIndex index) {
+  int prev_device = 0;
+  cudaError_t prev_device_err = cudaSuccess;
+
+  if (index >= 0) {
+    prev_device_err = cudaGetDevice(&prev_device);
+    if (prev_device_err == cudaSuccess) {
+      cudaSetDevice(index);
+    }
+  }
+
+  cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
+
+  if (index >= 0 && prev_device_err == cudaSuccess) {
+    cudaSetDevice(prev_device);
+  }
+
+  if (err != cudaSuccess) {
+    ET_LOG(
+        Error,
+        "cudaMemcpy D2H failed: %s (%zu bytes, device %d)",
+        cudaGetErrorString(err),
+        nbytes,
+        static_cast<int>(index));
+    return Error::Internal;
+  }
+  return Error::Ok;
+}
+
+DeviceType CudaAllocator::device_type() const {
+  return DeviceType::CUDA;
+}
+
+CudaAllocator& CudaAllocator::instance() {
+  static CudaAllocator allocator;
+  return allocator;
+}
+
+Result<void*> CudaAllocator::allocate_async(
+    size_t nbytes,
+    DeviceIndex index,
+    cudaStream_t stream) {
+  void* ptr = nullptr;
+  cudaError_t err = cudaMallocAsync(&ptr, nbytes, stream);
+  if (err != cudaSuccess) {
+    ET_LOG(
+        Error,
+        "cudaMallocAsync failed: %s (requested %zu bytes on device %d)",
+        cudaGetErrorString(err),
+        nbytes,
+        static_cast<int>(index));
+    return Error::MemoryAllocationFailed;
+  }
+  return ptr;
+}
+
+void CudaAllocator::deallocate_async(
+    void* ptr,
+    DeviceIndex index,
+    cudaStream_t stream) {
+  if (ptr == nullptr) {
+    return;
+  }
+  cudaError_t err = cudaFreeAsync(ptr, stream);
+  if (err != cudaSuccess) {
+    ET_LOG(
+        Error,
+        "cudaFreeAsync failed: %s (ptr=%p, device %d)",
+        cudaGetErrorString(err),
+        ptr,
+        static_cast<int>(index));
+  }
+}
+
+Error CudaAllocator::memcpy_async(
+    void* dst,
+    const void* src,
+    size_t nbytes,
+    cudaMemcpyKind direction,
+    cudaStream_t stream) {
+  cudaError_t err = cudaMemcpyAsync(dst, src, nbytes, direction, stream);
+  if (err != cudaSuccess) {
+    ET_LOG(
+        Error,
+        "cudaMemcpyAsync failed: %s (%zu bytes)",
+        cudaGetErrorString(err),
+        nbytes);
+    return Error::Internal;
+  }
+  return Error::Ok;
+}
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/cuda_allocator.h b/backends/cuda/runtime/cuda_allocator.h
new file mode 100644
index 00000000000..3d2674a0e01
--- /dev/null
+++ b/backends/cuda/runtime/cuda_allocator.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <executorch/runtime/core/device_allocator.h>
+
+namespace executorch::backends::cuda {
+
+/**
+ * CUDA implementation of DeviceAllocator.
+ *
+ * Uses cudaMalloc/cudaFree for allocation and cudaMemcpy for host-device
+ * transfers. This allocator is automatically registered as a singleton
+ * with the DeviceAllocatorRegistry when the CUDA backend library is linked.
+ *
+ * All CUDA memory operations in the CUDA backend should go through this
+ * allocator for consistent memory management.
+ */
+class CudaAllocator final : public executorch::runtime::DeviceAllocator {
+ public:
+  executorch::runtime::Result<void*> allocate(
+      size_t nbytes,
+      executorch::runtime::etensor::DeviceIndex index) override;
+
+  void deallocate(
+      void* ptr,
+      executorch::runtime::etensor::DeviceIndex index) override;
+
+  executorch::runtime::Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      executorch::runtime::etensor::DeviceIndex index) override;
+
+  executorch::runtime::Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      executorch::runtime::etensor::DeviceIndex index) override;
+
+  executorch::runtime::etensor::DeviceType device_type() const override;
+
+  /// Returns the global CudaAllocator singleton.
+  static CudaAllocator& instance();
+
+  // --- Async (stream-based) operations for SlimTensor/Storage layer ---
+
+  /**
+   * Allocate device memory asynchronously on the given CUDA stream.
+   */
+  static executorch::runtime::Result<void*> allocate_async(
+      size_t nbytes,
+      executorch::runtime::etensor::DeviceIndex index,
+      cudaStream_t stream);
+
+  /**
+   * Deallocate device memory asynchronously on the given CUDA stream.
+   */
+  static void deallocate_async(
+      void* ptr,
+      executorch::runtime::etensor::DeviceIndex index,
+      cudaStream_t stream);
+
+  /**
+   * Copy memory asynchronously on the given CUDA stream.
+   * Supports H2D, D2H, and D2D based on src/dst device types.
+   */
+  static executorch::runtime::Error memcpy_async(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      cudaMemcpyKind direction,
+      cudaStream_t stream);
+};
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 39071f731a1..1a62c35bbb4 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -38,6 +38,7 @@
 // Include our shim layer headers
 #include <executorch/backends/aoti/aoti_delegate_handle.h>
 #include <executorch/backends/cuda/runtime/cuda_delegate_handle.h>
+#include <executorch/backends/cuda/runtime/cuda_allocator.h>
 #include <executorch/backends/cuda/runtime/platform/platform.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/utils.h>
@@ -637,5 +638,14 @@ auto cls = cuda::CudaBackend();
 executorch::runtime::Backend backend{"CudaBackend", &cls};
 static executorch::runtime::Error success_with_compiler =
     register_backend(backend);
+
+// Auto-register the CudaAllocator so that DeviceMemoryBuffer::create(CUDA)
+// works whenever the CUDA backend library is linked.
+static bool cuda_allocator_registered = [] {
+  executorch::runtime::register_device_allocator(
+      executorch::runtime::etensor::DeviceType::CUDA,
+      &cuda::CudaAllocator::instance());
+  return true;
+}();
 } // namespace
 } // namespace executorch::backends

From 551e2ec555972a0b9b4dc3b1c9aa43b053fffea1 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Mon, 18 May 2026 17:25:50 -0700
Subject: [PATCH 13/15] Update (base update)

[ghstack-poisoned]
---
 runtime/executor/test/tensor_parser_device_test.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/runtime/executor/test/tensor_parser_device_test.cpp b/runtime/executor/test/tensor_parser_device_test.cpp
index 918e564e093..3cd5570b42b 100644
--- a/runtime/executor/test/tensor_parser_device_test.cpp
+++ b/runtime/executor/test/tensor_parser_device_test.cpp
@@ -73,7 +73,12 @@ namespace {
  */
 class MockCudaAllocator : public DeviceAllocator {
  public:
-  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+  Result<void*> allocate(
+      size_t nbytes,
+      DeviceIndex index,
+      size_t alignement = kDefaultAlignment) override {
+    (void)alignement;
+    (void)index;
     allocate_count_++;
     buffer_ = std::make_unique<uint8_t[]>(nbytes);
     buffer_size_ = nbytes;

From 6dc7ee4903fbd159c6452a8acdc28d767da20974 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Thu, 21 May 2026 16:57:41 -0700
Subject: [PATCH 14/15] Update (base update)

[ghstack-poisoned]
---
 shim_et/xplat/executorch/build/build_variables.bzl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl
index b0545b8ce18..659a128994f 100644
--- a/shim_et/xplat/executorch/build/build_variables.bzl
+++ b/shim_et/xplat/executorch/build/build_variables.bzl
@@ -50,6 +50,8 @@ PLATFORM_SRCS = [
 
 EXECUTORCH_CORE_SRCS = sorted([
     "runtime/backend/interface.cpp",
+    "runtime/core/device_allocator.cpp",
+    "runtime/core/device_memory_buffer.cpp",
     "runtime/core/evalue.cpp",
     "runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp",
     "runtime/core/exec_aten/util/tensor_util_portable.cpp",

From 722c19d1e2f2a301048ad68180071271e0223cbe Mon Sep 17 00:00:00 2001
From: Gasoonjia <gasoonjia@icloud.com>
Date: Fri, 22 May 2026 17:54:40 -0700
Subject: [PATCH 15/15] [ET Device Support] Define AOT device copy ops registry

Differential Revision: D99636779

Pull Request resolved: https://github.com/pytorch/executorch/pull/18728
---
 exir/passes/BUCK                         |  8 +++
 exir/passes/_device_copy_ops_registry.py | 58 +++++++++++++++++++
 exir/tests/TARGETS                       | 11 ++++
 exir/tests/test_device_copy_ops.py       | 73 ++++++++++++++++++++++++
 4 files changed, 150 insertions(+)
 create mode 100644 exir/passes/_device_copy_ops_registry.py
 create mode 100644 exir/tests/test_device_copy_ops.py

diff --git a/exir/passes/BUCK b/exir/passes/BUCK
index 954f1cfdb4f..4647388b388 100644
--- a/exir/passes/BUCK
+++ b/exir/passes/BUCK
@@ -381,6 +381,14 @@ fbcode_target(_kind = runtime.python_library,
     ],
 )
 
+fbcode_target(_kind = runtime.python_library,
+    name = "device_copy_ops_registry",
+    srcs = ["_device_copy_ops_registry.py"],
+    deps = [
+        "//caffe2:torch",
+    ],
+)
+
 fbcode_target(_kind = runtime.python_library,
     name = "memory_format_ops_pass",
     srcs = [
diff --git a/exir/passes/_device_copy_ops_registry.py b/exir/passes/_device_copy_ops_registry.py
new file mode 100644
index 00000000000..a62b88d4234
--- /dev/null
+++ b/exir/passes/_device_copy_ops_registry.py
@@ -0,0 +1,58 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Registry for device copy ops used to insert explicit H2D (host-to-device)
+and D2H (device-to-host) data transfer operations at delegate boundaries.
+
+These ops are inserted by PropagateDevicePass when enable_non_cpu_memory_planning
+is True, making the graph functional by explicitly transferring data between
+CPU and device memory.
+
+Follows the same registration pattern as dim_order_ops_registry.py.
+"""
+
+import torch
+from torch.library import impl, Library
+
+lib = Library("et_copy", "DEF")
+
+# _h2d_copy: copies a CPU tensor to device memory.
+# At tracing time, this is a clone (both on CPU). At runtime, the out tensor
+# is memory-planned on device, and the kernel calls
+# DeviceAllocator::copy_host_to_device.
+lib.define("_h2d_copy(Tensor self) -> Tensor")
+lib.define("_h2d_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+
+# _d2h_copy: copies a device tensor to CPU memory.
+# At tracing time, this is a clone (both on CPU). At runtime, the self tensor
+# has device memory, and the kernel calls DeviceAllocator::copy_device_to_host.
+lib.define("_d2h_copy(Tensor self) -> Tensor")
+lib.define("_d2h_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+
+
+@impl(lib, "_h2d_copy", "CompositeImplicitAutograd")
+def _h2d_copy_impl(self: torch.Tensor) -> torch.Tensor:
+    # During tracing, both tensors are on CPU. Just clone to represent the transfer.
+    return self.clone()
+
+
+@impl(lib, "_h2d_copy.out", "CompositeImplicitAutograd")
+def _h2d_copy_out_impl(self: torch.Tensor, *, out: torch.Tensor) -> torch.Tensor:
+    out.copy_(self)
+    return out
+
+
+@impl(lib, "_d2h_copy", "CompositeImplicitAutograd")
+def _d2h_copy_impl(self: torch.Tensor) -> torch.Tensor:
+    # During tracing, both tensors are on CPU. Just clone to represent the transfer.
+    return self.clone()
+
+
+@impl(lib, "_d2h_copy.out", "CompositeImplicitAutograd")
+def _d2h_copy_out_impl(self: torch.Tensor, *, out: torch.Tensor) -> torch.Tensor:
+    out.copy_(self)
+    return out
diff --git a/exir/tests/TARGETS b/exir/tests/TARGETS
index 322f72c870a..21493a69644 100644
--- a/exir/tests/TARGETS
+++ b/exir/tests/TARGETS
@@ -504,3 +504,14 @@ python_unittest(
         "//executorch/exir/passes:propagate_device_pass",
     ],
 )
+
+python_unittest(
+    name = "device_copy_ops",
+    srcs = [
+        "test_device_copy_ops.py",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/exir/passes:device_copy_ops_registry",
+    ],
+)
diff --git a/exir/tests/test_device_copy_ops.py b/exir/tests/test_device_copy_ops.py
new file mode 100644
index 00000000000..805159d9d81
--- /dev/null
+++ b/exir/tests/test_device_copy_ops.py
@@ -0,0 +1,73 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+# Import the registry to register the ops
+import executorch.exir.passes._device_copy_ops_registry  # noqa: F401
+
+import torch
+
+
+class DeviceCopyOpsRegistryTest(unittest.TestCase):
+    """Tests that et_copy._h2d_copy and et_copy._d2h_copy ops are correctly
+    registered and produce expected outputs during tracing (CPU-only)."""
+
+    def test_h2d_copy_functional(self):
+        """_h2d_copy should return a clone of the input tensor."""
+        x = torch.randn(2, 3)
+        result = torch.ops.et_copy._h2d_copy(x)
+        self.assertEqual(result.shape, x.shape)
+        self.assertEqual(result.dtype, x.dtype)
+        self.assertTrue(torch.equal(result, x))
+        # Should be a new tensor, not the same object
+        self.assertFalse(result.data_ptr() == x.data_ptr())
+
+    def test_d2h_copy_functional(self):
+        """_d2h_copy should return a clone of the input tensor."""
+        x = torch.randn(4, 5)
+        result = torch.ops.et_copy._d2h_copy(x)
+        self.assertEqual(result.shape, x.shape)
+        self.assertEqual(result.dtype, x.dtype)
+        self.assertTrue(torch.equal(result, x))
+        self.assertFalse(result.data_ptr() == x.data_ptr())
+
+    def test_h2d_copy_out_variant(self):
+        """_h2d_copy.out should copy data into the provided out tensor."""
+        x = torch.randn(3, 3)
+        out = torch.empty(3, 3)
+        result = torch.ops.et_copy._h2d_copy.out(x, out=out)
+        self.assertTrue(result is out)
+        self.assertTrue(torch.equal(out, x))
+
+    def test_d2h_copy_out_variant(self):
+        """_d2h_copy.out should copy data into the provided out tensor."""
+        x = torch.randn(2, 4)
+        out = torch.empty(2, 4)
+        result = torch.ops.et_copy._d2h_copy.out(x, out=out)
+        self.assertTrue(result is out)
+        self.assertTrue(torch.equal(out, x))
+
+    def test_h2d_copy_preserves_dtype(self):
+        """_h2d_copy should work with various dtypes."""
+        for dtype in [torch.float32, torch.float16, torch.int32, torch.int64]:
+            x = torch.ones(2, 2, dtype=dtype)
+            result = torch.ops.et_copy._h2d_copy(x)
+            self.assertEqual(result.dtype, dtype)
+            self.assertTrue(torch.equal(result, x))
+
+    def test_h2d_copy_scalar_tensor(self):
+        """_h2d_copy should handle 0-dim tensors."""
+        x = torch.tensor(3.14)
+        result = torch.ops.et_copy._h2d_copy(x)
+        self.assertEqual(result.shape, torch.Size([]))
+        self.assertTrue(torch.equal(result, x))
+
+    def test_d2h_copy_empty_tensor(self):
+        """_d2h_copy should handle empty tensors."""
+        x = torch.empty(0, 3)
+        result = torch.ops.et_copy._d2h_copy(x)
+        self.assertEqual(result.shape, torch.Size([0, 3]))