From 9f5c14a7c7b2d92f4db45db48b39350007cff452 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 18 Feb 2026 11:25:39 -0800
Subject: [PATCH 01/12] [ET Device Support] Schema changes: device info on
 Tensor and buffer-level device array

This diff adds device placement information to the ExecuTorch schema so that tensor-level device type information can be represented. This is a prerequisite for the upcoming tensor_parser updates.

This is part of the Phase 1 implementation to make ET device type work E2E without user-specified device placement.

Design doc: https://docs.google.com/document/d/1lwd9BlohmwkN5EEvRulO_b-XnZBwv1nMb5l2K3jfuwA/edit?tab=t.0#heading=h.o6anuvkix4bu

Differential Revision: [D93635657](https://our.internmc.facebook.com/intern/diff/D93635657/)

[ghstack-poisoned]
---
 exir/schema.py     | 43 +++++++++++++++++++++++++++++++++++++
 schema/program.fbs | 53 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/exir/schema.py b/exir/schema.py
index 7dba623aebf..0d06a85acfa 100644
--- a/exir/schema.py
+++ b/exir/schema.py
@@ -48,6 +48,17 @@ class TensorDataLocation(IntEnum):
     EXTERNAL = 1
 
 
+class DeviceType(IntEnum):
+    """
+    Device type enum indicating where a tensor resides or should be allocated.
+    Note that this enum is not directly mapped to the DeviceType enum in
+    pytorch/pytorch. Check program.fbs for explanations of this enum.
+    """
+
+    CPU = 0
+    CUDA = 1
+
+
 @dataclass
 class ExtraTensorInfo:
     """
@@ -57,6 +68,12 @@ class ExtraTensorInfo:
     mutable_data_segments_idx: int = 0
     fully_qualified_name: Optional[str] = None
     location: TensorDataLocation = TensorDataLocation.SEGMENT
+    # Device type where this tensor resides or should be allocated.
+    # Defaults to CPU for backward compatibility.
+    device_type: DeviceType = DeviceType.CPU
+    # Device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+    # A value of -1 indicates the default device.
+    device_index: int = -1
 
 
 @dataclass
@@ -261,6 +278,26 @@ class Operator:
     overload: str
 
 
+@dataclass
+class NonConstBufferDevice:
+    """
+    Device placement information for a non-constant memory buffer.
+    This is a sparse representation: only buffers that are NOT on CPU need entries.
+    Buffers not listed in ExecutionPlan.non_const_buffer_device default to CPU.
+    Check program.fbs for explanations.
+    """
+
+    # Index into ExecutionPlan.non_const_buffer_sizes identifying which buffer
+    # this entry applies to.
+    buffer_index: int
+    # The device type where this buffer should be allocated.
+    # Defaults to CPU for backward compatibility.
+    device_type: DeviceType = DeviceType.CPU
+    # The device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+    # A value of -1 indicates the default device.
+    device_index: int = -1
+
+
 @dataclass
 class ExecutionPlan:
     name: str
@@ -276,6 +313,12 @@ class ExecutionPlan:
     # Runtime should use the len(constant_buffer) as the ground truch of
     # constant memory buffer size, and ignore non_const_buffer_sizes[0].
     non_const_buffer_sizes: List[int]
+    # [Optional] Sparse device placement information for non-constant buffers.
+    # Only buffers that are NOT on CPU need to be listed here. Each entry
+    # specifies a buffer_index (into non_const_buffer_sizes) and its device.
+    # Buffers not listed here default to CPU, saving binary size when most
+    # buffers are on CPU.
+    non_const_buffer_device: Optional[List[NonConstBufferDevice]] = None
 
 
 @dataclass
diff --git a/schema/program.fbs b/schema/program.fbs
index 18e96de69b6..07901923410 100644
--- a/schema/program.fbs
+++ b/schema/program.fbs
@@ -61,6 +61,25 @@ enum TensorDataLocation : byte {
   EXTERNAL = 1,
 }
 
+// Device type enum indicating where a tensor resides or should be allocated.
+// Follows PyTorch DeviceType convention for compatibility.
+enum DeviceType : byte {
+  CPU = 0,
+  CUDA = 1,
+  // Reserve slots for future device types (values >= 10 differ from PyTorch):
+  // MKLDNN = 2,
+  // OPENGL = 3,
+  // OPENCL = 4,
+  // IDEEP = 5,
+  // HIP = 6,
+  // FPGA = 7,
+  // MAIA = 8,
+  // XLA = 9,
+  // MPS = 10,
+  // XPU = 11,
+  // PrivateUse1 = 12,
+}
+
 // Table to put additional information about tensors in that is not applicable
 // to the vast majority of tensors in the vast majority of programs.
 table ExtraTensorInfo {
@@ -79,6 +98,15 @@ table ExtraTensorInfo {
   //   must be non-empty, and is used as a key to find the tensor's external
   //   data. Tensor.data_buffer_idx is ignored.
   location: TensorDataLocation;
+
+  // [Optional] The device type where this tensor resides or should be allocated.
+  // Defaults to CPU for backward compatibility with existing PTE files.
+  device_type: DeviceType = CPU;
+
+  // [Optional] The device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+  // A value of -1 indicates the default device. Defaults to -1 for backward
+  // compatibility.
+  device_index: byte = -1;
 }
 
 table Tensor {
@@ -386,6 +414,13 @@ table ExecutionPlan {
   // constants memory buffer size, and ignore non_const_buffer_sizes[0].
   non_const_buffer_sizes: [int64];
 
+  // [Optional] Sparse device placement information for non-constant buffers.
+  // Only buffers that are NOT on CPU need to be listed here. Each entry
+  // specifies a buffer_index (into non_const_buffer_sizes) and its device.
+  // Buffers not listed here default to CPU, saving binary size when most
+  // buffers are on CPU.
+  non_const_buffer_device: [NonConstBufferDevice];
+
 }
 
 // Constant tensor data stored directly in the flatbuffer.
@@ -406,6 +441,24 @@ table BackendDelegateInlineData {
   data: [ubyte] (force_align: 16);  // @executorch-delegate-alignment
 }
 
+// Device placement information for a non-constant memory buffer.
+// This is a sparse representation: only buffers that are NOT on CPU need entries.
+// Buffers not listed in ExecutionPlan.non_const_buffer_device default to CPU.
+table NonConstBufferDevice {
+  // Index into ExecutionPlan.non_const_buffer_sizes identifying which buffer
+  // this entry applies to.
+  buffer_index: uint32;
+
+  // The device type where this buffer should be allocated.
+  // Defaults to CPU for backward compatibility with existing PTE files.
+  device_type: DeviceType = CPU;
+
+  // The device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+  // A value of -1 indicates the default device. Defaults to -1 for backward
+  // compatibility.
+  device_index: byte = -1;
+}
+
 // Describes a contiguous piece of data that lives outside of the flatbuffer data,
 // typically appended afterwards in the file. The "extended header" in the file,
 // when present, points to the segment base offset.

From af472d76b6cbe487d81c4ecbd7bf5d00368d91b4 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 18 Feb 2026 11:25:45 -0800
Subject: [PATCH 02/12] [ET Device Support] TensorImpl carries device info

This diff extends `TensorImpl` to carry device information, enabling the runtime tensor to track which device its data resides on (CPU, CUDA, etc.). This is a prerequisite for parsing device info from the schema and allocating device memory.

Differential Revision: [D93635655](https://our.internmc.facebook.com/intern/diff/D93635655/)

[ghstack-poisoned]
---
 runtime/core/portable_type/tensor_impl.cpp    |   7 +-
 runtime/core/portable_type/tensor_impl.h      |  25 +++-
 .../portable_type/test/tensor_impl_test.cpp   | 112 ++++++++++++++++++
 3 files changed, 141 insertions(+), 3 deletions(-)

diff --git a/runtime/core/portable_type/tensor_impl.cpp b/runtime/core/portable_type/tensor_impl.cpp
index ede5a3d4101..17243fca0fd 100644
--- a/runtime/core/portable_type/tensor_impl.cpp
+++ b/runtime/core/portable_type/tensor_impl.cpp
@@ -50,7 +50,9 @@ TensorImpl::TensorImpl(
     void* data,
     DimOrderType* dim_order,
     StridesType* strides,
-    TensorShapeDynamism dynamism)
+    TensorShapeDynamism dynamism,
+    DeviceType device_type,
+    DeviceIndex device_index)
     : sizes_(sizes),
       dim_order_(dim_order),
       strides_(strides),
@@ -59,7 +61,8 @@ TensorImpl::TensorImpl(
       numel_(compute_numel(sizes, dim)),
       numel_bound_(numel_),
       type_(type),
-      shape_dynamism_(dynamism) {
+      shape_dynamism_(dynamism),
+      device_(device_type, device_index) {
   ET_CHECK_MSG(
       isValid(type_), "Invalid type %" PRId8, static_cast<int8_t>(type_));
   ET_CHECK_MSG(dim_ >= 0, "Dimension must be non-negative, got %zd", dim_);
diff --git a/runtime/core/portable_type/tensor_impl.h b/runtime/core/portable_type/tensor_impl.h
index 1e2b3620ca2..767a53bffae 100644
--- a/runtime/core/portable_type/tensor_impl.h
+++ b/runtime/core/portable_type/tensor_impl.h
@@ -10,6 +10,7 @@
 
 #include <executorch/runtime/core/array_ref.h>
 #include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/portable_type/device.h>
 #include <executorch/runtime/core/portable_type/scalar_type.h>
 #include <executorch/runtime/core/tensor_shape_dynamism.h>
 
@@ -99,6 +100,8 @@ class TensorImpl {
    * @param strides Strides of the tensor at each dimension. Must contain `dim`
    *     entries.
    * @param dynamism The mutability of the shape of the tensor.
+   * @param device_type The type of device where tensor data resides.
+   * @param device_index The device index for multi-device scenarios.
    */
   TensorImpl(
       ScalarType type,
@@ -107,7 +110,9 @@ class TensorImpl {
       void* data = nullptr,
       DimOrderType* dim_order = nullptr,
       StridesType* strides = nullptr,
-      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC);
+      TensorShapeDynamism dynamism = TensorShapeDynamism::STATIC,
+      DeviceType device_type = DeviceType::CPU,
+      DeviceIndex device_index = -1);
 
   /**
    * Returns the size of the tensor in bytes.
@@ -176,6 +181,21 @@ class TensorImpl {
     return shape_dynamism_;
   }
 
+  /// Returns the device where tensor data resides.
+  Device device() const {
+    return device_;
+  }
+
+  /// Returns the type of device where tensor data resides.
+  DeviceType device_type() const {
+    return device_.type();
+  }
+
+  /// Returns the device index, or -1 if default/unspecified.
+  DeviceIndex device_index() const {
+    return device_.index();
+  }
+
   /// Returns a pointer of type T to the constant underlying data blob.
   template <typename T>
   inline const T* data() const {
@@ -261,6 +281,9 @@ class TensorImpl {
 
   /// Specifies the mutability of the shape of the tensor.
   const TensorShapeDynamism shape_dynamism_;
+
+  /// Device where tensor data resides (CPU, CUDA, etc.)
+  Device device_;
 };
 
 /**
diff --git a/runtime/core/portable_type/test/tensor_impl_test.cpp b/runtime/core/portable_type/test/tensor_impl_test.cpp
index 0b8ae05f4da..f51ac5374dd 100644
--- a/runtime/core/portable_type/test/tensor_impl_test.cpp
+++ b/runtime/core/portable_type/test/tensor_impl_test.cpp
@@ -21,6 +21,9 @@ using namespace ::testing;
 using executorch::runtime::ArrayRef;
 using executorch::runtime::Error;
 using executorch::runtime::TensorShapeDynamism;
+using executorch::runtime::etensor::Device;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
 using executorch::runtime::etensor::ScalarType;
 using executorch::runtime::etensor::TensorImpl;
 using SizesType = TensorImpl::SizesType;
@@ -449,3 +452,112 @@ TEST_F(TensorImplTest, TestResizingTensorToZeroAndBack) {
   EXPECT_GT(t.numel(), 0);
   EXPECT_EQ(t.data(), data);
 }
+
+// ============== Device Tests ==============
+
+TEST_F(TensorImplTest, TestDefaultDeviceIsCPU) {
+  // TensorImpl constructed without device parameters should default to CPU
+  SizesType sizes[2] = {3, 2};
+  float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  TensorImpl t(ScalarType::Float, 2, sizes, data);
+
+  EXPECT_EQ(t.device_type(), DeviceType::CPU);
+  EXPECT_EQ(t.device_index(), -1);
+  EXPECT_EQ(t.device(), Device(DeviceType::CPU, -1));
+}
+
+TEST_F(TensorImplTest, TestExplicitCPUDevice) {
+  // TensorImpl constructed with explicit CPU device
+  SizesType sizes[2] = {3, 2};
+  DimOrderType dim_order[2] = {0, 1};
+  StridesType strides[2] = {2, 1};
+  float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  TensorImpl t(
+      ScalarType::Float,
+      2,
+      sizes,
+      data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+
+  EXPECT_EQ(t.device_type(), DeviceType::CPU);
+  EXPECT_EQ(t.device_index(), 0);
+  EXPECT_EQ(t.device(), Device(DeviceType::CPU, 0));
+}
+
+TEST_F(TensorImplTest, TestCUDADevice) {
+  // TensorImpl constructed with CUDA device
+  SizesType sizes[2] = {3, 2};
+  DimOrderType dim_order[2] = {0, 1};
+  StridesType strides[2] = {2, 1};
+  float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  TensorImpl t(
+      ScalarType::Float,
+      2,
+      sizes,
+      data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+
+  EXPECT_EQ(t.device_type(), DeviceType::CUDA);
+  EXPECT_EQ(t.device_index(), 0);
+  EXPECT_EQ(t.device(), Device(DeviceType::CUDA, 0));
+}
+
+TEST_F(TensorImplTest, TestCUDADeviceMultiGPU) {
+  // TensorImpl with CUDA device index 1 (second GPU)
+  SizesType sizes[2] = {3, 2};
+  DimOrderType dim_order[2] = {0, 1};
+  StridesType strides[2] = {2, 1};
+  float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  TensorImpl t(
+      ScalarType::Float,
+      2,
+      sizes,
+      data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      1);
+
+  EXPECT_EQ(t.device_type(), DeviceType::CUDA);
+  EXPECT_EQ(t.device_index(), 1);
+  EXPECT_EQ(t.device(), Device(DeviceType::CUDA, 1));
+}
+
+TEST_F(TensorImplTest, TestDeviceWithDynamicTensor) {
+  // Device info should work correctly with dynamic tensors
+  SizesType sizes[2] = {3, 2};
+  DimOrderType dim_order[2] = {0, 1};
+  StridesType strides[2] = {2, 1};
+  float data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  TensorImpl t(
+      ScalarType::Float,
+      2,
+      sizes,
+      data,
+      dim_order,
+      strides,
+      TensorShapeDynamism::DYNAMIC_BOUND,
+      DeviceType::CUDA,
+      0);
+
+  EXPECT_EQ(t.device_type(), DeviceType::CUDA);
+  EXPECT_EQ(t.device_index(), 0);
+
+  // Resize should not affect device
+  SizesType new_sizes[2] = {2, 2};
+  Error err = resize_tensor_impl(&t, {new_sizes, 2});
+  EXPECT_EQ(err, Error::Ok);
+
+  // Device should remain unchanged after resize
+  EXPECT_EQ(t.device_type(), DeviceType::CUDA);
+  EXPECT_EQ(t.device_index(), 0);
+}

From 2b669855d081d0f16446774c78553f9326e0bfa3 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 18 Feb 2026 11:25:51 -0800
Subject: [PATCH 03/12] [ET Device Support] DeviceAllocator interface and
 DeviceAllocatorRegistry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This diff introduces the `DeviceAllocator` abstract interface and `DeviceAllocatorRegistry` for device-specific memory allocation. This foundational abstraction lets the runtime dispatch memory operations to the appropriate non-CPU device backend (CUDA, etc.).

**DeviceAllocator interface provides:**
- `init_buffer()` - Initialize memory buffer pools for memory-planned tensors
- `get_offset_address()` - Get pointer to offset within pre-allocated buffer
- `allocate()` / `deallocate()` - Dynamic device memory allocation
- `copy_host_to_device()` / `copy_device_to_host()` - Data transfer between host and device
- `device_type()` - Returns the device type this allocator handles

**DeviceAllocatorRegistry provides:**
- Singleton registry mapping DeviceType → DeviceAllocator
- `register_allocator()` / `get_allocator()` methods
- Fixed-size array indexed by device type (no dynamic allocation, embedded-friendly)

**Design notes:**
- Registry stores raw pointers (non-owning) - allocators are expected to be singletons with static lifetime
- Follows ExecuTorch's embedded-first philosophy (no std::unique_ptr, no heap allocation in registry)
- Convenience free functions `register_device_allocator()` and `get_device_allocator()` for ease of use

Differential Revision: [D93635656](https://our.internmc.facebook.com/intern/diff/D93635656/)

[ghstack-poisoned]
---
 runtime/core/device_allocator.cpp           |  58 +++++
 runtime/core/device_allocator.h             | 185 +++++++++++++
 runtime/core/test/device_allocator_test.cpp | 271 ++++++++++++++++++++
 3 files changed, 514 insertions(+)
 create mode 100644 runtime/core/device_allocator.cpp
 create mode 100644 runtime/core/device_allocator.h
 create mode 100644 runtime/core/test/device_allocator_test.cpp

diff --git a/runtime/core/device_allocator.cpp b/runtime/core/device_allocator.cpp
new file mode 100644
index 00000000000..6046445d3be
--- /dev/null
+++ b/runtime/core/device_allocator.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_allocator.h>
+
+#include <executorch/runtime/platform/assert.h>
+
+namespace executorch {
+namespace runtime {
+
+DeviceAllocatorRegistry& DeviceAllocatorRegistry::instance() {
+  static DeviceAllocatorRegistry registry;
+  return registry;
+}
+
+void DeviceAllocatorRegistry::register_allocator(
+    etensor::DeviceType type,
+    DeviceAllocator* alloc) {
+  // Validate the slot before touching the table; both checks abort on failure.
+  const int type_id = static_cast<int>(type);
+  const size_t slot = static_cast<size_t>(type);
+  ET_CHECK_MSG(
+      slot < etensor::kNumDeviceTypes, "Invalid device type: %d", type_id);
+  ET_CHECK_MSG(
+      allocators_[slot] == nullptr,
+      "Allocator already registered for device type: %d",
+      type_id);
+  allocators_[slot] = alloc;
+}
+
+DeviceAllocator* DeviceAllocatorRegistry::get_allocator(
+    etensor::DeviceType type) {
+  // A device type outside the fixed table has no slot, hence no allocator;
+  // an in-range slot that was never registered still holds nullptr.
+  const size_t slot = static_cast<size_t>(type);
+  const bool in_range = slot < etensor::kNumDeviceTypes;
+  return in_range ? allocators_[slot] : nullptr;
+}
+
+// Convenience free functions
+
+void register_device_allocator(
+    etensor::DeviceType type,
+    DeviceAllocator* alloc) {
+  DeviceAllocatorRegistry::instance().register_allocator(type, alloc);
+}
+
+DeviceAllocator* get_device_allocator(etensor::DeviceType type) {
+  return DeviceAllocatorRegistry::instance().get_allocator(type);
+}
+
+} // namespace runtime
+} // namespace executorch
diff --git a/runtime/core/device_allocator.h b/runtime/core/device_allocator.h
new file mode 100644
index 00000000000..52f9710902d
--- /dev/null
+++ b/runtime/core/device_allocator.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/portable_type/device.h>
+#include <executorch/runtime/core/result.h>
+
+namespace executorch {
+namespace runtime {
+
+/**
+ * Abstract interface for device-specific memory allocation.
+ *
+ * Each device type (CUDA, etc.) provides a concrete implementation
+ * that handles memory allocation on that device. Implementations are
+ * expected to be singletons with static lifetime, registered via
+ * DeviceAllocatorRegistry, which keeps only a raw, non-owning pointer to each
+ * registered allocator.
+ */
+class DeviceAllocator {
+ public:
+  virtual ~DeviceAllocator() = default;
+
+  /**
+   * Initialize a memory buffer pool for memory-planned tensors.
+   *
+   * @param memory_id The ID of the memory buffer (index into
+   *     ExecutionPlan.non_const_buffer_sizes).
+   * @param size The size in bytes to allocate for this buffer.
+   * @param index The device index (e.g., GPU 0 vs GPU 1).
+   * @return Error::Ok on success, or an appropriate error code on failure.
+   */
+  virtual Error
+  init_buffer(uint32_t memory_id, size_t size, etensor::DeviceIndex index) = 0;
+
+  /**
+   * Get a pointer to a specific offset within a pre-allocated buffer pool.
+   *
+   * @param memory_id The ID of the memory buffer.
+   * @param offset_bytes Offset in bytes from the start of the buffer.
+   * @param size_bytes Size of the requested region in bytes.
+   * @param index The device index.
+   * @return A Result containing the device pointer on success, or an error.
+   */
+  virtual Result<void*> get_offset_address(
+      uint32_t memory_id,
+      size_t offset_bytes,
+      size_t size_bytes,
+      etensor::DeviceIndex index) = 0;
+
+  /**
+   * Allocate device memory.
+   *
+   * @param nbytes Number of bytes to allocate.
+   * @param index The device index.
+   * @return A Result containing the device pointer on success, or an error.
+   */
+  virtual Result<void*> allocate(size_t nbytes, etensor::DeviceIndex index) = 0;
+
+  /**
+   * Deallocate device memory previously allocated via allocate().
+   *
+   * @param ptr Pointer to the memory to deallocate.
+   * @param index The device index.
+   */
+  virtual void deallocate(void* ptr, etensor::DeviceIndex index) = 0;
+
+  /**
+   * Copy data from host memory to device memory.
+   *
+   * @param dst Destination pointer (device memory).
+   * @param src Source pointer (host memory).
+   * @param nbytes Number of bytes to copy.
+   * @param index The device index.
+   * @return Error::Ok on success, or an appropriate error code on failure.
+   */
+  virtual Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      etensor::DeviceIndex index) = 0;
+
+  /**
+   * Copy data from device memory to host memory.
+   *
+   * @param dst Destination pointer (host memory).
+   * @param src Source pointer (device memory).
+   * @param nbytes Number of bytes to copy.
+   * @param index The device index.
+   * @return Error::Ok on success, or an appropriate error code on failure.
+   */
+  virtual Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      etensor::DeviceIndex index) = 0;
+
+  /**
+   * Returns the device type this allocator handles.
+   */
+  virtual etensor::DeviceType device_type() const = 0;
+};
+
+/**
+ * Registry for device allocators.
+ *
+ * Provides a global mapping from DeviceType to DeviceAllocator instances.
+ * Device allocators register themselves at static initialization time,
+ * and the runtime queries the registry to find the appropriate allocator
+ * for a given device type.
+ */
+class DeviceAllocatorRegistry {
+ public:
+  /**
+   * Returns the singleton instance of the registry.
+   */
+  static DeviceAllocatorRegistry& instance();
+
+  /**
+   * Register an allocator for a specific device type.
+   *
+   * @param type The device type this allocator handles.
+   * @param alloc Pointer to the allocator (must have static lifetime).
+   */
+  void register_allocator(etensor::DeviceType type, DeviceAllocator* alloc);
+
+  /**
+   * Get the allocator for a specific device type.
+   *
+   * @param type The device type.
+   * @return Pointer to the allocator, or nullptr if not registered.
+   */
+  DeviceAllocator* get_allocator(etensor::DeviceType type);
+
+ private:
+  DeviceAllocatorRegistry() = default;
+
+  // Fixed-size array indexed by device type. This avoids dynamic allocation
+  // and is suitable for embedded environments.
+  DeviceAllocator* allocators_[etensor::kNumDeviceTypes] = {};
+};
+
+// Convenience free functions
+
+/**
+ * Register a device allocator for a specific device type.
+ *
+ * @param type The device type this allocator handles.
+ * @param alloc Pointer to the allocator (must have static lifetime).
+ */
+void register_device_allocator(
+    etensor::DeviceType type,
+    DeviceAllocator* alloc);
+
+/**
+ * Get the device allocator for a specific device type.
+ *
+ * @param type The device type.
+ * @return Pointer to the allocator, or nullptr if not registered.
+ */
+DeviceAllocator* get_device_allocator(etensor::DeviceType type);
+
+} // namespace runtime
+} // namespace executorch
+
+namespace torch {
+namespace executor {
+// TODO(T197294990): Remove these deprecated aliases once all users have moved
+// to the new `::executorch` namespaces.
+using ::executorch::runtime::DeviceAllocator;
+using ::executorch::runtime::DeviceAllocatorRegistry;
+using ::executorch::runtime::get_device_allocator;
+using ::executorch::runtime::register_device_allocator;
+} // namespace executor
+} // namespace torch
diff --git a/runtime/core/test/device_allocator_test.cpp b/runtime/core/test/device_allocator_test.cpp
new file mode 100644
index 00000000000..3bf0f5ad583
--- /dev/null
+++ b/runtime/core/test/device_allocator_test.cpp
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_allocator.h>
+
+#include <gtest/gtest.h>
+
+#include <executorch/runtime/platform/runtime.h>
+
+using namespace ::testing;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceAllocatorRegistry;
+using executorch::runtime::Error;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+using executorch::runtime::etensor::kNumDeviceTypes;
+
+/**
+ * A mock DeviceAllocator implementation for testing purposes.
+ * Tracks calls to verify the registry dispatches correctly.
+ */
+class MockDeviceAllocator : public DeviceAllocator {
+ public:
+  explicit MockDeviceAllocator(DeviceType type) : type_(type) {}
+
+  Error init_buffer(uint32_t memory_id, size_t size, DeviceIndex index)
+      override {
+    last_init_buffer_memory_id_ = memory_id;
+    last_init_buffer_size_ = size;
+    last_init_buffer_index_ = index;
+    init_buffer_call_count_++;
+    return Error::Ok;
+  }
+
+  Result<void*> get_offset_address(
+      uint32_t memory_id,
+      size_t offset_bytes,
+      size_t size_bytes,
+      DeviceIndex index) override {
+    last_get_offset_memory_id_ = memory_id;
+    last_get_offset_offset_ = offset_bytes;
+    last_get_offset_size_ = size_bytes;
+    last_get_offset_index_ = index;
+    get_offset_address_call_count_++;
+    return &dummy_buffer_;
+  }
+
+  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+    last_allocate_size_ = nbytes;
+    last_allocate_index_ = index;
+    allocate_call_count_++;
+    return &dummy_buffer_;
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {
+    last_deallocate_ptr_ = ptr;
+    last_deallocate_index_ = index;
+    deallocate_call_count_++;
+  }
+
+  Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    last_h2d_dst_ = dst;
+    last_h2d_src_ = src;
+    last_h2d_size_ = nbytes;
+    last_h2d_index_ = index;
+    copy_h2d_call_count_++;
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    last_d2h_dst_ = dst;
+    last_d2h_src_ = src;
+    last_d2h_size_ = nbytes;
+    last_d2h_index_ = index;
+    copy_d2h_call_count_++;
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return type_;
+  }
+
+  // Tracking variables for verification
+  uint32_t last_init_buffer_memory_id_ = 0;
+  size_t last_init_buffer_size_ = 0;
+  DeviceIndex last_init_buffer_index_ = -1;
+  int init_buffer_call_count_ = 0;
+
+  uint32_t last_get_offset_memory_id_ = 0;
+  size_t last_get_offset_offset_ = 0;
+  size_t last_get_offset_size_ = 0;
+  DeviceIndex last_get_offset_index_ = -1;
+  int get_offset_address_call_count_ = 0;
+
+  size_t last_allocate_size_ = 0;
+  DeviceIndex last_allocate_index_ = -1;
+  int allocate_call_count_ = 0;
+
+  void* last_deallocate_ptr_ = nullptr;
+  DeviceIndex last_deallocate_index_ = -1;
+  int deallocate_call_count_ = 0;
+
+  void* last_h2d_dst_ = nullptr;
+  const void* last_h2d_src_ = nullptr;
+  size_t last_h2d_size_ = 0;
+  DeviceIndex last_h2d_index_ = -1;
+  int copy_h2d_call_count_ = 0;
+
+  void* last_d2h_dst_ = nullptr;
+  const void* last_d2h_src_ = nullptr;
+  size_t last_d2h_size_ = 0;
+  DeviceIndex last_d2h_index_ = -1;
+  int copy_d2h_call_count_ = 0;
+
+ private:
+  DeviceType type_;
+  uint8_t dummy_buffer_[64] = {};
+};
+
+class DeviceAllocatorTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    executorch::runtime::runtime_init();
+  }
+};
+
+TEST_F(DeviceAllocatorTest, MockAllocatorDeviceType) {
+  MockDeviceAllocator cpu_allocator(DeviceType::CPU);
+  MockDeviceAllocator cuda_allocator(DeviceType::CUDA);
+
+  EXPECT_EQ(cpu_allocator.device_type(), DeviceType::CPU);
+  EXPECT_EQ(cuda_allocator.device_type(), DeviceType::CUDA);
+}
+
+TEST_F(DeviceAllocatorTest, MockAllocatorInitBuffer) {
+  MockDeviceAllocator allocator(DeviceType::CUDA);
+
+  Error err =
+      allocator.init_buffer(/*memory_id=*/1, /*size=*/1024, /*index=*/0);
+
+  EXPECT_EQ(err, Error::Ok);
+  EXPECT_EQ(allocator.init_buffer_call_count_, 1);
+  EXPECT_EQ(allocator.last_init_buffer_memory_id_, 1);
+  EXPECT_EQ(allocator.last_init_buffer_size_, 1024);
+  EXPECT_EQ(allocator.last_init_buffer_index_, 0);
+}
+
+TEST_F(DeviceAllocatorTest, MockAllocatorGetOffsetAddress) {
+  MockDeviceAllocator allocator(DeviceType::CUDA);
+
+  Result<void*> result = allocator.get_offset_address(
+      /*memory_id=*/2, /*offset_bytes=*/128, /*size_bytes=*/256, /*index=*/1);
+
+  EXPECT_TRUE(result.ok());
+  EXPECT_NE(result.get(), nullptr);
+  EXPECT_EQ(allocator.get_offset_address_call_count_, 1);
+  EXPECT_EQ(allocator.last_get_offset_memory_id_, 2);
+  EXPECT_EQ(allocator.last_get_offset_offset_, 128);
+  EXPECT_EQ(allocator.last_get_offset_size_, 256);
+  EXPECT_EQ(allocator.last_get_offset_index_, 1);
+}
+
+TEST_F(DeviceAllocatorTest, MockAllocatorAllocateAndDeallocate) {
+  MockDeviceAllocator allocator(DeviceType::CUDA);
+
+  Result<void*> result = allocator.allocate(/*nbytes=*/512, /*index=*/0);
+  EXPECT_TRUE(result.ok());
+  void* ptr = result.get();
+  EXPECT_NE(ptr, nullptr);
+  EXPECT_EQ(allocator.allocate_call_count_, 1);
+  EXPECT_EQ(allocator.last_allocate_size_, 512);
+  EXPECT_EQ(allocator.last_allocate_index_, 0);
+
+  allocator.deallocate(ptr, /*index=*/0);
+  EXPECT_EQ(allocator.deallocate_call_count_, 1);
+  EXPECT_EQ(allocator.last_deallocate_ptr_, ptr);
+  EXPECT_EQ(allocator.last_deallocate_index_, 0);
+}
+
+TEST_F(DeviceAllocatorTest, MockAllocatorCopyHostToDevice) {
+  MockDeviceAllocator allocator(DeviceType::CUDA);
+  uint8_t host_data[64] = {1, 2, 3, 4};
+  uint8_t device_data[64] = {};
+
+  Error err = allocator.copy_host_to_device(
+      device_data, host_data, sizeof(host_data), /*index=*/0);
+
+  EXPECT_EQ(err, Error::Ok);
+  EXPECT_EQ(allocator.copy_h2d_call_count_, 1);
+  EXPECT_EQ(allocator.last_h2d_dst_, device_data);
+  EXPECT_EQ(allocator.last_h2d_src_, host_data);
+  EXPECT_EQ(allocator.last_h2d_size_, sizeof(host_data));
+  EXPECT_EQ(allocator.last_h2d_index_, 0);
+}
+
+TEST_F(DeviceAllocatorTest, MockAllocatorCopyDeviceToHost) {
+  MockDeviceAllocator allocator(DeviceType::CUDA);
+  uint8_t device_data[64] = {5, 6, 7, 8};
+  uint8_t host_data[64] = {};
+
+  Error err = allocator.copy_device_to_host(
+      host_data, device_data, sizeof(device_data), /*index=*/1);
+
+  EXPECT_EQ(err, Error::Ok);
+  EXPECT_EQ(allocator.copy_d2h_call_count_, 1);
+  EXPECT_EQ(allocator.last_d2h_dst_, host_data);
+  EXPECT_EQ(allocator.last_d2h_src_, device_data);
+  EXPECT_EQ(allocator.last_d2h_size_, sizeof(device_data));
+  EXPECT_EQ(allocator.last_d2h_index_, 1);
+}
+
+TEST_F(DeviceAllocatorTest, RegistryGetUnregisteredReturnsNullptr) {
+  // Getting an allocator for an unregistered device type should return nullptr
+  // Note that there shouldn't be any registered allocators for CPU backend.
+  DeviceAllocator* alloc = get_device_allocator(DeviceType::CPU);
+  EXPECT_EQ(alloc, nullptr);
+}
+
+TEST_F(DeviceAllocatorTest, RegistrySingletonInstance) {
+  // Verify that instance() returns the same object each time
+  DeviceAllocatorRegistry& instance1 = DeviceAllocatorRegistry::instance();
+  DeviceAllocatorRegistry& instance2 = DeviceAllocatorRegistry::instance();
+
+  EXPECT_EQ(&instance1, &instance2);
+}
+
+TEST_F(DeviceAllocatorTest, RegisterAndGetDeviceAllocator) {
+  // Register a static-lifetime mock (registry keeps the raw pointer) for CUDA.
+  static MockDeviceAllocator cuda_allocator(DeviceType::CUDA);
+  register_device_allocator(DeviceType::CUDA, &cuda_allocator);
+
+  DeviceAllocator* retrieved = get_device_allocator(DeviceType::CUDA);
+  ASSERT_EQ(retrieved, &cuda_allocator);
+  EXPECT_EQ(retrieved->device_type(), DeviceType::CUDA);
+
+  // Registering the same device type twice should abort.
+  MockDeviceAllocator another_allocator(DeviceType::CUDA);
+  EXPECT_DEATH(
+      register_device_allocator(DeviceType::CUDA, &another_allocator),
+      "Allocator already registered");
+}
+
+TEST_F(DeviceAllocatorTest, RegisterAndDispatchThroughRegistry) {
+  // Verify that after registration, calls dispatch to the registered allocator.
+  DeviceAllocator* alloc = get_device_allocator(DeviceType::CUDA);
+  ASSERT_NE(alloc, nullptr);
+
+  // Use the allocator through the registry and verify it reaches the mock.
+  Error err = alloc->init_buffer(/*memory_id=*/5, /*size=*/2048, /*index=*/0);
+  EXPECT_EQ(err, Error::Ok);
+
+  Result<void*> result = alloc->allocate(/*nbytes=*/256, /*index=*/1);
+  EXPECT_TRUE(result.ok());
+  EXPECT_NE(result.get(), nullptr);
+}

From d630230274e98f87ac6b089d964f2a2a04cb273a Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 18 Feb 2026 11:28:57 -0800
Subject: [PATCH 04/12] Update base for Update on "[ET Device Support]
 DeviceAllocator interface and DeviceAllocatorRegistry"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This diff introduces the `DeviceAllocator` abstract interface and `DeviceAllocatorRegistry` for device-specific memory allocation. This is a foundational abstraction that enables the runtime to dispatch memory operations to the appropriate device backend other than CPU (CUDA, etc.).

**DeviceAllocator interface provides:**
- `init_buffer()` - Initialize memory buffer pools for memory-planned tensors
- `get_offset_address()` - Get pointer to offset within pre-allocated buffer
- `allocate()` / `deallocate()` - Dynamic device memory allocation
- `copy_host_to_device()` / `copy_device_to_host()` - Data transfer between host and device
- `device_type()` - Returns the device type this allocator handles

**DeviceAllocatorRegistry provides:**
- Singleton registry mapping DeviceType → DeviceAllocator
- `register_allocator()` / `get_allocator()` methods
- Fixed-size array indexed by device type (no dynamic allocation, embedded-friendly)

**Design notes:**
- Registry stores raw pointers (non-owning) - allocators are expected to be singletons with static lifetime
- Follows ExecuTorch's embedded-first philosophy (no std::unique_ptr, no heap allocation in registry)
- Convenience free functions `register_device_allocator()` and `get_device_allocator()` for ease of use

Differential Revision: [D93635656](https://our.internmc.facebook.com/intern/diff/D93635656/)

[ghstack-poisoned]
---
 schema/program.fbs | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/schema/program.fbs b/schema/program.fbs
index 07901923410..5b81afa905d 100644
--- a/schema/program.fbs
+++ b/schema/program.fbs
@@ -62,22 +62,9 @@ enum TensorDataLocation : byte {
 }
 
 // Device type enum indicating where a tensor resides or should be allocated.
-// Follows PyTorch DeviceType convention for compatibility.
 enum DeviceType : byte {
   CPU = 0,
   CUDA = 1,
-  // Reserve slots for future device types following PyTorch convention:
-  // MKLDNN = 2,
-  // OPENGL = 3,
-  // OPENCL = 4,
-  // IDEEP = 5,
-  // HIP = 6,
-  // FPGA = 7,
-  // MAIA = 8,
-  // XLA = 9,
-  // MPS = 10,
-  // XPU = 11,
-  // PrivateUse1 = 12,
 }
 
 // Table to put additional information about tensors in that is not applicable

From 228bdeb7c9511024c769f4fe08b44153c3af0201 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Thu, 19 Mar 2026 11:15:20 -0700
Subject: [PATCH 05/12] [ET Device Support] Add NonConstBufferDevice schema for
 per-buffer device mapping

Adds the NonConstBufferDevice table to the FlatBuffer schema (program.fbs) and the
corresponding Python dataclass to schema.py. This enables mapping each non-constant
planned memory buffer to a specific device type (CPU, CUDA, etc.).

The field is optional and absent for CPU-only programs, ensuring zero binary size regression.

Differential Revision: [D97335597](https://our.internmc.facebook.com/intern/diff/D97335597/)

[ghstack-poisoned]
---
 .../executorch_flatbuffer/ExecutionPlan.py    |  62 ++++++++-
 .../NonConstBufferDevice.py                   | 130 ++++++++++++++++++
 .../executorch_flatbuffer/__init__.py         |   2 +
 exir/_serialize/test/test_program.py          |  28 ++++
 exir/schema.py                                |  15 ++
 schema/program.fbs                            |  21 +++
 6 files changed, 256 insertions(+), 2 deletions(-)
 create mode 100644 exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py

diff --git a/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py b/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py
index b8ed496b8a8..340a0ad69aa 100644
--- a/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py
+++ b/exir/_serialize/generated/executorch_flatbuffer/ExecutionPlan.py
@@ -10,6 +10,7 @@
 from executorch.exir._serialize.generated.executorch_flatbuffer.Chain import Chain
 from executorch.exir._serialize.generated.executorch_flatbuffer.ContainerMetadata import ContainerMetadata
 from executorch.exir._serialize.generated.executorch_flatbuffer.EValue import EValue
+from executorch.exir._serialize.generated.executorch_flatbuffer.NonConstBufferDevice import NonConstBufferDevice
 from executorch.exir._serialize.generated.executorch_flatbuffer.Operator import Operator
 from typing import Optional
 np = import_numpy()
@@ -230,8 +231,32 @@ def NonConstBufferSizesIsNone(self) -> bool:
         o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20))
         return o == 0
 
+    # ExecutionPlan
+    def NonConstBufferDevice(self, j: int) -> Optional[NonConstBufferDevice]:
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(22))
+        if o != 0:
+            x = self._tab.Vector(o)
+            x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+            x = self._tab.Indirect(x)
+            obj = NonConstBufferDevice()
+            obj.Init(self._tab.Bytes, x)
+            return obj
+        return None
+
+    # ExecutionPlan
+    def NonConstBufferDeviceLength(self) -> int:
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(22))
+        if o != 0:
+            return self._tab.VectorLen(o)
+        return 0
+
+    # ExecutionPlan
+    def NonConstBufferDeviceIsNone(self) -> bool:
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(22))
+        return o == 0
+
 def ExecutionPlanStart(builder: flatbuffers.Builder):
-    builder.StartObject(9)
+    builder.StartObject(10)
 
 def Start(builder: flatbuffers.Builder):
     ExecutionPlanStart(builder)
@@ -332,6 +357,18 @@ def ExecutionPlanStartNonConstBufferSizesVector(builder, numElems: int) -> int:
 def StartNonConstBufferSizesVector(builder, numElems: int) -> int:
     return ExecutionPlanStartNonConstBufferSizesVector(builder, numElems)
 
+def ExecutionPlanAddNonConstBufferDevice(builder: flatbuffers.Builder, nonConstBufferDevice: int):
+    builder.PrependUOffsetTRelativeSlot(9, flatbuffers.number_types.UOffsetTFlags.py_type(nonConstBufferDevice), 0)
+
+def AddNonConstBufferDevice(builder: flatbuffers.Builder, nonConstBufferDevice: int):
+    ExecutionPlanAddNonConstBufferDevice(builder, nonConstBufferDevice)
+
+def ExecutionPlanStartNonConstBufferDeviceVector(builder, numElems: int) -> int:
+    return builder.StartVector(4, numElems, 4)
+
+def StartNonConstBufferDeviceVector(builder, numElems: int) -> int:
+    return ExecutionPlanStartNonConstBufferDeviceVector(builder, numElems)
+
 def ExecutionPlanEnd(builder: flatbuffers.Builder) -> int:
     return builder.EndObject()
 
@@ -342,6 +379,7 @@ def End(builder: flatbuffers.Builder) -> int:
 from executorch.exir._serialize.generated.executorch_flatbuffer import Chain
 from executorch.exir._serialize.generated.executorch_flatbuffer import ContainerMetadata
 from executorch.exir._serialize.generated.executorch_flatbuffer import EValue
+from executorch.exir._serialize.generated.executorch_flatbuffer import NonConstBufferDevice
 from executorch.exir._serialize.generated.executorch_flatbuffer import Operator
 try:
     from typing import List, Optional
@@ -361,6 +399,7 @@ def __init__(self):
         self.operators = None  # type: List[executorch_flatbuffer.Operator.OperatorT]
         self.delegates = None  # type: List[executorch_flatbuffer.BackendDelegate.BackendDelegateT]
         self.nonConstBufferSizes = None  # type: List[int]
+        self.nonConstBufferDevice = None  # type: List[executorch_flatbuffer.NonConstBufferDevice.NonConstBufferDeviceT]
 
     @classmethod
     def InitFromBuf(cls, buf, pos):
@@ -389,7 +428,8 @@ def __eq__(self, other):
             self.chains == other.chains and \
             self.operators == other.operators and \
             self.delegates == other.delegates and \
-            self.nonConstBufferSizes == other.nonConstBufferSizes
+            self.nonConstBufferSizes == other.nonConstBufferSizes and \
+            self.nonConstBufferDevice == other.nonConstBufferDevice
 
     # ExecutionPlanT
     def _UnPack(self, executionPlan):
@@ -451,6 +491,14 @@ def _UnPack(self, executionPlan):
                     self.nonConstBufferSizes.append(executionPlan.NonConstBufferSizes(i))
             else:
                 self.nonConstBufferSizes = executionPlan.NonConstBufferSizesAsNumpy()
+        if not executionPlan.NonConstBufferDeviceIsNone():
+            self.nonConstBufferDevice = []
+            for i in range(executionPlan.NonConstBufferDeviceLength()):
+                if executionPlan.NonConstBufferDevice(i) is None:
+                    self.nonConstBufferDevice.append(None)
+                else:
+                    nonConstBufferDevice_ = executorch_flatbuffer.NonConstBufferDevice.NonConstBufferDeviceT.InitFromObj(executionPlan.NonConstBufferDevice(i))
+                    self.nonConstBufferDevice.append(nonConstBufferDevice_)
 
     # ExecutionPlanT
     def Pack(self, builder):
@@ -514,6 +562,14 @@ def Pack(self, builder):
                 for i in reversed(range(len(self.nonConstBufferSizes))):
                     builder.PrependInt64(self.nonConstBufferSizes[i])
                 nonConstBufferSizes = builder.EndVector()
+        if self.nonConstBufferDevice is not None:
+            nonConstBufferDevicelist = []
+            for i in range(len(self.nonConstBufferDevice)):
+                nonConstBufferDevicelist.append(self.nonConstBufferDevice[i].Pack(builder))
+            ExecutionPlanStartNonConstBufferDeviceVector(builder, len(self.nonConstBufferDevice))
+            for i in reversed(range(len(self.nonConstBufferDevice))):
+                builder.PrependUOffsetTRelative(nonConstBufferDevicelist[i])
+            nonConstBufferDevice = builder.EndVector()
         ExecutionPlanStart(builder)
         if self.name is not None:
             ExecutionPlanAddName(builder, name)
@@ -533,5 +589,7 @@ def Pack(self, builder):
             ExecutionPlanAddDelegates(builder, delegates)
         if self.nonConstBufferSizes is not None:
             ExecutionPlanAddNonConstBufferSizes(builder, nonConstBufferSizes)
+        if self.nonConstBufferDevice is not None:
+            ExecutionPlanAddNonConstBufferDevice(builder, nonConstBufferDevice)
         executionPlan = ExecutionPlanEnd(builder)
         return executionPlan
diff --git a/exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py b/exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py
new file mode 100644
index 00000000000..d82df37d29b
--- /dev/null
+++ b/exir/_serialize/generated/executorch_flatbuffer/NonConstBufferDevice.py
@@ -0,0 +1,130 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: executorch_flatbuffer
+
+import flatbuffers
+from flatbuffers.compat import import_numpy
+from typing import Any
+np = import_numpy()
+
+class NonConstBufferDevice(object):
+    __slots__ = ['_tab']
+
+    @classmethod
+    def GetRootAs(cls, buf, offset: int = 0):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+        x = NonConstBufferDevice()
+        x.Init(buf, n + offset)
+        return x
+
+    @classmethod
+    def GetRootAsNonConstBufferDevice(cls, buf, offset=0):
+        """This method is deprecated. Please switch to GetRootAs."""
+        return cls.GetRootAs(buf, offset)
+    @classmethod
+    def NonConstBufferDeviceBufferHasIdentifier(cls, buf, offset, size_prefixed=False):
+        return flatbuffers.util.BufferHasIdentifier(buf, offset, b"\x45\x54\x31\x32", size_prefixed=size_prefixed)
+
+    # NonConstBufferDevice
+    def Init(self, buf: bytes, pos: int):
+        self._tab = flatbuffers.table.Table(buf, pos)
+
+    # NonConstBufferDevice
+    def BufferIdx(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+        return 0
+
+    # NonConstBufferDevice
+    def DeviceType(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+    # NonConstBufferDevice
+    def DeviceIndex(self):
+        o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+        if o != 0:
+            return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+        return 0
+
+def NonConstBufferDeviceStart(builder: flatbuffers.Builder):
+    builder.StartObject(3)
+
+def Start(builder: flatbuffers.Builder):
+    NonConstBufferDeviceStart(builder)
+
+def NonConstBufferDeviceAddBufferIdx(builder: flatbuffers.Builder, bufferIdx: int):
+    builder.PrependInt32Slot(0, bufferIdx, 0)
+
+def AddBufferIdx(builder: flatbuffers.Builder, bufferIdx: int):
+    NonConstBufferDeviceAddBufferIdx(builder, bufferIdx)
+
+def NonConstBufferDeviceAddDeviceType(builder: flatbuffers.Builder, deviceType: int):
+    builder.PrependInt8Slot(1, deviceType, 0)
+
+def AddDeviceType(builder: flatbuffers.Builder, deviceType: int):
+    NonConstBufferDeviceAddDeviceType(builder, deviceType)
+
+def NonConstBufferDeviceAddDeviceIndex(builder: flatbuffers.Builder, deviceIndex: int):
+    builder.PrependInt8Slot(2, deviceIndex, 0)
+
+def AddDeviceIndex(builder: flatbuffers.Builder, deviceIndex: int):
+    NonConstBufferDeviceAddDeviceIndex(builder, deviceIndex)
+
+def NonConstBufferDeviceEnd(builder: flatbuffers.Builder) -> int:
+    return builder.EndObject()
+
+def End(builder: flatbuffers.Builder) -> int:
+    return NonConstBufferDeviceEnd(builder)
+
+
+class NonConstBufferDeviceT(object):
+
+    # NonConstBufferDeviceT
+    def __init__(self):
+        self.bufferIdx = 0  # type: int
+        self.deviceType = 0  # type: int
+        self.deviceIndex = 0  # type: int
+
+    @classmethod
+    def InitFromBuf(cls, buf, pos):
+        nonConstBufferDevice = NonConstBufferDevice()
+        nonConstBufferDevice.Init(buf, pos)
+        return cls.InitFromObj(nonConstBufferDevice)
+
+    @classmethod
+    def InitFromPackedBuf(cls, buf, pos=0):
+        n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, pos)
+        return cls.InitFromBuf(buf, pos+n)
+
+    @classmethod
+    def InitFromObj(cls, nonConstBufferDevice):
+        x = NonConstBufferDeviceT()
+        x._UnPack(nonConstBufferDevice)
+        return x
+
+    def __eq__(self, other):
+        return type(self) == type(other) and \
+            self.bufferIdx == other.bufferIdx and \
+            self.deviceType == other.deviceType and \
+            self.deviceIndex == other.deviceIndex
+
+    # NonConstBufferDeviceT
+    def _UnPack(self, nonConstBufferDevice):
+        if nonConstBufferDevice is None:
+            return
+        self.bufferIdx = nonConstBufferDevice.BufferIdx()
+        self.deviceType = nonConstBufferDevice.DeviceType()
+        self.deviceIndex = nonConstBufferDevice.DeviceIndex()
+
+    # NonConstBufferDeviceT
+    def Pack(self, builder):
+        NonConstBufferDeviceStart(builder)
+        NonConstBufferDeviceAddBufferIdx(builder, self.bufferIdx)
+        NonConstBufferDeviceAddDeviceType(builder, self.deviceType)
+        NonConstBufferDeviceAddDeviceIndex(builder, self.deviceIndex)
+        nonConstBufferDevice = NonConstBufferDeviceEnd(builder)
+        return nonConstBufferDevice
diff --git a/exir/_serialize/generated/executorch_flatbuffer/__init__.py b/exir/_serialize/generated/executorch_flatbuffer/__init__.py
index df59751e724..7cc3b482376 100644
--- a/exir/_serialize/generated/executorch_flatbuffer/__init__.py
+++ b/exir/_serialize/generated/executorch_flatbuffer/__init__.py
@@ -31,6 +31,7 @@
 from . import KernelTypes
 from . import MoveCall
 from . import NamedData
+from . import NonConstBufferDevice
 from . import Null
 from . import Operator
 from . import OptionalTensorList
@@ -75,6 +76,7 @@
     "KernelTypes",
     "MoveCall",
     "NamedData",
+    "NonConstBufferDevice",
     "Null",
     "Operator",
     "OptionalTensorList",
diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py
index 46e8f020a0b..1b6aab94af3 100644
--- a/exir/_serialize/test/test_program.py
+++ b/exir/_serialize/test/test_program.py
@@ -38,7 +38,9 @@
     ContainerMetadata,
     DataLocation,
     DataSegment,
+    DeviceType,
     ExecutionPlan,
+    NonConstBufferDevice,
     Program,
     SubsegmentOffsets,
 )
@@ -477,6 +479,32 @@ def test_round_trip_large_buffer_sizes(self) -> None:
             program, deserialize_pte_binary(flatbuffer_from_py).program
         )
 
+    def test_round_trip_with_non_const_buffer_device(self) -> None:
+        """Tests that non_const_buffer_device survives round-trip
+        serialization/deserialization. This verifies the schema extension
+        for per-buffer device mapping works correctly.
+        """
+        program = get_test_program()
+        program.execution_plan[0].non_const_buffer_device = [
+            NonConstBufferDevice(buffer_idx=0, device_type=DeviceType.CPU, device_index=0),
+            NonConstBufferDevice(buffer_idx=1, device_type=DeviceType.CUDA, device_index=0),
+        ]
+        flatbuffer_from_py = bytes(serialize_pte_binary(pte_file=PTEFile(program)))
+        self.assert_programs_equal(
+            program, deserialize_pte_binary(flatbuffer_from_py).program
+        )
+
+    def test_round_trip_without_non_const_buffer_device(self) -> None:
+        """Tests backward compatibility: a program without non_const_buffer_device
+        (the default) round-trips correctly and the field remains None.
+        """
+        program = get_test_program()
+        self.assertIsNone(program.execution_plan[0].non_const_buffer_device)
+        flatbuffer_from_py = bytes(serialize_pte_binary(pte_file=PTEFile(program)))
+        deserialized = deserialize_pte_binary(flatbuffer_from_py).program
+        self.assert_programs_equal(program, deserialized)
+        self.assertIsNone(deserialized.execution_plan[0].non_const_buffer_device)
+
     def test_round_trip_no_segments_and_no_header(self) -> None:
         """Tests that a Program serialized with extract_delegate_segments=True
         when there are no segments does not contain an extended header,
diff --git a/exir/schema.py b/exir/schema.py
index 993a473dabb..add90dec45c 100644
--- a/exir/schema.py
+++ b/exir/schema.py
@@ -268,6 +268,18 @@ class Operator:
     overload: str
 
 
+@dataclass
+class NonConstBufferDevice:
+    """Maps a non-constant buffer to the device where it should be allocated."""
+
+    # Index into the non_const_buffer_sizes list.
+    buffer_idx: int = 0
+    # The device type for this buffer (CPU, CUDA, etc.).
+    device_type: DeviceType = DeviceType.CPU
+    # The device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+    device_index: int = 0
+
+
 @dataclass
 class ExecutionPlan:
     name: str
@@ -283,6 +295,9 @@ class ExecutionPlan:
     # Runtime should use the len(constant_buffer) as the ground truch of
     # constant memory buffer size, and ignore non_const_buffer_sizes[0].
     non_const_buffer_sizes: List[int]
+    # Per-buffer device mapping. Each entry maps a non-constant buffer to the
+    # device where it should be allocated. For CPU-only programs, this is None.
+    non_const_buffer_device: Optional[List[NonConstBufferDevice]] = None
 
 
 @dataclass
diff --git a/schema/program.fbs b/schema/program.fbs
index f5872633ac8..c6e6edc790f 100644
--- a/schema/program.fbs
+++ b/schema/program.fbs
@@ -401,6 +401,27 @@ table ExecutionPlan {
   // constants memory buffer size, and ignore non_const_buffer_sizes[0].
   non_const_buffer_sizes: [int64];
 
+  // [Optional] Per-buffer device mapping, parallel to non_const_buffer_sizes.
+  // Each entry maps a non-constant buffer to the device where it should be
+  // allocated. For CPU-only programs, this field is absent and all buffers
+  // default to CPU, ensuring zero regression.
+  non_const_buffer_device: [NonConstBufferDevice];
+
+}
+
+// Maps a non-constant buffer to the device where it should be allocated.
+// When present as part of ExecutionPlan.non_const_buffer_device, each entry
+// describes the device placement for the corresponding planned memory buffer.
+// For CPU-only programs, this table is absent (all buffers default to CPU).
+table NonConstBufferDevice {
+  // Index into the non_const_buffer_sizes list.
+  buffer_idx: int;
+
+  // The device type for this buffer (CPU, CUDA, etc.).
+  device_type: DeviceType = CPU;
+
+  // The device index for multi-device scenarios (e.g., cuda:0, cuda:1).
+  device_index: byte = 0;
 }
 
 // Constant tensor data stored directly in the flatbuffer.

From 747dbaaa4c49ea659009445ad634e0dcedecc1ee Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Fri, 20 Mar 2026 11:21:56 -0700
Subject: [PATCH 06/12] [ET Device Support] Device-aware memory planning:
 separate buffers per device type

Extends memory planning to separate device tensors from CPU tensors into distinct
memory buffers. When enable_non_cpu_memory_planning is set, TensorSpecs are partitioned
by device type, the greedy/naive algorithm runs independently on each partition, and the
resulting mem_ids are remapped so that device buffers never share space with CPU tensors.

Differential Revision: [D97447105](https://our.internmc.facebook.com/intern/diff/D97447105/)

[ghstack-poisoned]
---
 exir/capture/_config.py             |   8 +-
 exir/memory_planning.py             | 116 ++++++++++++++-----
 exir/passes/memory_planning_pass.py |   3 +
 exir/program/_program.py            |   6 +
 exir/tests/test_memory_planning.py  | 169 ++++++++++++++++++++++++++++
 5 files changed, 273 insertions(+), 29 deletions(-)

diff --git a/exir/capture/_config.py b/exir/capture/_config.py
index 3fbc8ae7ef3..f8c3be6e7c8 100644
--- a/exir/capture/_config.py
+++ b/exir/capture/_config.py
@@ -115,5 +115,11 @@ class ExecutorchBackendConfig:
     # If set to true, we run quant fusion and constant propagation passes
     do_quant_fusion_and_const_prop: bool = False
 
-    # Experimental: If set to true, we run a pass to reinplace ops in the graph.
+    # If set to true, we run a pass to reinplace ops in the graph.
     run_reinplace_pass: bool = False
+
+    # When True, memory planning partitions specs by device and runs the
+    # algorithm independently per device, producing separate buffers for CPU
+    # vs. accelerator memory.  Default False preserves the legacy behavior
+    # where all tensors are planned into CPU memory regardless of device.
+    enable_non_cpu_memory_planning: bool = False
diff --git a/exir/memory_planning.py b/exir/memory_planning.py
index c5d3441bcde..f6e3234fce5 100644
--- a/exir/memory_planning.py
+++ b/exir/memory_planning.py
@@ -28,6 +28,7 @@
 import torch
 from executorch.exir import memory
 from executorch.exir.control_flow import while_loop as exir_while
+from executorch.exir.schema import DeviceType, NonConstBufferDevice
 from executorch.exir.delegate import executorch_call_delegate
 from executorch.exir.error import internal_assert, InternalError
 from executorch.exir.operator.convert import is_inplace_variant, is_out_variant
@@ -1211,10 +1212,19 @@ def apply_algo(
     alloc_graph_input: bool = True,
     alloc_graph_output: bool = True,
     alloc_mutable_buffers: bool = True,
+    enable_non_cpu_memory_planning: bool = False,
 ) -> list[int]:
     """
     Recursively apply algo to graph_module and its submodules for control flow.
 
+    Partitions specs by device type (the device index is not yet considered),
+    runs the memory planning algorithm independently per device type, and places
+    the results in separate buffers so device and CPU memory are never mixed.
+
+    When enable_non_cpu_memory_planning is False (default), all specs are planned
+    into a single CPU memory pool regardless of their device attribute. This
+    preserves the legacy behavior. Set to True to enable per-device partitioning.
+
     Algo implementation should handle one of two meta entries for submodules:
     1. input_mem_buffer_sizes: List of int offset bytes. Memory allocated by
        `algo` should start at the offset specified by this list;
@@ -1229,18 +1239,19 @@ def apply_algo(
     `operand` arg. The memory for operands is unused.
     """
     # Extract the nodes and their lifespans from the graph_module
-    # Difficult to just filter the list of specs returned by this due to
-    # how we flag trainable weights.
     _ = update_all_tensors_lifetime(graph_module, graph_signature)
 
-    # Filter specs based on alloc_graph_input and alloc_graph_output
-    specs = collect_specs_from_nodes(
-        graph_module.graph.nodes,
-        graph_signature,
-        do_assertion=False,
-        ignore_graph_input=not alloc_graph_input,
-        ignore_graph_output=not alloc_graph_output,
-        ignore_mutable_buffers=not alloc_mutable_buffers,
+    # Collect and materialize specs into a set so we can iterate multiple
+    # times and partition by device.
+    all_specs: set[TensorSpec] = set(
+        collect_specs_from_nodes(
+            graph_module.graph.nodes,
+            graph_signature,
+            do_assertion=False,
+            ignore_graph_input=not alloc_graph_input,
+            ignore_graph_output=not alloc_graph_output,
+            ignore_mutable_buffers=not alloc_mutable_buffers,
+        )
     )
 
     # Get temporary specs for submodules to set aside space during execution
@@ -1249,29 +1260,78 @@ def apply_algo(
         algo, graph_module, alignment, graph_signature
     )
 
-    # Update `input_mem_buffer_sizes` in graph_module. This will allow existing
-    # algos to work using `input_mem_buffer_sizes` or use
-    # `non_const_buffer_sizes` directly.
-    # pyre-ignore[16]: `torch.fx.GraphModule` has no attribute `input_mem_buffer_sizes`.
-    graph_module.input_mem_buffer_sizes = submodule_bufsizes
-
     # Get extra padding for XNNPACK if needed
     extra_padding = 0
     if _contains_xnnpack_delegate(graph_module):
         extra_padding = 64
 
-    # Pass the filtered specs to the algorithm
-    bufsizes: list[int] = algo(
-        alignment,
-        specs,
-        graph_module,
-        graph_signature,
-        extra_padding,
+    # 1. Partition specs by device
+    specs_by_device: dict[DeviceType, set[TensorSpec]] = defaultdict(set)
+    if enable_non_cpu_memory_planning:
+        for spec in all_specs:
+            specs_by_device[spec.device].add(spec)
+    else:
+        # Legacy behavior: all specs planned into CPU memory regardless of device
+        specs_by_device[DeviceType.CPU] = all_specs
+
+    # 2. Plan each device independently
+    global_bufsizes: list[int] = [0]  # index 0 reserved for constants
+    buffer_device_types: list[DeviceType] = [DeviceType.CPU]
+
+    # Process CPU first (if present), then other devices sorted by enum value
+    device_order = sorted(
+        specs_by_device.keys(),
+        key=lambda d: (d != DeviceType.CPU, d.value),
     )
 
-    # pyre-ignore[6]: Incompatible parameter type [6]
-    # In call `insert_calls_to_free`, for 2nd positional argument, expected `Set[TensorSpec]` but got `Iterable[TensorSpec]`
-    insert_calls_to_free(graph_module, specs)
+    for device_type in device_order:
+        device_specs = specs_by_device[device_type]
 
-    graph_module.meta.update({"non_const_buffer_sizes": bufsizes})
-    return bufsizes
+        # Only apply submodule pre-allocation for CPU specs; device buffers
+        # do not share memory space with CPU submodule arenas.
+        # pyre-ignore[16]: `torch.fx.GraphModule` has no attribute `input_mem_buffer_sizes`.
+        graph_module.input_mem_buffer_sizes = (
+            submodule_bufsizes if device_type == DeviceType.CPU else []
+        )
+
+        # Run algorithm independently on this device's specs
+        device_bufsizes = algo(
+            alignment, device_specs, graph_module, graph_signature, extra_padding
+        )
+
+        # Calculate base mem_id in global space
+        base_mem_id = len(global_bufsizes)
+
+        # Append buffer sizes (skip index 0 which is constants placeholder)
+        global_bufsizes.extend(device_bufsizes[1:])
+
+        # Track device type for each new buffer slot
+        for _ in device_bufsizes[1:]:
+            buffer_device_types.append(device_type)
+
+        # Remap spec mem_ids from algo-local to global.
+        # The algorithm assigns mem_id starting from 1; remap to global position.
+        for spec in device_specs:
+            if spec.mem_id is not None:
+                spec.mem_id = (spec.mem_id - 1) + base_mem_id
+
+    # Ensure backward compatibility: at least [0, 0] when no specs exist
+    if len(global_bufsizes) < 2:
+        global_bufsizes.append(0)
+        buffer_device_types.append(DeviceType.CPU)
+
+    # 3. Insert free calls and build device buffer mapping
+    insert_calls_to_free(graph_module, all_specs)
+
+    has_device_buffers = any(dt != DeviceType.CPU for dt in buffer_device_types)
+    non_const_buffer_device: Optional[list[NonConstBufferDevice]] = None
+    if has_device_buffers:
+        non_const_buffer_device = [
+            NonConstBufferDevice(buffer_idx=i, device_type=dt, device_index=0)
+            for i, dt in enumerate(buffer_device_types)
+        ]
+
+    graph_module.meta["non_const_buffer_sizes"] = global_bufsizes
+    if non_const_buffer_device is not None:
+        graph_module.meta["non_const_buffer_device"] = non_const_buffer_device
+    return global_bufsizes
diff --git a/exir/passes/memory_planning_pass.py b/exir/passes/memory_planning_pass.py
index f3970f13b56..32c343a4607 100644
--- a/exir/passes/memory_planning_pass.py
+++ b/exir/passes/memory_planning_pass.py
@@ -153,6 +153,7 @@ def __init__(
         alloc_mutable_buffers: bool = True,
         share_mutable_buffers: bool = False,
         alignment: int = ALIGNMENT,
+        enable_non_cpu_memory_planning: bool = False,
     ) -> None:
         r"""
         alloc_graph_input/alloc_graph_output will have 4 different combinations
@@ -173,6 +174,7 @@ def __init__(
         self.alloc_mutable_buffers = alloc_mutable_buffers
         self.share_mutable_buffers = share_mutable_buffers
         self.alignment = alignment
+        self.enable_non_cpu_memory_planning = enable_non_cpu_memory_planning
         self.state = _MemoryPlanningState()
 
     def _set_alloc_node_spec(self, graph_module: torch.fx.GraphModule) -> None:
@@ -250,6 +252,7 @@ def run(
             # If mutable buffers are shared, then do not allocate them in the
             # main memory planning algo; they are allocated in run_multimethod.
             self.alloc_mutable_buffers and not self.share_mutable_buffers,
+            self.enable_non_cpu_memory_planning,
         )
 
         if self.share_mutable_buffers and graph_signature is not None:
diff --git a/exir/program/_program.py b/exir/program/_program.py
index 9813b12d594..f1a22773b69 100644
--- a/exir/program/_program.py
+++ b/exir/program/_program.py
@@ -1792,6 +1792,12 @@ def to_executorch(  # noqa (FLAKE8) C901
             else:
                 memory_planning_pass = config.memory_planning_pass
             # TODO(jakeszwe): Follow up with compiler on if the deepcopy is necessary and if so how to make it work
+            # Propagate enable_non_cpu_memory_planning from the top-level config
+            # to the pass instance so that device-aware partitioning is applied.
+            if hasattr(memory_planning_pass, "enable_non_cpu_memory_planning"):
+                memory_planning_pass.enable_non_cpu_memory_planning = (
+                    config.enable_non_cpu_memory_planning
+                )
             if hasattr(memory_planning_pass, "run"):
                 new_gm_res = memory_planning_pass.run(new_gm, new_signature)
             else:
diff --git a/exir/tests/test_memory_planning.py b/exir/tests/test_memory_planning.py
index f364541d900..27ecbdfe633 100644
--- a/exir/tests/test_memory_planning.py
+++ b/exir/tests/test_memory_planning.py
@@ -29,6 +29,8 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.memory_planning import (
     _do_user_inputs_exist,
+    apply_algo,
+    collect_specs_from_nodes,
     filter_nodes,
     get_node_tensor_specs,
     greedy,
@@ -45,6 +47,7 @@
     ToOutVarPass,
 )
 from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
+from executorch.exir.schema import DeviceType
 from executorch.exir.tensor import TensorSpec
 from functorch.experimental.control_flow import map as torch_map
 from parameterized import parameterized
@@ -1259,3 +1262,169 @@ def reset(self, k_zeros: torch.Tensor, v_zeros: torch.Tensor) -> None:
             self.assertEqual(v_cache[0].val.allocation_info.memory_id, 2)
             self.assertEqual(v_cache[0].val.allocation_info.memory_offset_low, 256)
             self.assertEqual(v_cache[0].val.allocation_info.memory_offset_high, 0)
+
+
+class TestDeviceAwareMemoryPlanning(unittest.TestCase):
+    """Tests for per-device memory planning (separate buffers per device type)."""
+
+    def _prepare_model(
+        self,
+    ) -> Tuple[GraphModule, ExportGraphSignature]:
+        """Prepare ToyModelForMemPlanning through SpecPropPass + ToOutVarPass."""
+        model = ToyModelForMemPlanning()
+        inputs = model.get_random_inputs()
+        edge = to_edge(export(model, inputs, strict=True))
+        gm = edge.exported_program().graph_module
+        gs = edge.exported_program().graph_signature
+        gm = PassManager(passes=[SpecPropPass(), ToOutVarPass()])(gm).graph_module
+        return gm, gs
+
+    def _get_planned_specs(
+        self,
+        gm: GraphModule,
+        gs: ExportGraphSignature,
+    ) -> list[TensorSpec]:
+        """Get the unique set of specs that apply_algo would plan."""
+        return list(
+            collect_specs_from_nodes(
+                gm.graph.nodes,
+                gs,
+                do_assertion=False,
+                ignore_graph_input=False,
+                ignore_graph_output=False,
+                ignore_mutable_buffers=False,
+            )
+        )
+
+    def test_cpu_only_unchanged(self) -> None:
+        """CPU-only specs produce bufsizes = [0, X] with no device metadata."""
+        gm, gs = self._prepare_model()
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        bufsizes = apply_algo(
+            algo, gm, 16, gs, enable_non_cpu_memory_planning=True
+        )
+
+        # CPU-only graph: one activation buffer and no device mapping in meta
+        self.assertEqual(bufsizes[0], 0)  # constants
+        self.assertGreater(bufsizes[1], 0)  # CPU activations
+        self.assertNotIn("non_const_buffer_device", gm.meta)
+
+    def test_all_cuda_no_wasted_slots(self) -> None:
+        """CUDA-only specs produce [0, X] with CUDA at buffer index 1."""
+        gm, gs = self._prepare_model()
+        specs = self._get_planned_specs(gm, gs)
+        for spec in specs:
+            spec.device = DeviceType.CUDA
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        bufsizes = apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True)
+
+        # [0, cuda_size] — no wasted CPU buffer slot
+        self.assertEqual(len(bufsizes), 2)
+        self.assertEqual(bufsizes[0], 0)
+        self.assertGreater(bufsizes[1], 0)
+        # Device mapping should be present
+        self.assertIn("non_const_buffer_device", gm.meta)
+        device_map = gm.meta["non_const_buffer_device"]
+        self.assertEqual(len(device_map), 2)
+        self.assertEqual(device_map[0].device_type, DeviceType.CPU)  # constants
+        self.assertEqual(device_map[1].device_type, DeviceType.CUDA)
+
+    def test_mixed_cpu_cuda_separate_buffers(self) -> None:
+        """CPU specs at mem_id=1, CUDA specs at mem_id=2, separate sizes."""
+        gm, gs = self._prepare_model()
+        specs = self._get_planned_specs(gm, gs)
+
+        # Set second half of specs to CUDA
+        mid = len(specs) // 2
+        self.assertGreater(mid, 0)
+        cpu_specs = specs[:mid]
+        cuda_specs = specs[mid:]
+        for spec in cuda_specs:
+            spec.device = DeviceType.CUDA
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        bufsizes = apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True)
+
+        # [constants, cpu_activations, cuda_activations]
+        self.assertEqual(len(bufsizes), 3)
+        self.assertEqual(bufsizes[0], 0)
+        self.assertGreater(bufsizes[1], 0)
+        self.assertGreater(bufsizes[2], 0)
+
+        # CPU specs should have mem_id=1, CUDA specs should have mem_id=2
+        for spec in cpu_specs:
+            self.assertEqual(spec.mem_id, 1, f"CPU spec has wrong mem_id: {spec.mem_id}")
+        for spec in cuda_specs:
+            self.assertEqual(spec.mem_id, 2, f"CUDA spec has wrong mem_id: {spec.mem_id}")
+
+    def test_mem_offset_correct_after_remap(self) -> None:
+        """After remapping, mem_offset is relative to its own buffer."""
+        gm, gs = self._prepare_model()
+        specs = self._get_planned_specs(gm, gs)
+
+        # Set the last spec to CUDA (sole CUDA tensor)
+        cuda_spec = specs[-1]
+        cuda_spec.device = DeviceType.CUDA
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        bufsizes = apply_algo(
+            algo, gm, 16, gs, enable_non_cpu_memory_planning=True
+        )
+
+        # The CUDA spec is the only tensor in its buffer, so offset should be 0
+        self.assertEqual(cuda_spec.mem_offset, 0)
+        # The CUDA buffer should fit exactly this tensor
+        cuda_mem_id = cuda_spec.mem_id
+        self.assertIsNotNone(cuda_mem_id)
+        assert cuda_mem_id is not None
+        self.assertGreaterEqual(bufsizes[cuda_mem_id], cuda_spec.allocated_memory)
+
+    def test_no_cross_device_memory_sharing(self) -> None:
+        """Specs on different devices never share buffers, regardless of lifetime."""
+        gm, gs = self._prepare_model()
+        specs = self._get_planned_specs(gm, gs)
+        self.assertGreaterEqual(len(specs), 2)
+
+        # Assign alternating specs to CUDA to ensure some pairs have
+        # non-overlapping lifetimes (which greedy would normally share).
+        for i, spec in enumerate(specs):
+            if i % 2 == 0:
+                spec.device = DeviceType.CUDA
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        apply_algo(algo, gm, 16, gs, enable_non_cpu_memory_planning=True)
+
+        # Verify CPU and CUDA specs have disjoint mem_ids
+        cpu_mem_ids: set[int] = set()
+        cuda_mem_ids: set[int] = set()
+        for i, spec in enumerate(specs):
+            if spec.mem_id is not None:
+                if i % 2 == 0:
+                    cuda_mem_ids.add(spec.mem_id)
+                else:
+                    cpu_mem_ids.add(spec.mem_id)
+
+        self.assertTrue(
+            cpu_mem_ids.isdisjoint(cuda_mem_ids),
+            f"CPU {cpu_mem_ids} and CUDA {cuda_mem_ids} should not share buffers",
+        )
+
+    def test_disabled_falls_back_to_cpu(self) -> None:
+        """With enable_non_cpu_memory_planning=False (default), CUDA specs are
+        planned into CPU memory — no device-specific buffers are created."""
+        gm, gs = self._prepare_model()
+        specs = self._get_planned_specs(gm, gs)
+        for spec in specs:
+            spec.device = DeviceType.CUDA
+
+        algo = MemoryPlanningAlgorithmSuite(algo_list=[greedy])
+        # Default: enable_non_cpu_memory_planning=False
+        bufsizes = apply_algo(algo, gm, 16, gs)
+
+        # All specs planned into a single CPU pool — same as CPU-only
+        self.assertEqual(len(bufsizes), 2)
+        self.assertEqual(bufsizes[0], 0)
+        self.assertGreater(bufsizes[1], 0)
+        self.assertNotIn("non_const_buffer_device", gm.meta)

From 0829c5d321b1156c0bc179eef87895cdd89eb375 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 24 Mar 2026 14:58:38 -0700
Subject: [PATCH 07/12] [ET Device Support] Emitter reads
 non_const_buffer_device from graph meta

Enable serializing non_const_buffer_device into the PTE file.

Differential Revision: [D97850707](https://our.internmc.facebook.com/intern/diff/D97850707/)

[ghstack-poisoned]
---
 exir/emit/_emitter.py       |   5 +
 exir/emit/test/test_emit.py | 183 ++++++++++++++++++++++++++++++++++++
 2 files changed, 188 insertions(+)

diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py
index a48d88fa224..5c1f2db465d 100644
--- a/exir/emit/_emitter.py
+++ b/exir/emit/_emitter.py
@@ -2073,4 +2073,9 @@ def plan(self) -> ExecutionPlan:
                 self.module.meta["non_const_buffer_sizes"],
             ),
             container_meta_type=self.container_meta_type,
+            # non_const_buffer_device is set by apply_algo in memory_planning.py
+            # when device tensors are present. None for CPU-only programs.
+            non_const_buffer_device=self.module.meta.get(
+                "non_const_buffer_device", None
+            ),
         )
diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py
index 04284398656..7d08a94a8af 100644
--- a/exir/emit/test/test_emit.py
+++ b/exir/emit/test/test_emit.py
@@ -2643,3 +2643,186 @@ def forward(self, a, b):
             0,
             "No tensor should have CUDA device when model runs entirely on CPU",
         )
+
+    def test_emit_non_const_buffer_device_populated_for_device_tensors(self) -> None:
+        """Verify that non_const_buffer_device is emitted into ExecutionPlan when
+        device-aware memory planning is enabled and non-CPU tensors are present."""
+        from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
+            generate_pattern_op_partitions,
+        )
+        from executorch.exir.backend.compile_spec_schema import CompileSpec
+        from executorch.exir.backend.partitioner import (
+            DelegationSpec,
+            Partitioner,
+            PartitionResult,
+        )
+        from executorch.exir.backend.test.backend_with_compiler_demo import (
+            BackendWithCompilerDemo,
+        )
+        from executorch.exir.passes.propagate_device_pass import (
+            TARGET_DEVICE_COMPILE_SPEC_KEY,
+        )
+        from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
+
+        class AddSupport(OperatorSupportBase):
+            def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+                return node.op == "call_function" and node.target in [
+                    exir_ops.edge.aten.add.Tensor,
+                ]
+
+        class DevicePartitioner(Partitioner):
+            def __init__(self):
+                super().__init__()
+                self.delegation_spec = DelegationSpec(
+                    BackendWithCompilerDemo.__name__,
+                    [
+                        CompileSpec("max_value", bytes([4])),
+                        CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
+                    ],
+                )
+
+            def partition(self, exported_program) -> PartitionResult:
+                partition_tags = {}
+                partition_list = generate_pattern_op_partitions(
+                    exported_program.graph_module,
+                    op_support=any_chain(AddSupport()),
+                )
+                for partition in partition_list:
+                    for node in partition.nodes:
+                        tag = f"tag{partition.id}"
+                        node.meta["delegation_tag"] = tag
+                        partition_tags[tag] = self.delegation_spec
+                return PartitionResult(
+                    tagged_exported_program=exported_program,
+                    partition_tags=partition_tags,
+                )
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        lowered = edge.to_backend(DevicePartitioner())
+        et_prog = lowered.to_executorch(
+            config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
+        )
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNotNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be set when device tensors are present "
+            "and enable_non_cpu_memory_planning is True",
+        )
+        cuda_type = schema.DeviceType.CUDA  # slot 0 (constants) is reported as CPU
+        cuda = [e for e in plan.non_const_buffer_device if e.device_type == cuda_type]
+        self.assertGreater(len(cuda), 0)
+        self.assertTrue(all(e.device_index == 0 for e in cuda))
+
+    def test_emit_non_const_buffer_device_none_for_cpu_only(self) -> None:
+        """When all tensors are on CPU, non_const_buffer_device should be None
+        even with enable_non_cpu_memory_planning=True."""
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        et_prog = edge.to_executorch(
+            config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
+        )
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be None for CPU-only programs",
+        )
+
+    def test_emit_non_const_buffer_device_none_when_flag_disabled(self) -> None:
+        """Even with device tensors, non_const_buffer_device should be None when
+        enable_non_cpu_memory_planning is False (default)."""
+        from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
+            generate_pattern_op_partitions,
+        )
+        from executorch.exir.backend.compile_spec_schema import CompileSpec
+        from executorch.exir.backend.partitioner import (
+            DelegationSpec,
+            Partitioner,
+            PartitionResult,
+        )
+        from executorch.exir.backend.test.backend_with_compiler_demo import (
+            BackendWithCompilerDemo,
+        )
+        from executorch.exir.passes.propagate_device_pass import (
+            TARGET_DEVICE_COMPILE_SPEC_KEY,
+        )
+        from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
+
+        class AddSupport(OperatorSupportBase):
+            def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+                return node.op == "call_function" and node.target in [
+                    exir_ops.edge.aten.add.Tensor,
+                ]
+
+        class DevicePartitioner(Partitioner):
+            def __init__(self):
+                super().__init__()
+                self.delegation_spec = DelegationSpec(
+                    BackendWithCompilerDemo.__name__,
+                    [
+                        CompileSpec("max_value", bytes([4])),
+                        CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
+                    ],
+                )
+
+            def partition(self, exported_program) -> PartitionResult:
+                partition_tags = {}
+                partition_list = generate_pattern_op_partitions(
+                    exported_program.graph_module,
+                    op_support=any_chain(AddSupport()),
+                )
+                for partition in partition_list:
+                    for node in partition.nodes:
+                        tag = f"tag{partition.id}"
+                        node.meta["delegation_tag"] = tag
+                        partition_tags[tag] = self.delegation_spec
+                return PartitionResult(
+                    tagged_exported_program=exported_program,
+                    partition_tags=partition_tags,
+                )
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        lowered = edge.to_backend(DevicePartitioner())
+        # Default: enable_non_cpu_memory_planning=False
+        et_prog = lowered.to_executorch()
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be None when "
+            "enable_non_cpu_memory_planning is False",
+        )

From 9dc075bff1e8d77a41a549c5e06e9f68d4d80d41 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 24 Mar 2026 14:59:55 -0700
Subject: [PATCH 08/12] [ET Device Support] DeviceMemoryBuffer RAII class for
 device memory lifetime management

Introduces DeviceMemoryBuffer, an RAII wrapper that owns a single device
memory allocation. On destruction, it automatically calls
DeviceAllocator::deallocate() to free the memory. This mirrors the role of
std::vector<uint8_t> for CPU planned buffers, but for non-cpu device memory (CUDA, etc.).

Key features:
- Static factory create(size, type, index) looks up DeviceAllocator from registry
- Move-only semantics (no copy) to enforce single ownership
- as_span() accessor wraps device pointer for use with HierarchicalAllocator
- Destructor is no-op for default-constructed or moved-from instances

Differential Revision: [D97850709](https://our.internmc.facebook.com/intern/diff/D97850709/)

[ghstack-poisoned]
---
 runtime/core/device_memory_buffer.cpp         |  34 ++++
 runtime/core/device_memory_buffer.h           | 126 +++++++++++++
 runtime/core/portable_type/targets.bzl        |   1 +
 runtime/core/targets.bzl                      |  27 +++
 .../core/test/device_memory_buffer_test.cpp   | 169 ++++++++++++++++++
 runtime/core/test/targets.bzl                 |   8 +
 6 files changed, 365 insertions(+)
 create mode 100644 runtime/core/device_memory_buffer.cpp
 create mode 100644 runtime/core/device_memory_buffer.h
 create mode 100644 runtime/core/test/device_memory_buffer_test.cpp

diff --git a/runtime/core/device_memory_buffer.cpp b/runtime/core/device_memory_buffer.cpp
new file mode 100644
index 00000000000..7eb3f0e3ae2
--- /dev/null
+++ b/runtime/core/device_memory_buffer.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_memory_buffer.h>
+
+namespace executorch::runtime {
+
+Result<DeviceMemoryBuffer> DeviceMemoryBuffer::create(
+    size_t size,
+    etensor::DeviceType type,
+    etensor::DeviceIndex index) {
+  DeviceAllocator* allocator = get_device_allocator(type);
+  if (allocator == nullptr) {
+    ET_LOG(
+        Error,
+        "No device allocator registered for device type %d",
+        static_cast<int>(type));
+    return Error::NotFound;
+  }
+
+  auto result = allocator->allocate(size, index);
+  if (!result.ok()) {
+    return result.error();
+  }
+
+  return DeviceMemoryBuffer(result.get(), size, allocator, index);
+}
+
+} // namespace executorch::runtime
diff --git a/runtime/core/device_memory_buffer.h b/runtime/core/device_memory_buffer.h
new file mode 100644
index 00000000000..7071f3de58d
--- /dev/null
+++ b/runtime/core/device_memory_buffer.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+
+namespace executorch::runtime {
+
+/**
+ * RAII wrapper that owns a single device memory allocation.
+ *
+ * On destruction, calls DeviceAllocator::deallocate() to free the memory.
+ * This mirrors the role of std::vector<uint8_t> for CPU planned buffers,
+ * but for device memory (CUDA, etc.).
+ *
+ * Move-only: cannot be copied, but can be moved to transfer ownership.
+ */
+class DeviceMemoryBuffer final {
+ public:
+  /**
+   * Creates a DeviceMemoryBuffer by allocating device memory.
+   *
+   * Looks up the DeviceAllocator for the given device type via the
+   * DeviceAllocatorRegistry. If no allocator is registered for the type,
+   * returns Error::NotFound.
+   *
+   * @param size Number of bytes to allocate.
+   * @param type The device type (e.g., CUDA).
+   * @param index The device index (e.g., 0 for cuda:0).
+   * @return A Result containing the DeviceMemoryBuffer on success, or an error.
+   */
+  static Result<DeviceMemoryBuffer> create(
+      size_t size,
+      etensor::DeviceType type,
+      etensor::DeviceIndex index = 0);
+
+  DeviceMemoryBuffer() = default;
+
+  ~DeviceMemoryBuffer() {
+    if (ptr_ != nullptr && allocator_ != nullptr) {
+      allocator_->deallocate(ptr_, device_index_);
+    }
+  }
+
+  // Move constructor: transfer ownership.
+  DeviceMemoryBuffer(DeviceMemoryBuffer&& other) noexcept
+      : ptr_(other.ptr_),
+        size_(other.size_),
+        allocator_(other.allocator_),
+        device_index_(other.device_index_) {
+    other.ptr_ = nullptr;
+    other.size_ = 0;
+    other.allocator_ = nullptr;
+  }
+
+  // Move assignment: release current, take ownership.
+  DeviceMemoryBuffer& operator=(DeviceMemoryBuffer&& other) noexcept {
+    if (this != &other) {
+      if (ptr_ != nullptr && allocator_ != nullptr) {
+        allocator_->deallocate(ptr_, device_index_);
+      }
+      ptr_ = other.ptr_;
+      size_ = other.size_;
+      allocator_ = other.allocator_;
+      device_index_ = other.device_index_;
+      other.ptr_ = nullptr;
+      other.size_ = 0;
+      other.allocator_ = nullptr;
+    }
+    return *this;
+  }
+
+  // Non-copyable.
+  DeviceMemoryBuffer(const DeviceMemoryBuffer&) = delete;
+  DeviceMemoryBuffer& operator=(const DeviceMemoryBuffer&) = delete;
+
+  /// Returns the device pointer, or nullptr if empty/moved-from.
+  void* data() const {
+    return ptr_;
+  }
+
+  /// Returns the size in bytes of the allocation.
+  size_t size() const {
+    return size_;
+  }
+
+  /**
+   * Returns a Span<uint8_t> wrapping the device pointer.
+   *
+   * This is intended for use with HierarchicalAllocator, which only performs
+   * pointer arithmetic on the span data and never dereferences it. Device
+   * pointers are valid for pointer arithmetic from the CPU side.
+   */
+  Span<uint8_t> as_span() const {
+    return {static_cast<uint8_t*>(ptr_), size_};
+  }
+
+ private:
+  DeviceMemoryBuffer(
+      void* ptr,
+      size_t size,
+      DeviceAllocator* allocator,
+      etensor::DeviceIndex device_index)
+      : ptr_(ptr),
+        size_(size),
+        allocator_(allocator),
+        device_index_(device_index) {}
+
+  void* ptr_ = nullptr;
+  size_t size_ = 0;
+  DeviceAllocator* allocator_ = nullptr;
+  etensor::DeviceIndex device_index_ = 0;
+};
+
+} // namespace executorch::runtime
diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl
index 5b6e67fa213..33f18c68006 100644
--- a/runtime/core/portable_type/targets.bzl
+++ b/runtime/core/portable_type/targets.bzl
@@ -27,6 +27,7 @@ def define_common_targets():
             "//executorch/backends/...",
             "//executorch/extension/fb/dynamic_shim/...",
             "//executorch/kernels/portable/cpu/...",
+            "//executorch/runtime/core/...",
             "//executorch/runtime/core/exec_aten/...",
             "//executorch/runtime/core/portable_type/test/...",
         ],
diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl
index 2c13cdbdae3..9b40e947626 100644
--- a/runtime/core/targets.bzl
+++ b/runtime/core/targets.bzl
@@ -141,6 +141,33 @@ def define_common_targets():
             visibility = ["//executorch/..."],
         )
 
+    runtime.cxx_library(
+        name = "device_allocator",
+        srcs = ["device_allocator.cpp"],
+        exported_headers = [
+            "device_allocator.h",
+        ],
+        exported_deps = [
+            ":core",
+            "//executorch/runtime/core/portable_type:portable_type",
+        ],
+        deps = [
+            "//executorch/runtime/platform:platform",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
+    runtime.cxx_library(
+        name = "device_memory_buffer",
+        srcs = ["device_memory_buffer.cpp"],
+        exported_headers = ["device_memory_buffer.h"],
+        exported_deps = [
+            ":core",
+            ":device_allocator",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
     runtime.cxx_library(
         name = "tag",
         srcs = ["tag.cpp"],
diff --git a/runtime/core/test/device_memory_buffer_test.cpp b/runtime/core/test/device_memory_buffer_test.cpp
new file mode 100644
index 00000000000..81d0a757cf4
--- /dev/null
+++ b/runtime/core/test/device_memory_buffer_test.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_memory_buffer.h>
+
+#include <gtest/gtest.h>
+
+#include <executorch/runtime/platform/runtime.h>
+
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+/**
+ * A mock DeviceAllocator for testing DeviceMemoryBuffer.
+ * Returns pointers into a local buffer and tracks call counts.
+ */
+class MockAllocator : public DeviceAllocator {
+ public:
+  explicit MockAllocator(DeviceType type) : type_(type) {}
+
+  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+    allocate_count_++;
+    last_allocate_size_ = nbytes;
+    return static_cast<void*>(buffer_);
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {
+    deallocate_count_++;
+    last_deallocate_ptr_ = ptr;
+  }
+
+  Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return type_;
+  }
+
+  int allocate_count_ = 0;
+  int deallocate_count_ = 0;
+  size_t last_allocate_size_ = 0;
+  void* last_deallocate_ptr_ = nullptr;
+  uint8_t buffer_[256] = {};
+
+ private:
+  DeviceType type_;
+};
+
+// Global mock registered once before all tests run.
+static MockAllocator g_mock_cuda(DeviceType::CUDA);
+
+class DeviceMemoryBufferTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
+  }
+
+  void SetUp() override {
+    // Reset counters before each test.
+    g_mock_cuda.allocate_count_ = 0;
+    g_mock_cuda.deallocate_count_ = 0;
+    g_mock_cuda.last_allocate_size_ = 0;
+    g_mock_cuda.last_deallocate_ptr_ = nullptr;
+  }
+};
+
+TEST_F(DeviceMemoryBufferTest, DefaultConstructedIsEmpty) {
+  DeviceMemoryBuffer buf;
+  EXPECT_EQ(buf.data(), nullptr);
+  EXPECT_EQ(buf.size(), 0);
+
+  auto span = buf.as_span();
+  EXPECT_EQ(span.data(), nullptr);
+  EXPECT_EQ(span.size(), 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateAllocatesAndDestructorDeallocates) {
+  {
+    auto result = DeviceMemoryBuffer::create(1024, DeviceType::CUDA, 0);
+    ASSERT_TRUE(result.ok());
+
+    auto buf = std::move(result.get());
+    EXPECT_NE(buf.data(), nullptr);
+    EXPECT_EQ(buf.size(), 1024);
+    EXPECT_EQ(g_mock_cuda.allocate_count_, 1);
+    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 1024);
+    EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+  }
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_deallocate_ptr_, g_mock_cuda.buffer_);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateFailsWithNoRegisteredAllocator) {
+  auto result = DeviceMemoryBuffer::create(512, DeviceType::CPU, 0);
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), Error::NotFound);
+}
+
+TEST_F(DeviceMemoryBufferTest, MoveConstructorTransfersOwnership) {
+  auto result = DeviceMemoryBuffer::create(256, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto original = std::move(result.get());
+  void* original_ptr = original.data();
+
+  DeviceMemoryBuffer moved(std::move(original));
+
+  EXPECT_EQ(original.data(), nullptr);
+  EXPECT_EQ(original.size(), 0);
+  EXPECT_EQ(moved.data(), original_ptr);
+  EXPECT_EQ(moved.size(), 256);
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, MoveAssignmentTransfersOwnership) {
+  auto result = DeviceMemoryBuffer::create(128, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto original = std::move(result.get());
+  void* original_ptr = original.data();
+
+  DeviceMemoryBuffer target;
+  target = std::move(original);
+
+  EXPECT_EQ(original.data(), nullptr);
+  EXPECT_EQ(target.data(), original_ptr);
+  EXPECT_EQ(target.size(), 128);
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, DestructorNoOpForDefaultConstructed) {
+  {
+    DeviceMemoryBuffer buf;
+  }
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, AsSpanWrapsDevicePointer) {
+  auto result = DeviceMemoryBuffer::create(2048, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto buf = std::move(result.get());
+
+  auto span = buf.as_span();
+  EXPECT_EQ(span.data(), static_cast<uint8_t*>(buf.data()));
+  EXPECT_EQ(span.size(), 2048);
+}
diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl
index 1ad0940c62e..0436d3e10dd 100644
--- a/runtime/core/test/targets.bzl
+++ b/runtime/core/test/targets.bzl
@@ -7,6 +7,14 @@ def define_common_targets():
     TARGETS and BUCK files that call this function.
     """
 
+    runtime.cxx_test(
+        name = "device_memory_buffer_test",
+        srcs = ["device_memory_buffer_test.cpp"],
+        deps = [
+            "//executorch/runtime/core:device_memory_buffer",
+        ],
+    )
+
     runtime.cxx_test(
         name = "span_test",
         srcs = ["span_test.cpp"],

From 1fdbbd16da03c92b7b71e90f03acb6dbe5bb9ab9 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 24 Mar 2026 15:01:28 -0700
Subject: [PATCH 09/12] [ET Device Support] MethodMeta: expose per-buffer
 device placement API

Add memory_planned_buffer_device(index) to MethodMeta, returning the
Device (type + index) for each planned memory buffer. This reads from
the non_const_buffer_device field in the serialized ExecutionPlan.

For CPU-only programs (or legacy PTE files without non_const_buffer_device),
all buffers default to Device{CPU, 0}. The sparse list only stores entries
for non-CPU buffers, so the lookup scans for a matching buffer_idx.

This API enables Module::load_method() to query each buffer's target device
and allocate accordingly (malloc for CPU, DeviceAllocator for CUDA, etc.).

Differential Revision: [D97850708](https://our.internmc.facebook.com/intern/diff/D97850708/)

[ghstack-poisoned]
---
 runtime/core/test/targets.bzl                 |  2 +-
 runtime/executor/method_meta.cpp              | 36 +++++++++++++
 runtime/executor/method_meta.h                | 14 +++++
 runtime/executor/test/method_meta_test.cpp    | 51 +++++++++++++++++++
 runtime/executor/test/targets.bzl             |  7 ++-
 .../models/export_program_with_device_info.py |  7 ++-
 6 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl
index 0436d3e10dd..c4da8cc37de 100644
--- a/runtime/core/test/targets.bzl
+++ b/runtime/core/test/targets.bzl
@@ -50,7 +50,7 @@ def define_common_targets():
             "//executorch/runtime/core:core",
         ],
     )
-    
+
     runtime.cxx_test(
         name = "event_tracer_test",
         srcs = [
diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp
index 75dadfd893a..ca1b3254338 100644
--- a/runtime/executor/method_meta.cpp
+++ b/runtime/executor/method_meta.cpp
@@ -325,6 +325,42 @@ Result<int64_t> MethodMeta::memory_planned_buffer_size(size_t index) const {
   return s_plan_->non_const_buffer_sizes()->Get(index + 1);
 }
 
+Result<etensor::Device> MethodMeta::memory_planned_buffer_device(
+    size_t index) const {
+  const auto buffer_count = this->num_memory_planned_buffers();
+  ET_CHECK_OR_RETURN_ERROR(
+      index < buffer_count,
+      InvalidArgument,
+      "index %zu out of range. num_buffers: %zu",
+      index,
+      buffer_count);
+
+  // Every buffer is a CPU buffer unless the sparse list says otherwise.
+  const etensor::Device cpu_device{etensor::DeviceType::CPU, 0};
+
+  // non_const_buffer_device is an optional field: it is null for CPU-only
+  // programs and for legacy PTE files that do not carry it.
+  const auto* device_entries = s_plan_->non_const_buffer_device();
+  if (device_entries == nullptr) {
+    return cpu_device;
+  }
+
+  // buffer_idx follows non_const_buffer_sizes numbering (slot 0 reserved),
+  // so the caller's 0-based index is shifted by one before comparing.
+  const auto wanted_idx = static_cast<int32_t>(index + 1);
+  for (const auto* candidate : *device_entries) {
+    if (candidate->buffer_idx() != wanted_idx) {
+      continue;
+    }
+    return etensor::Device{
+        static_cast<etensor::DeviceType>(candidate->device_type()),
+        static_cast<etensor::DeviceIndex>(candidate->device_index())};
+  }
+
+  // No entry for this buffer in the list, so it is a CPU buffer.
+  return cpu_device;
+}
+
 bool MethodMeta::uses_backend(const char* backend_name) const {
   ET_CHECK_MSG(backend_name, "backend name is null");
   const auto delegates = s_plan_->delegates();
diff --git a/runtime/executor/method_meta.h b/runtime/executor/method_meta.h
index 79fd05c28ee..e0fa16cda22 100644
--- a/runtime/executor/method_meta.h
+++ b/runtime/executor/method_meta.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/portable_type/device.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/core/span.h>
 #include <executorch/runtime/core/tag.h>
@@ -234,6 +235,19 @@ class MethodMeta final {
    */
   Result<int64_t> memory_planned_buffer_size(size_t index) const;
 
+  /**
+   * Get the device placement for the specified memory-planned buffer.
+   *
+   * If the PTE has no non_const_buffer_device list, or the list has no entry
+   * for this buffer, the result is Device{CPU, 0}. Otherwise the result is the
+   * device type and index recorded for the buffer.
+   *
+   * @param[in] index The index of the buffer to look up (0-based, same
+   *     indexing as memory_planned_buffer_size()).
+   * @returns The Device, or Error::InvalidArgument if index is out of range.
+   */
+  Result<etensor::Device> memory_planned_buffer_device(size_t index) const;
+
   /**
    * Check to see if a backend is used in this method.
    *
diff --git a/runtime/executor/test/method_meta_test.cpp b/runtime/executor/test/method_meta_test.cpp
index e4ef2e72a85..4b2fdb26da2 100644
--- a/runtime/executor/test/method_meta_test.cpp
+++ b/runtime/executor/test/method_meta_test.cpp
@@ -74,6 +74,10 @@ class MethodMetaTest : public ::testing::Test {
   void SetUp() override {
     load_program(std::getenv("ET_MODULE_ADD_PATH"), "add");
     load_program(std::getenv("ET_MODULE_STATEFUL_PATH"), "stateful");
+    const char* device_path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+    if (device_path != nullptr) {
+      load_program(device_path, "add_with_device");
+    }
   }
 
  private:
@@ -192,6 +196,27 @@ TEST_F(MethodMetaTest, MethodMetaAttribute) {
   ASSERT_EQ(bad_access.error(), Error::InvalidArgument);
 }
 
+TEST_F(MethodMetaTest, MemoryPlannedBufferDeviceDefaultsCpu) {
+  Result<MethodMeta> method_meta = programs_["add"]->method_meta("forward");
+  ASSERT_EQ(method_meta.error(), Error::Ok);
+
+  // CPU-only model: all buffers should default to CPU device.
+  size_t num_buffers = method_meta->num_memory_planned_buffers();
+  ASSERT_GT(num_buffers, 0);
+
+  for (size_t i = 0; i < num_buffers; ++i) {
+    auto device = method_meta->memory_planned_buffer_device(i);
+    ASSERT_TRUE(device.ok());
+    EXPECT_EQ(device->type(), executorch::runtime::etensor::DeviceType::CPU);
+    EXPECT_EQ(device->index(), 0);
+  }
+
+  // Out of range returns error.
+  EXPECT_EQ(
+      method_meta->memory_planned_buffer_device(num_buffers).error(),
+      Error::InvalidArgument);
+}
+
 TEST_F(MethodMetaTest, TensorInfoSizeOverflow) {
   // Create sizes that will cause overflow when multiplied
   std::vector<int32_t> overflow_sizes = {
@@ -214,3 +239,29 @@ TEST_F(MethodMetaTest, TensorInfoSizeOverflow) {
           executorch::aten::string_view{nullptr, 0}),
       "");
 }
+
+TEST_F(MethodMetaTest, MethodMetaBufferDeviceReturnsCudaForDeviceBuffer) {
+  ASSERT_NE(programs_.find("add_with_device"), programs_.end())
+      << "ET_MODULE_ADD_WITH_DEVICE_PATH env var not set";
+  Result<MethodMeta> method_meta =
+      programs_["add_with_device"]->method_meta("forward");
+  ASSERT_EQ(method_meta.error(), Error::Ok);
+
+  // ModuleAddWithDevice exports with enable_non_cpu_memory_planning=True.
+  // The model delegates add(a,b) to CUDA, producing:
+  //   non_const_buffer_sizes: [0, 48]  (index 0 reserved)
+  //   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}]
+  // So there is exactly 1 planned buffer (user-facing index 0), on CUDA.
+  ASSERT_EQ(method_meta->num_memory_planned_buffers(), 1);
+
+  // Buffer 0 should be CUDA device.
+  auto device = method_meta->memory_planned_buffer_device(0);
+  ASSERT_TRUE(device.ok());
+  EXPECT_EQ(device->type(), executorch::runtime::etensor::DeviceType::CUDA);
+  EXPECT_EQ(device->index(), 0);
+
+  // Out of range should return error.
+  EXPECT_EQ(
+      method_meta->memory_planned_buffer_device(1).error(),
+      Error::InvalidArgument);
+}
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
index f4534aefdea..74ea9a8262d 100644
--- a/runtime/executor/test/targets.bzl
+++ b/runtime/executor/test/targets.bzl
@@ -178,7 +178,12 @@ def define_common_targets(is_fbcode = False):
                 "//executorch/runtime/executor:program",
                 "//executorch/extension/data_loader:file_data_loader",
             ],
-            env = modules_env,
+            env = dict(
+                modules_env,
+                **{
+                    "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])",
+                }
+            ),
         )
 
         runtime.cxx_test(
diff --git a/test/models/export_program_with_device_info.py b/test/models/export_program_with_device_info.py
index 1abf73bfb73..246c41bb9f3 100644
--- a/test/models/export_program_with_device_info.py
+++ b/test/models/export_program_with_device_info.py
@@ -99,7 +99,12 @@ def main() -> None:
         compile_config=EdgeCompileConfig(_check_ir_validity=False),
     )
     lowered = edge.to_backend(_DeviceAwarePartitioner())
-    et_prog = lowered.to_executorch(ExecutorchBackendConfig(emit_stacktrace=False))
+    et_prog = lowered.to_executorch(
+        ExecutorchBackendConfig(
+            emit_stacktrace=False,
+            enable_non_cpu_memory_planning=True,
+        )
+    )
 
     os.makedirs(args.outdir, exist_ok=True)
     outfile = os.path.join(args.outdir, "ModuleAddWithDevice.pte")

From 6af305941c640ed3ebd329cdc2172ae3c06141a8 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 24 Mar 2026 15:01:37 -0700
Subject: [PATCH 10/12] [ET Device Support] MemoryManager: add per-buffer
 device metadata

This diff extends MemoryManager with optional per-buffer device type metadata so the runtime explicitly knows which planned memory buffers are on which device. This enables future device-aware dispatch and debugging.

Changes:
- New constructor taking planned_buffer_devices as extra input for device info
- New accessors: planned_buffer_devices(), has_device_memory()
- No existing functionality has been changed.

Differential Revision: [D97850706](https://our.internmc.facebook.com/intern/diff/D97850706/)

[ghstack-poisoned]
---
 runtime/core/portable_type/targets.bzl        |   1 +
 runtime/executor/memory_manager.h             |  46 +++++
 runtime/executor/targets.bzl                  |   1 +
 runtime/executor/test/memory_manager_test.cpp |  44 +++++
 runtime/executor/test/targets.bzl             |   3 +
 .../test/tensor_parser_device_test.cpp        | 170 ++++++++++++++++++
 6 files changed, 265 insertions(+)

diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl
index 33f18c68006..66fa9986e15 100644
--- a/runtime/core/portable_type/targets.bzl
+++ b/runtime/core/portable_type/targets.bzl
@@ -28,6 +28,7 @@ def define_common_targets():
             "//executorch/extension/fb/dynamic_shim/...",
             "//executorch/kernels/portable/cpu/...",
             "//executorch/runtime/core/...",
+            "//executorch/runtime/executor/...",
             "//executorch/runtime/core/exec_aten/...",
             "//executorch/runtime/core/portable_type/test/...",
         ],
diff --git a/runtime/executor/memory_manager.h b/runtime/executor/memory_manager.h
index 42edd9f0bea..b80344d4c9c 100644
--- a/runtime/executor/memory_manager.h
+++ b/runtime/executor/memory_manager.h
@@ -10,6 +10,8 @@
 
 #include <executorch/runtime/core/hierarchical_allocator.h>
 #include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/core/portable_type/device.h>
+#include <executorch/runtime/core/span.h>
 
 namespace executorch {
 namespace runtime {
@@ -61,6 +63,32 @@ class MemoryManager final {
         "method allocator cannot be the same as temp allocator");
   }
 
+  /**
+   * Constructs a new MemoryManager with per-buffer device metadata.
+   *
+   * @param[in] method_allocator Same as above.
+   * @param[in] planned_memory Same as above. May contain a mix of CPU and
+   *     device pointers — HierarchicalAllocator only does pointer arithmetic,
+   *     so device pointers are valid.
+   * @param[in] temp_allocator Same as above.
+   * @param[in] planned_buffer_devices One entry per planned memory buffer
+   *     (same count as planned_memory buffers), indicating the device type for
+   *     each buffer. For CPU-only programs, use the 3-arg constructor instead.
+   */
+  MemoryManager(
+      MemoryAllocator* method_allocator,
+      HierarchicalAllocator* planned_memory,
+      MemoryAllocator* temp_allocator,
+      Span<const etensor::DeviceType> planned_buffer_devices)
+      : method_allocator_(method_allocator),
+        planned_memory_(planned_memory),
+        temp_allocator_(temp_allocator),
+        planned_buffer_devices_(planned_buffer_devices) {
+    ET_CHECK_MSG(
+        method_allocator != temp_allocator,
+        "method allocator cannot be the same as temp allocator");
+  }
+
   /**
    * DEPRECATED: Use the constructor without `constant_allocator` instead.
    *
@@ -105,10 +133,28 @@ class MemoryManager final {
     return temp_allocator_;
   }
 
+  /**
+   * Returns per-buffer device metadata. One entry per planned memory buffer,
+   * same count as planned_memory buffers. Empty if no device metadata was
+   * provided (CPU-only program).
+   */
+  Span<const etensor::DeviceType> planned_buffer_devices() const {
+    return planned_buffer_devices_;
+  }
+
+  /** True iff any planned buffer is non-CPU; false means CPU-only (legacy). */
+  bool has_device_memory() const {
+    for (size_t i = 0; i < planned_buffer_devices_.size(); ++i) {
+      if (planned_buffer_devices_[i] != etensor::DeviceType::CPU) return true;
+    }
+    return false;
+  }
+
  private:
   MemoryAllocator* method_allocator_;
   HierarchicalAllocator* planned_memory_;
   MemoryAllocator* temp_allocator_;
+  Span<const etensor::DeviceType> planned_buffer_devices_;
 };
 
 } // namespace runtime
diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl
index 90f8d0221e9..2441c55b58e 100644
--- a/runtime/executor/targets.bzl
+++ b/runtime/executor/targets.bzl
@@ -36,6 +36,7 @@ def define_common_targets():
         ],
         exported_deps = [
             "//executorch/runtime/core:memory_allocator",
+            "//executorch/runtime/core/portable_type:portable_type",
         ],
         visibility = ["PUBLIC"],
     )
diff --git a/runtime/executor/test/memory_manager_test.cpp b/runtime/executor/test/memory_manager_test.cpp
index 0e1feb47793..3a7a07d145a 100644
--- a/runtime/executor/test/memory_manager_test.cpp
+++ b/runtime/executor/test/memory_manager_test.cpp
@@ -17,6 +17,8 @@ using namespace ::testing;
 using executorch::runtime::HierarchicalAllocator;
 using executorch::runtime::MemoryAllocator;
 using executorch::runtime::MemoryManager;
+using executorch::runtime::Span;
+using executorch::runtime::etensor::DeviceType;
 
 TEST(MemoryManagerTest, MinimalCtor) {
   MemoryAllocator method_allocator(0, nullptr);
@@ -93,3 +95,45 @@ TEST(MemoryManagerTest, CtorWithSameAllocator) {
           /*temp_allocator=*/&method_allocator),
       "cannot be the same");
 }
+
+TEST(MemoryManagerTest, ThreeArgCtorHasNoDeviceMemory) {
+  MemoryAllocator method_allocator(0, nullptr);
+  HierarchicalAllocator planned_memory({});
+  MemoryAllocator temp_allocator(0, nullptr);
+
+  MemoryManager mm(&method_allocator, &planned_memory, &temp_allocator);
+
+  EXPECT_FALSE(mm.has_device_memory());
+  EXPECT_EQ(mm.planned_buffer_devices().size(), 0);
+}
+
+TEST(MemoryManagerTest, FourArgCtorWithDeviceMetadata) {
+  MemoryAllocator method_allocator(0, nullptr);
+  HierarchicalAllocator planned_memory({});
+  MemoryAllocator temp_allocator(0, nullptr);
+
+  // 3 buffers: CPU, CUDA, CPU
+  DeviceType devices[] = {DeviceType::CPU, DeviceType::CUDA, DeviceType::CPU};
+  Span<const DeviceType> device_span(devices, 3);
+
+  MemoryManager mm(
+      &method_allocator, &planned_memory, &temp_allocator, device_span);
+
+  EXPECT_EQ(mm.method_allocator(), &method_allocator);
+  EXPECT_EQ(mm.planned_memory(), &planned_memory);
+  EXPECT_EQ(mm.temp_allocator(), &temp_allocator);
+  EXPECT_TRUE(mm.has_device_memory());
+  EXPECT_EQ(mm.planned_buffer_devices().size(), 3);
+  EXPECT_EQ(mm.planned_buffer_devices()[0], DeviceType::CPU);
+  EXPECT_EQ(mm.planned_buffer_devices()[1], DeviceType::CUDA);
+  EXPECT_EQ(mm.planned_buffer_devices()[2], DeviceType::CPU);
+}
+
+TEST(MemoryManagerTest, MinimalCtorHasNoDeviceMemory) {
+  MemoryAllocator method_allocator(0, nullptr);
+
+  MemoryManager mm(&method_allocator);
+
+  EXPECT_FALSE(mm.has_device_memory());
+  EXPECT_EQ(mm.planned_buffer_devices().size(), 0);
+}
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
index 74ea9a8262d..32baa63a76b 100644
--- a/runtime/executor/test/targets.bzl
+++ b/runtime/executor/test/targets.bzl
@@ -19,6 +19,7 @@ def define_common_targets(is_fbcode = False):
                 "//executorch/exir/backend/test/...",
                 "//executorch/runtime/backend/...",
                 "//executorch/extension/pybindings/...",
+                "//executorch/extension/module/test/...",
                 "//executorch/devtools/fb/runners/...",
                 "//executorch/test/...",
                 "//executorch/examples/...",
@@ -326,6 +327,8 @@ def define_common_targets(is_fbcode = False):
             deps = [
                 ":managed_memory_manager",
                 "//executorch/runtime/executor:program",
+                "//executorch/runtime/core:device_allocator",
+                "//executorch/runtime/core:device_memory_buffer",
                 "//executorch/extension/data_loader:file_data_loader",
                 "//executorch/schema:program",
             ],
diff --git a/runtime/executor/test/tensor_parser_device_test.cpp b/runtime/executor/test/tensor_parser_device_test.cpp
index 46488eacd0b..6baf525aa12 100644
--- a/runtime/executor/test/tensor_parser_device_test.cpp
+++ b/runtime/executor/test/tensor_parser_device_test.cpp
@@ -17,18 +17,32 @@
 #include <executorch/runtime/executor/tensor_parser.h>
 
 #include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/device_memory_buffer.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/executor/test/managed_memory_manager.h>
+#include <executorch/runtime/platform/runtime.h>
 #include <executorch/schema/program_generated.h>
 
 #include <gtest/gtest.h>
 
 using executorch::aten::Tensor;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
 using executorch::runtime::Error;
+using executorch::runtime::HierarchicalAllocator;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::MethodMeta;
 using executorch::runtime::Program;
 using executorch::runtime::Result;
+using executorch::runtime::Span;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
 using executorch::runtime::deserialization::parseTensor;
 using executorch::runtime::testing::ManagedMemoryManager;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
 using torch::executor::util::FileDataLoader;
 
 constexpr size_t kDefaultNonConstMemBytes = 32 * 1024U;
@@ -50,8 +64,67 @@ class ProgramTestFriend final {
 
 using executorch::runtime::testing::ProgramTestFriend;
 
+namespace {
+
+/**
+ * Mock CUDA allocator that uses host memory for testing.
+ * Tracks the allocated range so tests can verify tensor data_ptr
+ * falls within the "device" memory region.
+ */
+class MockCudaAllocator : public DeviceAllocator {
+ public:
+  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+    allocate_count_++;
+    buffer_ = std::make_unique<uint8_t[]>(nbytes);
+    buffer_size_ = nbytes;
+    return static_cast<void*>(buffer_.get());
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {
+    deallocate_count_++;
+    buffer_.reset();
+    buffer_size_ = 0;
+  }
+
+  Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return DeviceType::CUDA;
+  }
+
+  bool is_device_ptr(const void* ptr) const {
+    if (buffer_ == nullptr || buffer_size_ == 0) {
+      return false;
+    }
+    auto* p = static_cast<const uint8_t*>(ptr);
+    return p >= buffer_.get() && p < buffer_.get() + buffer_size_;
+  }
+
+  int allocate_count_ = 0;
+  int deallocate_count_ = 0;
+
+ private:
+  std::unique_ptr<uint8_t[]> buffer_;
+  size_t buffer_size_ = 0;
+};
+
+} // namespace
+
+static MockCudaAllocator g_mock_cuda;
+
 class TensorParserDeviceTest : public ::testing::Test {
  protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
+  }
+
   void SetUp() override {
     const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
     ASSERT_NE(path, nullptr)
@@ -59,6 +132,9 @@ class TensorParserDeviceTest : public ::testing::Test {
     Result<FileDataLoader> loader = FileDataLoader::from(path);
     ASSERT_EQ(loader.error(), Error::Ok);
     loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));
+
+    g_mock_cuda.allocate_count_ = 0;
+    g_mock_cuda.deallocate_count_ = 0;
   }
 
   std::unique_ptr<FileDataLoader> loader_;
@@ -169,3 +245,97 @@ TEST_F(TensorParserDeviceTest, NonDelegatedTensorsDefaultToCPU) {
         << " without device annotation should have device_index=0";
   }
 }
+TEST_F(TensorParserDeviceTest, CudaTensorDataPtrPointsToDeviceMemory) {
+  Result<Program> program =
+      Program::load(loader_.get(), Program::Verification::Minimal);
+  ASSERT_EQ(program.error(), Error::Ok);
+
+  Result<MethodMeta> method_meta = program->method_meta("forward");
+  ASSERT_EQ(method_meta.error(), Error::Ok);
+
+  // ModuleAddWithDevice has:
+  //   non_const_buffer_sizes: [0, 48]  (index 0 reserved, buffer 0 = 48 bytes)
+  //   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA}]
+  const size_t num_buffers = method_meta->num_memory_planned_buffers();
+  ASSERT_EQ(num_buffers, 1);
+
+  // Set up device-aware planned memory.
+  std::vector<Span<uint8_t>> planned_spans;
+  std::vector<std::vector<uint8_t>> cpu_buffers;
+  std::vector<DeviceMemoryBuffer> device_buffers;
+
+  for (size_t i = 0; i < num_buffers; ++i) {
+    auto size = method_meta->memory_planned_buffer_size(i);
+    ASSERT_TRUE(size.ok());
+    auto device = method_meta->memory_planned_buffer_device(i);
+    ASSERT_TRUE(device.ok());
+
+    if (device->is_cpu()) {
+      cpu_buffers.emplace_back(size.get());
+      planned_spans.emplace_back(
+          cpu_buffers.back().data(), cpu_buffers.back().size());
+    } else {
+      cpu_buffers.emplace_back(); // empty placeholder
+      auto dmb = DeviceMemoryBuffer::create(
+          size.get(), device->type(), device->index());
+      ASSERT_TRUE(dmb.ok())
+          << "DeviceMemoryBuffer::create failed for buffer " << i;
+      planned_spans.emplace_back(dmb->as_span());
+      device_buffers.push_back(std::move(dmb.get()));
+    }
+  }
+
+  ASSERT_EQ(g_mock_cuda.allocate_count_, 1);
+
+  // Build HierarchicalAllocator with mixed CPU/device spans.
+  HierarchicalAllocator planned_memory(
+      {planned_spans.data(), planned_spans.size()});
+
+  constexpr size_t kMethodAllocBytes = 32 * 1024U;
+  auto method_alloc_pool = std::make_unique<uint8_t[]>(kMethodAllocBytes);
+  MemoryAllocator method_allocator(kMethodAllocBytes, method_alloc_pool.get());
+  MemoryManager memory_manager(&method_allocator, &planned_memory);
+
+  // Parse tensors and verify CUDA tensors have device memory.
+  const executorch_flatbuffer::Program* internal_program =
+      ProgramTestFriend::GetInternalProgram(&program.get());
+  auto* execution_plan =
+      internal_program->execution_plan()->GetMutableObject(0);
+  auto* flatbuffer_values = execution_plan->values();
+
+  int cuda_with_device_memory = 0;
+
+  for (size_t i = 0; i < flatbuffer_values->size(); ++i) {
+    auto* serialization_value = flatbuffer_values->Get(i);
+    if (serialization_value->val_type() !=
+        executorch_flatbuffer::KernelTypes::Tensor) {
+      continue;
+    }
+
+    auto* s_tensor = serialization_value->val_as_Tensor();
+    bool is_cuda = s_tensor->extra_tensor_info() != nullptr &&
+        s_tensor->extra_tensor_info()->device_type() ==
+            executorch_flatbuffer::DeviceType::CUDA;
+
+    Result<Tensor> tensor =
+        parseTensor(&program.get(), &memory_manager, s_tensor);
+    ASSERT_TRUE(tensor.ok())
+        << "parseTensor failed at index " << i
+        << " with error 0x" << std::hex
+        << static_cast<uint32_t>(tensor.error());
+
+    Tensor t = tensor.get();
+
+    if (is_cuda && t.unsafeGetTensorImpl()->device_type() == DeviceType::CUDA) {
+      EXPECT_TRUE(g_mock_cuda.is_device_ptr(t.const_data_ptr()))
+          << "CUDA tensor at index " << i
+          << " should have data_ptr in device memory, but got CPU memory";
+      cuda_with_device_memory++;
+    }
+  }
+
+  // All 3 CUDA tensors (2 inputs + 1 output of the delegate) should have
+  // their data_ptr pointing to the mock device memory buffer.
+  EXPECT_EQ(cuda_with_device_memory, 3)
+      << "All 3 CUDA tensors should have data_ptr in device memory";
+}

From 8bd18ab125bd1bf64482dbc80e7a72be86a3322b Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Mon, 18 May 2026 17:25:49 -0700
Subject: [PATCH 11/12] Update (base update)

[ghstack-poisoned]
---
 runtime/executor/test/tensor_parser_device_test.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/runtime/executor/test/tensor_parser_device_test.cpp b/runtime/executor/test/tensor_parser_device_test.cpp
index 918e564e093..3cd5570b42b 100644
--- a/runtime/executor/test/tensor_parser_device_test.cpp
+++ b/runtime/executor/test/tensor_parser_device_test.cpp
@@ -73,7 +73,12 @@ namespace {
  */
 class MockCudaAllocator : public DeviceAllocator {
  public:
-  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+  Result<void*> allocate(
+      size_t nbytes,
+      DeviceIndex index,
+      size_t alignement = kDefaultAlignment) override {
+    (void)alignement;
+    (void)index;
     allocate_count_++;
     buffer_ = std::make_unique<uint8_t[]>(nbytes);
     buffer_size_ = nbytes;

From b68f982d91a7a9894f39fd51ccfd196bc12bf48c Mon Sep 17 00:00:00 2001
From: Gasoonjia <gasoonjia@icloud.com>
Date: Fri, 22 May 2026 17:54:19 -0700
Subject: [PATCH 12/12] [ET Device Support] Module: allocate device memory for
 planned buffers

Differential Revision: D97850705

Pull Request resolved: https://github.com/pytorch/executorch/pull/18476
---
 extension/module/module.cpp                   |  78 ++++++-
 extension/module/module.h                     |   9 +
 extension/module/targets.bzl                  |   1 +
 .../module/test/module_device_memory_test.cpp | 218 ++++++++++++++++++
 extension/module/test/targets.bzl             |  22 +-
 .../executorch/build/build_variables.bzl      |   2 +
 test/models/targets.bzl                       |   1 +
 7 files changed, 328 insertions(+), 3 deletions(-)
 create mode 100644 extension/module/test/module_device_memory_test.cpp

diff --git a/extension/module/module.cpp b/extension/module/module.cpp
index 5422fb15b71..11fea031603 100644
--- a/extension/module/module.cpp
+++ b/extension/module/module.cpp
@@ -13,6 +13,7 @@
 #include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
 #include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
 #include <executorch/extension/named_data_map/merged_data_map.h>
+#include <executorch/runtime/core/device_memory_buffer.h>
 #include <executorch/runtime/platform/runtime.h>
 
 namespace executorch {
@@ -367,6 +368,51 @@ Module::make_planned_memory_with_shared_arenas(
   return planned;
 }
 
+std::unique_ptr<Module::PlannedMemory> Module::make_planned_memory_with_devices(
+    const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta) {
+  const size_t buffer_count = method_meta.num_memory_planned_buffers();
+  auto result = std::make_unique<PlannedMemory>();
+  result->planned_buffers.reserve(buffer_count);
+  result->planned_spans.reserve(buffer_count);
+  result->device_buffers.reserve(buffer_count);
+  result->planned_devices.reserve(buffer_count);
+
+  for (size_t idx = 0; idx < buffer_count; ++idx) {
+    auto nbytes = method_meta.memory_planned_buffer_size(idx);
+    ET_CHECK_MSG(nbytes.ok(), "Failed to get buffer size for index %zu", idx);
+    auto where = method_meta.memory_planned_buffer_device(idx);
+    ET_CHECK_MSG(where.ok(), "Failed to get buffer device for index %zu", idx);
+    result->planned_devices.push_back(where.get());
+
+    if (!where->is_cpu()) {
+      // Device buffer: keep an empty host entry so indices stay aligned.
+      result->planned_buffers.emplace_back();
+      auto device_mem = runtime::DeviceMemoryBuffer::create(
+          nbytes.get(), where->type(), where->index());
+      ET_CHECK_MSG(
+          device_mem.ok(),
+          "Failed to allocate device memory for buffer %zu (device_type=%d)",
+          idx,
+          static_cast<int>(where->type()));
+      result->planned_spans.emplace_back(device_mem->as_span());
+      result->device_buffers.push_back(std::move(device_mem.get()));
+    } else {
+      result->planned_buffers.emplace_back(nbytes.get());
+      auto& host_bytes = result->planned_buffers.back();
+      result->planned_spans.emplace_back(host_bytes.data(), nbytes.get());
+    }
+  }
+
+  // Build the allocator from views over planned_spans and planned_devices;
+  // both vectors are stored in the returned PlannedMemory alongside it.
+  result->planned_memory = std::make_unique<runtime::HierarchicalAllocator>(
+      runtime::Span<runtime::Span<uint8_t>>(
+          result->planned_spans.data(), result->planned_spans.size()),
+      runtime::Span<const runtime::etensor::Device>(
+          result->planned_devices.data(), result->planned_devices.size()));
+  return result;
+}
+
 runtime::Result<std::vector<size_t>> Module::get_mem_planned_buffer_sizes(
     const std::string& method_name) {
   auto meta_res = program_->method_meta(method_name.c_str());
@@ -422,10 +468,38 @@ runtime::Error Module::load_method(
     MethodHolder method_holder;
 
     if (!planned_memory) {
-      if (!share_memory_arenas_) {
+      // Check if any buffers need device memory allocation.
+      auto meta_res = program_->method_meta(method_name.c_str());
+      ET_CHECK_OK_OR_RETURN_ERROR(meta_res.error());
+      auto& meta = meta_res.get();
+
+      bool has_device_buffers = false;
+      for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) {
+        auto dev = meta.memory_planned_buffer_device(i);
+        if (dev.ok() && !dev->is_cpu()) {
+          has_device_buffers = true;
+          break;
+        }
+      }
+
+      if (has_device_buffers) {
+        // Device memory with shared arenas is not yet supported.
+        ET_CHECK_OR_RETURN_ERROR(
+            !share_memory_arenas_,
+            NotSupported,
+            "Device memory buffers are not yet compatible with "
+            "share_memory_arenas. Please disable share_memory_arenas "
+            "when using models with device-planned memory.");
+
+        // Device-aware path: allocate CPU and device buffers. The device
+        // span is owned by the HierarchicalAllocator inside PlannedMemory.
+        method_holder.planned_memory = make_planned_memory_with_devices(meta);
+        planned_memory = method_holder.planned_memory->planned_memory.get();
+      } else if (!share_memory_arenas_) {
         auto sizes_res = get_mem_planned_buffer_sizes(method_name);
         ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
         method_holder.planned_memory = make_planned_memory(sizes_res.get());
+        planned_memory = method_holder.planned_memory->planned_memory.get();
       } else {
         auto sizes_res = get_mem_planned_buffer_sizes(method_name);
         ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
@@ -442,8 +516,10 @@ runtime::Error Module::load_method(
         }
         method_holder.planned_memory =
             make_planned_memory_with_shared_arenas(sizes, shared_arenas_);
+        // Non-owning pointer to the HierarchicalAllocator; method_holder keeps
+        // ownership through its PlannedMemory.
+        planned_memory = method_holder.planned_memory->planned_memory.get();
       }
-      planned_memory = method_holder.planned_memory->planned_memory.get();
     }
 
     method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(
diff --git a/extension/module/module.h b/extension/module/module.h
index 47ead23032e..91c7feaad9b 100644
--- a/extension/module/module.h
+++ b/extension/module/module.h
@@ -18,6 +18,8 @@
 #include <executorch/runtime/backend/options.h>
 #include <executorch/runtime/executor/program.h>
 
+#include <executorch/runtime/core/device_memory_buffer.h>
+
 #ifdef USE_ATEN_LIB
 #define ET_MODULE_NAMESPACE module::aten
 #else // !USE_ATEN_LIB
@@ -716,6 +718,16 @@ class Module {
   struct PlannedMemory {
     std::vector<std::vector<uint8_t>> planned_buffers;
     std::vector<runtime::Span<uint8_t>> planned_spans;
+    /// Device-side buffers held for the lifetime of this PlannedMemory.
+    /// Presumably one entry per memory-planned buffer that is not on CPU --
+    /// confirm against make_planned_memory_with_devices().
+    std::vector<runtime::DeviceMemoryBuffer> device_buffers;
+    /// Per-buffer Device (type + index) metadata used by
+    /// HierarchicalAllocator. Owns the storage backing the device span the
+    /// allocator references, so it must outlive `planned_memory`.
+    /// Members are destroyed in reverse declaration order, so this member
+    /// must stay declared before `planned_memory` to guarantee that.
+    std::vector<runtime::etensor::Device> planned_devices;
     std::unique_ptr<runtime::HierarchicalAllocator> planned_memory;
   };
   std::unique_ptr<PlannedMemory> make_planned_memory(
@@ -723,6 +730,12 @@ class Module {
   std::unique_ptr<PlannedMemory> make_planned_memory_with_shared_arenas(
       const std::vector<size_t>& buffer_sizes,
       std::vector<std::vector<uint8_t>>& shared_arenas);
+  /// Builds the planned memory for a method that has at least one
+  /// memory-planned buffer on a non-CPU device. load_method() only calls
+  /// this when share_memory_arenas is disabled.
+  /// @param method_meta Metadata of the method being loaded.
+  std::unique_ptr<PlannedMemory> make_planned_memory_with_devices(
+      const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta);
   runtime::Result<std::vector<size_t>> get_mem_planned_buffer_sizes(
       const std::string& method_name);
   runtime::Result<std::vector<size_t>> get_max_mem_planned_buffer_sizes();
diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl
index fa80203831a..e622b138ff6 100644
--- a/extension/module/targets.bzl
+++ b/extension/module/targets.bzl
@@ -30,6 +30,7 @@ def define_common_targets():
                 "//executorch/runtime/backend:backend_options",
                 "//executorch/runtime/backend:backend_options_map",
                 "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
+                "//executorch/runtime/core:device_memory_buffer",
             ],
         )
 
diff --git a/extension/module/test/module_device_memory_test.cpp b/extension/module/test/module_device_memory_test.cpp
new file mode 100644
index 00000000000..5031273ac2b
--- /dev/null
+++ b/extension/module/test/module_device_memory_test.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * Tests that Module's device-aware memory allocation path works correctly.
+ *
+ * Uses ModuleAddWithDevice.pte which has:
+ *   non_const_buffer_sizes: [0, 48]  (1 buffer, index 0 reserved)
+ *   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}]
+ *
+ * Since we don't have a real CUDA backend, we test that:
+ * 1. CPU-only models load through Module without invoking device allocator
+ * 2. Device-annotated models trigger DeviceMemoryBuffer::create via a mock
+ *
+ * The .pte files are located through the ET_MODULE_ADD_PATH and
+ * ET_MODULE_ADD_WITH_DEVICE_PATH environment variables.
+ */
+
+#include <executorch/extension/module/module.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+
+#include <gtest/gtest.h>
+
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/device_memory_buffer.h>
+#include <executorch/runtime/platform/runtime.h>
+
+using executorch::extension::Module;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
+using executorch::runtime::Error;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+namespace {
+
+/**
+ * Mock DeviceAllocator that reports DeviceType::CUDA but hands out host
+ * memory, recording how it was called so the tests can inspect the calls.
+ *
+ * Only one allocation is tracked at a time: allocate() replaces the buffer
+ * from any earlier call, and deallocate() releases the current buffer no
+ * matter which pointer it is given.
+ */
+class MockCudaAllocator : public DeviceAllocator {
+ public:
+  /// Records the request and returns a freshly allocated host buffer.
+  Result<void*> allocate(
+      size_t nbytes,
+      DeviceIndex index,
+      size_t alignment = kDefaultAlignment) override {
+    (void)alignment;
+    allocate_count_++;
+    last_allocate_size_ = nbytes;
+    last_allocate_index_ = index;
+    buffer_ = std::make_unique<uint8_t[]>(nbytes);
+    return static_cast<void*>(buffer_.get());
+  }
+
+  /// Counts the call and frees the buffer held by this mock. The parameters
+  /// are unnamed because they are unused, matching the copy_* overrides.
+  void deallocate(void*, DeviceIndex) override {
+    deallocate_count_++;
+    buffer_.reset();
+  }
+
+  /// No-op copy; always reports success.
+  Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  /// No-op copy; always reports success.
+  Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return DeviceType::CUDA;
+  }
+
+  // Call statistics; reset by the fixture's SetUp() before every test.
+  int allocate_count_ = 0;
+  int deallocate_count_ = 0;
+  size_t last_allocate_size_ = 0;
+  DeviceIndex last_allocate_index_ = -1;
+
+ private:
+  // Host memory standing in for the most recent device allocation.
+  std::unique_ptr<uint8_t[]> buffer_;
+};
+
+} // namespace
+
+// Single mock instance shared by all tests in this file.
+static MockCudaAllocator g_mock_cuda;
+
+/// Fixture that registers the mock CUDA allocator once per suite and clears
+/// its call statistics before each test.
+class ModuleDeviceMemoryTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(&g_mock_cuda);
+  }
+
+  void SetUp() override {
+    g_mock_cuda.allocate_count_ = 0;
+    g_mock_cuda.deallocate_count_ = 0;
+    g_mock_cuda.last_allocate_size_ = 0;
+    g_mock_cuda.last_allocate_index_ = -1;
+  }
+};
+
+TEST_F(ModuleDeviceMemoryTest, CpuOnlyModelDoesNotAllocateDeviceMemory) {
+  const char* path = std::getenv("ET_MODULE_ADD_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_PATH not set";
+
+  Module module(path);
+  auto err = module.load_method("forward");
+  ASSERT_EQ(err, Error::Ok);
+
+  EXPECT_EQ(g_mock_cuda.allocate_count_, 0)
+      << "CPU-only model should not allocate device memory";
+}
+
+TEST_F(ModuleDeviceMemoryTest, DeviceMemoryBufferCreateCallsAllocator) {
+  // Directly test DeviceMemoryBuffer::create with the registered mock.
+  // This verifies the RAII allocation/deallocation path that Module uses.
+  {
+    auto result = DeviceMemoryBuffer::create(48, DeviceType::CUDA, 0);
+    ASSERT_TRUE(result.ok());
+    auto buf = std::move(result.get());
+
+    EXPECT_EQ(g_mock_cuda.allocate_count_, 1);
+    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48);
+    EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0);
+    EXPECT_NE(buf.data(), nullptr);
+    EXPECT_EQ(buf.size(), 48);
+
+    // as_span() wraps the device pointer for HierarchicalAllocator.
+    auto span = buf.as_span();
+    EXPECT_EQ(span.data(), static_cast<uint8_t*>(buf.data()));
+    EXPECT_EQ(span.size(), 48);
+
+    EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+  }
+  // RAII deallocation on scope exit.
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1);
+}
+
+TEST_F(ModuleDeviceMemoryTest, DeviceModelMethodMetaReportsCudaBuffer) {
+  // Verify MethodMeta reports the correct device for buffers in the
+  // device-annotated model, without needing to load the full method.
+  const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+  Module module(path);
+  auto err = module.load();
+  ASSERT_EQ(err, Error::Ok);
+
+  auto meta = module.method_meta("forward");
+  ASSERT_TRUE(meta.ok());
+
+  // ModuleAddWithDevice has 1 planned buffer (48 bytes) on CUDA.
+  ASSERT_EQ(meta->num_memory_planned_buffers(), 1);
+
+  auto size = meta->memory_planned_buffer_size(0);
+  ASSERT_TRUE(size.ok());
+  EXPECT_EQ(size.get(), 48);
+
+  auto device = meta->memory_planned_buffer_device(0);
+  ASSERT_TRUE(device.ok());
+  EXPECT_EQ(device->type(), DeviceType::CUDA);
+  EXPECT_EQ(device->index(), 0);
+}
+
+TEST_F(ModuleDeviceMemoryTest, DeviceModelWithSharedArenasReturnsNotSupported) {
+  const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+  // share_memory_arenas = true with a device-annotated model should fail.
+  Module module(
+      path,
+      Module::LoadMode::File,
+      /*event_tracer=*/nullptr,
+      /*memory_allocator=*/nullptr,
+      /*temp_allocator=*/nullptr,
+      /*share_memory_arenas=*/true);
+
+  auto err = module.load_method("forward");
+  EXPECT_EQ(err, Error::NotSupported);
+}
+
+TEST_F(
+    ModuleDeviceMemoryTest,
+    LoadMethodAllocatesDeviceMemoryAndDeallocatesOnDestroy) {
+  const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+  ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";
+
+  {
+    Module module(path);
+    auto err = module.load_method("forward");
+
+    // Regardless of whether load_method succeeds or fails (e.g. due to
+    // backend init issues), the device-aware memory allocation path
+    // (make_planned_memory_with_devices) runs BEFORE backend init.
+    EXPECT_EQ(g_mock_cuda.allocate_count_, 1)
+        << "Expected 1 device allocation for the CUDA buffer"
+        << " (actual: " << g_mock_cuda.allocate_count_ << ")"
+        << ", deallocate_count=" << g_mock_cuda.deallocate_count_
+        << ", load_method returned error=" << static_cast<int>(err);
+    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48)
+        << "Expected 48 bytes allocated (3 CUDA tensors sharing one buffer)";
+    EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0)
+        << "Expected device_index=0 (cuda:0)";
+
+    if (err == Error::Ok) {
+      // Success path: MethodHolder moved into methods_ map.
+      // DeviceMemoryBuffer is alive as long as Module is alive.
+      EXPECT_EQ(g_mock_cuda.deallocate_count_, 0)
+          << "No deallocation while method is loaded";
+    } else {
+      // Error path: local MethodHolder destroyed on return from load_method.
+      // RAII deallocation already happened.
+      EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
+          << "RAII deallocation on error path";
+    }
+  }
+
+  // After Module destroyed, all device memory must be freed.
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
+      << "Expected deallocation after Module destroyed";
+}
diff --git a/extension/module/test/targets.bzl b/extension/module/test/targets.bzl
index f0d7e449efd..4dc3fb537f3 100644
--- a/extension/module/test/targets.bzl
+++ b/extension/module/test/targets.bzl
@@ -28,7 +28,7 @@ def define_common_targets(is_fbcode=False):
             aten_suffix = ("_aten" if aten_mode else "")
 
             runtime.cxx_test(
-                name = "test" + aten_suffix,
+                name = "module_test" + aten_suffix,
                 srcs = [
                     "module_test.cpp",
                 ],
@@ -68,6 +68,29 @@ def define_common_targets(is_fbcode=False):
                 ],
             )
 
+            # Device-memory tests for Module. They register a mock CUDA
+            # allocator, so no real CUDA backend is needed.
+            runtime.cxx_test(
+                name = "module_device_memory_test" + aten_suffix,
+                srcs = [
+                    "module_device_memory_test.cpp",
+                ],
+                deps = [
+                    "//executorch/kernels/portable:generated_lib" + aten_suffix,
+                    "//executorch/extension/module:module" + aten_suffix,
+                    "//executorch/runtime/core:device_allocator",
+                    "//executorch/runtime/core:device_memory_buffer",
+                ],
+                # The test reads the .pte locations from these variables.
+                env = {
+                    "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])",
+                    "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])",
+                },
+                compiler_flags = [
+                    "-Wno-error=deprecated-declarations",
+                ],
+            )
+
     runtime.filegroup(
         name = "resources",
         srcs = native.glob([
diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl
index b0545b8ce18..659a128994f 100644
--- a/shim_et/xplat/executorch/build/build_variables.bzl
+++ b/shim_et/xplat/executorch/build/build_variables.bzl
@@ -50,6 +50,8 @@ PLATFORM_SRCS = [
 
 EXECUTORCH_CORE_SRCS = sorted([
     "runtime/backend/interface.cpp",
+    "runtime/core/device_allocator.cpp",
+    "runtime/core/device_memory_buffer.cpp",
     "runtime/core/evalue.cpp",
     "runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp",
     "runtime/core/exec_aten/util/tensor_util_portable.cpp",
diff --git a/test/models/targets.bzl b/test/models/targets.bzl
index c9fb67b7d31..a80244b1383 100644
--- a/test/models/targets.bzl
+++ b/test/models/targets.bzl
@@ -226,6 +226,7 @@ def define_common_targets():
         default_outs = ["."],
         visibility = [
             "//executorch/runtime/executor/test/...",
+            "//executorch/extension/module/test/...",
         ],
     )