diff --git a/runtime/core/hierarchical_allocator.h b/runtime/core/hierarchical_allocator.h
index d41b98f69d5..00eb5b3a089 100644
--- a/runtime/core/hierarchical_allocator.h
+++ b/runtime/core/hierarchical_allocator.h
@@ -12,6 +12,7 @@
 #include <c10/util/safe_numerics.h>
 
 #include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/core/portable_type/device.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/core/span.h>
 
@@ -34,6 +35,30 @@ class HierarchicalAllocator final {
   explicit HierarchicalAllocator(Span<Span<uint8_t>> buffers)
       : buffers_(buffers) {}
 
+  /**
+   * Constructs a new hierarchical allocator with per-buffer device metadata.
+   *
+   * @param[in] buffers The underlying buffers, as in the single-arg
+   *     constructor. May contain a mix of CPU and device pointers —
+   *     HierarchicalAllocator only does pointer arithmetic, so both are valid.
+   * @param[in] planned_buffer_devices One `Device` (type + index) per buffer;
+   *     the count must equal `buffers.size()` (enforced by the check below).
+   *     Different buffers can target the same device type but different
+   *     indices (e.g., `cuda:0` vs `cuda:1`). For CPU-only programs, use the
+   *     single-arg constructor instead.
+   */
+  HierarchicalAllocator(
+      Span<Span<uint8_t>> buffers,
+      Span<const etensor::Device> planned_buffer_devices)
+      : buffers_(buffers), planned_buffer_devices_(planned_buffer_devices) {
+    ET_CHECK_MSG(
+        planned_buffer_devices.size() == buffers.size(),
+        "planned_buffer_devices size (%" ET_PRIsize_t
+        ") must match buffers size (%" ET_PRIsize_t ")",
+        planned_buffer_devices.size(),
+        buffers.size());
+  }
+
   /**
    * DEPRECATED: Use spans instead.
    */
@@ -88,6 +113,17 @@ class HierarchicalAllocator final {
     return buffer.data() + offset_bytes;
   }
 
+  /**
+   * Returns the per-buffer device metadata exactly as passed to the two-arg
+   * constructor: one entry per buffer, same count as `buffers`. Each entry
+   * is a `Device` carrying both type and index, so callers can distinguish
+   * e.g. `cuda:0` from `cuda:1`. Empty if the allocator was built without
+   * device metadata (CPU-only program).
+   */
+  Span<const etensor::Device> planned_buffer_devices() const {
+    return planned_buffer_devices_;
+  }
+
  private:
   // TODO(T162089316): Remove the span array and to_spans once all users move to
   // spans. This array is necessary to hold the pointers and sizes that were
@@ -113,6 +149,10 @@ class HierarchicalAllocator final {
 
   /// The underlying buffers.
   Span<Span<uint8_t>> buffers_;
+
+  /// Per-buffer device metadata. Empty when no device info was provided
+  /// (CPU-only program).
+  Span<const etensor::Device> planned_buffer_devices_;
 };
 
 } // namespace runtime
diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl
index 89c4dfa08c1..05beb198a61 100644
--- a/runtime/core/targets.bzl
+++ b/runtime/core/targets.bzl
@@ -77,6 +77,7 @@ def define_common_targets():
         ],
         exported_deps = [
             ":core",
+            "//executorch/runtime/core/exec_aten:lib",
             "//executorch/runtime/core/portable_type/c10/c10:c10",
         ],
         visibility = ["PUBLIC"],
diff --git a/runtime/core/test/hierarchical_allocator_test.cpp b/runtime/core/test/hierarchical_allocator_test.cpp
index e25e1eb8335..245c4b7b0e5 100644
--- a/runtime/core/test/hierarchical_allocator_test.cpp
+++ b/runtime/core/test/hierarchical_allocator_test.cpp
@@ -10,8 +10,10 @@
 
 #include <executorch/runtime/core/hierarchical_allocator.h>
 #include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/core/portable_type/device.h>
 #include <executorch/runtime/core/span.h>
 #include <executorch/runtime/platform/runtime.h>
+#include <executorch/test/utils/DeathTest.h>
 #include <executorch/test/utils/alignment.h>
 
 #include <gtest/gtest.h>
@@ -22,6 +24,8 @@ using executorch::runtime::HierarchicalAllocator;
 using executorch::runtime::MemoryAllocator;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
+using executorch::runtime::etensor::Device;
+using executorch::runtime::etensor::DeviceType;
 
 class HierarchicalAllocatorTest : public ::testing::Test {
  protected:
@@ -87,6 +91,67 @@ TEST_F(HierarchicalAllocatorTest, Smoke) {
   }
 }
 
+TEST_F(HierarchicalAllocatorTest, NoDeviceMetadataByDefault) {
+  // The single-arg constructor must leave the device metadata span empty.
+  HierarchicalAllocator allocator(Span<Span<uint8_t>>{});
+  auto devices = allocator.planned_buffer_devices();
+  EXPECT_EQ(devices.size(), 0);
+}
+
+TEST_F(HierarchicalAllocatorTest, ExposesDeviceMetadataWhenProvided) {
+  // Four small host buffers, so the span count matches the device count.
+  constexpr size_t kNumBuffers = 4;
+  uint8_t storage[kNumBuffers][4];
+  Span<uint8_t> spans[kNumBuffers]{
+      {storage[0], sizeof(storage[0])},
+      {storage[1], sizeof(storage[1])},
+      {storage[2], sizeof(storage[2])},
+      {storage[3], sizeof(storage[3])},
+  };
+
+  // CPU buffers come first because the runtime always sets up host-side
+  // planned memory before any device buffers. The two CUDA entries use
+  // distinct device indices to verify per-buffer index tracking.
+  Device planned_devices[] = {
+      Device(DeviceType::CPU, 0),
+      Device(DeviceType::CPU, 0),
+      Device(DeviceType::CUDA, 0),
+      Device(DeviceType::CUDA, 1),
+  };
+
+  HierarchicalAllocator allocator(
+      Span<Span<uint8_t>>(spans, kNumBuffers),
+      Span<const Device>(planned_devices, kNumBuffers));
+
+  // Read the metadata back once and compare it entry by entry.
+  auto reported = allocator.planned_buffer_devices();
+  ASSERT_EQ(reported.size(), kNumBuffers);
+  EXPECT_EQ(reported[0], Device(DeviceType::CPU, 0));
+  EXPECT_EQ(reported[1], Device(DeviceType::CPU, 0));
+  EXPECT_EQ(reported[2], Device(DeviceType::CUDA, 0));
+  EXPECT_EQ(reported[3], Device(DeviceType::CUDA, 1));
+}
+
+TEST_F(HierarchicalAllocatorTest, MismatchedDeviceCountAborts) {
+  constexpr size_t n_buffers = 2;
+  constexpr size_t n_devices = 3;
+  uint8_t storage[n_buffers][4];
+  Span<uint8_t> buffers[n_buffers]{
+      {storage[0], sizeof(storage[0])},
+      {storage[1], sizeof(storage[1])},
+  };
+
+  // One more device entry than buffers: the constructor's check must abort.
+  Device devices[n_devices] = {
+      Device(DeviceType::CPU, 0),
+      Device(DeviceType::CPU, 0),
+      Device(DeviceType::CUDA, 0),
+  };
+  Span<const Device> device_span(devices, n_devices);
+
+  ET_EXPECT_DEATH(HierarchicalAllocator({buffers, n_buffers}, device_span), "");
+}
+
 // TODO(T162089316): Tests the deprecated API. Remove this when removing the
 // API.
 TEST_F(HierarchicalAllocatorTest, DEPRECATEDSmoke) {
diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl
index 87019909a9f..4d865df425d 100644
--- a/runtime/core/test/targets.bzl
+++ b/runtime/core/test/targets.bzl
@@ -108,6 +108,7 @@ def define_common_targets():
         ],
         deps = [
             "//executorch/runtime/core:memory_allocator",
+            "//executorch/test/utils:utils",
         ],
     )
 
diff --git a/runtime/executor/memory_manager.h b/runtime/executor/memory_manager.h
index 42edd9f0bea..bc89adcda79 100644
--- a/runtime/executor/memory_manager.h
+++ b/runtime/executor/memory_manager.h
@@ -10,6 +10,8 @@
 
 #include <executorch/runtime/core/hierarchical_allocator.h>
 #include <executorch/runtime/core/memory_allocator.h>
+#include <executorch/runtime/core/portable_type/device.h>
+#include <executorch/runtime/core/span.h>
 
 namespace executorch {
 namespace runtime {
@@ -42,7 +44,8 @@ class MemoryManager final {
    *     must agree with the corresponding
    *     `MethodMeta::num_memory_planned_buffers()` and
    *     `MethodMeta::memory_planned_buffer_size(N)` values, which are embedded
-   *     in the Program.
+   *     in the Program. For device-aware programs, the HierarchicalAllocator
+   *     also carries the per-buffer device metadata (as a non-owning span).
    * @param[in] temp_allocator The allocator to use when allocating temporary
    *     data during kernel or delegate execution. Must outlive the Method that
    *     uses it. May be `nullptr` if the Method does not use kernels or
@@ -105,6 +108,29 @@ class MemoryManager final {
     return temp_allocator_;
   }
 
+  /**
+   * Exposes the per-buffer device metadata held by `planned_memory`, as
+   * reported by `HierarchicalAllocator::planned_buffer_devices()`: one entry
+   * per planned memory buffer.
+   *
+   * @returns An empty span when `planned_memory` is null or no device
+   *     metadata was provided (CPU-only program).
+   */
+  Span<const etensor::Device> planned_buffer_devices() const {
+    using DeviceSpan = Span<const etensor::Device>;
+    return planned_memory_ == nullptr
+        ? DeviceSpan{}
+        : planned_memory_->planned_buffer_devices();
+  }
+
+  /**
+   * Reports whether per-buffer device metadata is attached; false means the
+   * memory setup is CPU-only. NOTE(review): an all-CPU span yields true too.
+   */
+  bool has_device_memory() const {
+    return planned_buffer_devices().size() != 0;
+  }
+
  private:
   MemoryAllocator* method_allocator_;
   HierarchicalAllocator* planned_memory_;
diff --git a/runtime/executor/test/memory_manager_test.cpp b/runtime/executor/test/memory_manager_test.cpp
index 0e1feb47793..edbbbde343a 100644
--- a/runtime/executor/test/memory_manager_test.cpp
+++ b/runtime/executor/test/memory_manager_test.cpp
@@ -17,6 +17,9 @@ using namespace ::testing;
 using executorch::runtime::HierarchicalAllocator;
 using executorch::runtime::MemoryAllocator;
 using executorch::runtime::MemoryManager;
+using executorch::runtime::Span;
+using executorch::runtime::etensor::Device;
+using executorch::runtime::etensor::DeviceType;
 
 TEST(MemoryManagerTest, MinimalCtor) {
   MemoryAllocator method_allocator(0, nullptr);
@@ -93,3 +96,64 @@ TEST(MemoryManagerTest, CtorWithSameAllocator) {
           /*temp_allocator=*/&method_allocator),
       "cannot be the same");
 }
+
+TEST(MemoryManagerTest, ThreeArgCtorHasNoDeviceMemory) {
+  // planned_memory is built with the single-arg ctor: no device metadata.
+  MemoryAllocator method_allocator(0, nullptr);
+  MemoryAllocator temp_allocator(0, nullptr);
+  HierarchicalAllocator planned_memory({});
+  MemoryManager mm(&method_allocator, &planned_memory, &temp_allocator);
+  auto devices = mm.planned_buffer_devices();
+  EXPECT_EQ(devices.size(), 0);
+  EXPECT_FALSE(mm.has_device_memory());
+}
+
+TEST(MemoryManagerTest, DelegatesDeviceMetadataToHierarchicalAllocator) {
+  // 4 buffers: cpu:0, cpu:0, cuda:0, cuda:1. CPU buffers come first because
+  // the runtime always sets up host-side planned memory before any device
+  // buffers. The two CUDA entries use distinct indices to verify per-buffer
+  // index tracking.
+  constexpr size_t kNumBuffers = 4;
+  uint8_t storage[kNumBuffers][4];
+  Span<uint8_t> spans[kNumBuffers]{
+      {storage[0], sizeof(storage[0])},
+      {storage[1], sizeof(storage[1])},
+      {storage[2], sizeof(storage[2])},
+      {storage[3], sizeof(storage[3])},
+  };
+  Device planned_devices[] = {
+      Device(DeviceType::CPU, 0),
+      Device(DeviceType::CPU, 0),
+      Device(DeviceType::CUDA, 0),
+      Device(DeviceType::CUDA, 1),
+  };
+
+  // The device metadata lives on the HierarchicalAllocator; MemoryManager
+  // only forwards it.
+  HierarchicalAllocator planned_memory(
+      Span<Span<uint8_t>>(spans, kNumBuffers),
+      Span<const Device>(planned_devices, kNumBuffers));
+  MemoryAllocator method_allocator(0, nullptr);
+  MemoryAllocator temp_allocator(0, nullptr);
+  MemoryManager mm(&method_allocator, &planned_memory, &temp_allocator);
+
+  EXPECT_EQ(mm.method_allocator(), &method_allocator);
+  EXPECT_EQ(mm.planned_memory(), &planned_memory);
+  EXPECT_EQ(mm.temp_allocator(), &temp_allocator);
+  EXPECT_TRUE(mm.has_device_memory());
+  auto reported = mm.planned_buffer_devices();
+  EXPECT_EQ(reported.size(), kNumBuffers);
+  EXPECT_EQ(reported[0], Device(DeviceType::CPU, 0));
+  EXPECT_EQ(reported[1], Device(DeviceType::CPU, 0));
+  EXPECT_EQ(reported[2], Device(DeviceType::CUDA, 0));
+  EXPECT_EQ(reported[3], Device(DeviceType::CUDA, 1));
+}
+
+TEST(MemoryManagerTest, MinimalCtorHasNoDeviceMemory) {
+  // Only a method allocator is supplied, so no device metadata is expected.
+  MemoryAllocator method_allocator(0, nullptr);
+  MemoryManager mm(&method_allocator);
+  auto devices = mm.planned_buffer_devices();
+  EXPECT_EQ(devices.size(), 0);
+  EXPECT_FALSE(mm.has_device_memory());
+}
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
index 74ea9a8262d..32baa63a76b 100644
--- a/runtime/executor/test/targets.bzl
+++ b/runtime/executor/test/targets.bzl
@@ -19,6 +19,7 @@ def define_common_targets(is_fbcode = False):
                 "//executorch/exir/backend/test/...",
                 "//executorch/runtime/backend/...",
                 "//executorch/extension/pybindings/...",
+                "//executorch/extension/module/test/...",
                 "//executorch/devtools/fb/runners/...",
                 "//executorch/test/...",
                 "//executorch/examples/...",
@@ -326,6 +327,8 @@ def define_common_targets(is_fbcode = False):
             deps = [
                 ":managed_memory_manager",
                 "//executorch/runtime/executor:program",
+                "//executorch/runtime/core:device_allocator",
+                "//executorch/runtime/core:device_memory_buffer",
                 "//executorch/extension/data_loader:file_data_loader",
                 "//executorch/schema:program",
             ],
diff --git a/runtime/executor/test/tensor_parser_device_test.cpp b/runtime/executor/test/tensor_parser_device_test.cpp
index cdcc4f3e517..3cd5570b42b 100644
--- a/runtime/executor/test/tensor_parser_device_test.cpp
+++ b/runtime/executor/test/tensor_parser_device_test.cpp
@@ -17,17 +17,31 @@
 #include <executorch/runtime/executor/tensor_parser.h>
 
 #include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/device_memory_buffer.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/executor/test/managed_memory_manager.h>
+#include <executorch/runtime/platform/runtime.h>
 #include <executorch/schema/program_generated.h>
 
 #include <gtest/gtest.h>
 
 using executorch::aten::Tensor;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
 using executorch::runtime::Error;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::HierarchicalAllocator;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::MethodMeta;
 using executorch::runtime::Program;
+using executorch::runtime::register_device_allocator;
 using executorch::runtime::Result;
+using executorch::runtime::Span;
 using executorch::runtime::deserialization::parseTensor;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
 using executorch::runtime::testing::ManagedMemoryManager;
 using torch::executor::util::FileDataLoader;
 
@@ -50,8 +64,72 @@ class ProgramTestFriend final {
 
 using executorch::runtime::testing::ProgramTestFriend;
 
+namespace {
+
+/**
+ * Mock CUDA allocator that hands out host memory, for tests only. It holds a
+ * single buffer at a time (allocate() replaces it, deallocate() frees it) and
+ * exposes its range so tests can check that a data_ptr is "device" memory.
+ */
+class MockCudaAllocator : public DeviceAllocator {
+ public:
+  Result<void*> allocate(
+      size_t nbytes,
+      DeviceIndex index,
+      size_t alignment = kDefaultAlignment) override {
+    (void)alignment; // the mock does not act on the requested alignment
+    (void)index;
+    allocate_count_++;
+    buffer_ = std::make_unique<uint8_t[]>(nbytes);
+    buffer_size_ = nbytes;
+    return static_cast<void*>(buffer_.get());
+  }
+
+  void deallocate(void* /*ptr*/, DeviceIndex /*index*/) override {
+    deallocate_count_++;
+    buffer_.reset(); // frees the single buffer whatever pointer was passed
+    buffer_size_ = 0;
+  }
+
+  Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok; // no-op: nothing is copied
+  }
+
+  Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
+    return Error::Ok; // no-op: nothing is copied
+  }
+
+  DeviceType device_type() const override {
+    return DeviceType::CUDA;
+  }
+
+  bool is_device_ptr(const void* ptr) const {
+    if (buffer_ == nullptr || buffer_size_ == 0) {
+      return false; // nothing is currently allocated
+    }
+    auto* p = static_cast<const uint8_t*>(ptr);
+    return p >= buffer_.get() && p < buffer_.get() + buffer_size_;
+  }
+
+  int allocate_count_ = 0; // number of allocate() calls
+  int deallocate_count_ = 0; // number of deallocate() calls
+
+ private:
+  std::unique_ptr<uint8_t[]> buffer_;
+  size_t buffer_size_ = 0;
+};
+
+} // namespace
+
+static MockCudaAllocator g_mock_cuda;
+
 class TensorParserDeviceTest : public ::testing::Test {
  protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(&g_mock_cuda);
+  }
+
   void SetUp() override {
     const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
     ASSERT_NE(path, nullptr)
@@ -59,6 +137,9 @@ class TensorParserDeviceTest : public ::testing::Test {
     Result<FileDataLoader> loader = FileDataLoader::from(path);
     ASSERT_EQ(loader.error(), Error::Ok);
     loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));
+
+    g_mock_cuda.allocate_count_ = 0;
+    g_mock_cuda.deallocate_count_ = 0;
   }
 
   std::unique_ptr<FileDataLoader> loader_;
@@ -167,3 +248,96 @@ TEST_F(TensorParserDeviceTest, NonDelegatedTensorsDefaultToCPU) {
         << " without device annotation should have device_index=0";
   }
 }
+TEST_F(TensorParserDeviceTest, CudaTensorDataPtrPointsToDeviceMemory) {
+  Result<Program> program =
+      Program::load(loader_.get(), Program::Verification::Minimal);
+  ASSERT_EQ(program.error(), Error::Ok);
+
+  Result<MethodMeta> method_meta = program->method_meta("forward");
+  ASSERT_EQ(method_meta.error(), Error::Ok);
+
+  // ModuleAddWithDevice has:
+  //   non_const_buffer_sizes: [0, 48]  (index 0 reserved, buffer 0 = 48 bytes)
+  //   non_const_buffer_device: [{buffer_idx=1, device_type=CUDA}]
+  const size_t num_buffers = method_meta->num_memory_planned_buffers();
+  ASSERT_EQ(num_buffers, 1);
+
+  // Back every planned buffer with host memory or a DeviceMemoryBuffer.
+  std::vector<Span<uint8_t>> planned_spans;
+  std::vector<std::vector<uint8_t>> cpu_buffers;
+  std::vector<DeviceMemoryBuffer> device_buffers;
+
+  for (size_t i = 0; i < num_buffers; ++i) {
+    auto size = method_meta->memory_planned_buffer_size(i);
+    ASSERT_TRUE(size.ok());
+    auto device = method_meta->memory_planned_buffer_device(i);
+    ASSERT_TRUE(device.ok());
+
+    if (device->is_cpu()) {
+      cpu_buffers.emplace_back(size.get());
+      planned_spans.emplace_back(
+          cpu_buffers.back().data(), cpu_buffers.back().size());
+    } else {
+      cpu_buffers.emplace_back(); // empty placeholder for a non-CPU buffer
+      auto dmb = DeviceMemoryBuffer::create(
+          size.get(), device->type(), device->index());
+      ASSERT_TRUE(dmb.ok())
+          << "DeviceMemoryBuffer::create failed for buffer " << i;
+      planned_spans.emplace_back(dmb->as_span());
+      device_buffers.push_back(std::move(dmb.get()));
+    }
+  }
+
+  ASSERT_EQ(g_mock_cuda.allocate_count_, 1);
+
+  // Only the spans are passed here; no per-buffer device metadata is given.
+  HierarchicalAllocator planned_memory(
+      {planned_spans.data(), planned_spans.size()});
+
+  constexpr size_t kMethodAllocBytes = 32 * 1024U;
+  auto method_alloc_pool = std::make_unique<uint8_t[]>(kMethodAllocBytes);
+  MemoryAllocator method_allocator(kMethodAllocBytes, method_alloc_pool.get());
+  MemoryManager memory_manager(&method_allocator, &planned_memory);
+
+  // Parse every Tensor value; CUDA ones must point into the mock's buffer.
+  const executorch_flatbuffer::Program* internal_program =
+      ProgramTestFriend::GetInternalProgram(&program.get());
+  auto* execution_plan =
+      internal_program->execution_plan()->GetMutableObject(0);
+  auto* flatbuffer_values = execution_plan->values();
+
+  int cuda_with_device_memory = 0; // CUDA tensors checked in the loop below
+
+  for (size_t i = 0; i < flatbuffer_values->size(); ++i) {
+    auto* serialization_value = flatbuffer_values->Get(i);
+    if (serialization_value->val_type() !=
+        executorch_flatbuffer::KernelTypes::Tensor) {
+      continue;
+    }
+
+    auto* s_tensor = serialization_value->val_as_Tensor();
+    bool is_cuda = s_tensor->extra_tensor_info() != nullptr &&
+        s_tensor->extra_tensor_info()->device_type() ==
+            executorch_flatbuffer::DeviceType::CUDA;
+
+    Result<Tensor> tensor =
+        parseTensor(&program.get(), &memory_manager, s_tensor);
+    ASSERT_TRUE(tensor.ok())
+        << "parseTensor failed at index " << i << " with error 0x" << std::hex
+        << static_cast<uint32_t>(tensor.error());
+
+    Tensor t = tensor.get();
+
+    if (is_cuda && t.unsafeGetTensorImpl()->device_type() == DeviceType::CUDA) {
+      EXPECT_TRUE(g_mock_cuda.is_device_ptr(t.const_data_ptr()))
+          << "CUDA tensor at index " << i
+          << " should have data_ptr in device memory, but got CPU memory";
+      cuda_with_device_memory++;
+    }
+  }
+
+  // All 3 CUDA tensors (2 inputs + 1 output of the delegate) should have
+  // their data_ptr pointing to the mock device memory buffer.
+  EXPECT_EQ(cuda_with_device_memory, 3)
+      << "All 3 CUDA tensors should have data_ptr in device memory";
+}