diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py
index a48d88fa224..5c1f2db465d 100644
--- a/exir/emit/_emitter.py
+++ b/exir/emit/_emitter.py
@@ -2073,4 +2073,9 @@ def plan(self) -> ExecutionPlan:
                 self.module.meta["non_const_buffer_sizes"],
             ),
             container_meta_type=self.container_meta_type,
+            # non_const_buffer_device is set by apply_algo in memory_planning.py
+            # when device tensors are present. None for CPU-only programs.
+            non_const_buffer_device=self.module.meta.get(
+                "non_const_buffer_device", None
+            ),
         )
diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py
index 2de8a98a2cf..4bf97f60da4 100644
--- a/exir/emit/test/test_emit.py
+++ b/exir/emit/test/test_emit.py
@@ -2643,3 +2643,186 @@ def forward(self, a, b):
             0,
             "No tensor should have extra device info when model runs entirely on CPU",
         )
+
+    def test_emit_non_const_buffer_device_populated_for_device_tensors(self) -> None:
+        """Verify that non_const_buffer_device is emitted into ExecutionPlan when
+        device-aware memory planning is enabled and non-CPU tensors are present."""
+        from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
+            generate_pattern_op_partitions,
+        )
+        from executorch.exir.backend.compile_spec_schema import CompileSpec
+        from executorch.exir.backend.partitioner import (
+            DelegationSpec,
+            Partitioner,
+            PartitionResult,
+        )
+        from executorch.exir.backend.test.backend_with_compiler_demo import (
+            BackendWithCompilerDemo,
+        )
+        from executorch.exir.passes.propagate_device_pass import (
+            TARGET_DEVICE_COMPILE_SPEC_KEY,
+        )
+        from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
+
+        class AddSupport(OperatorSupportBase):  # only edge add.Tensor is supported
+            def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+                return node.op == "call_function" and node.target in [
+                    exir_ops.edge.aten.add.Tensor,
+                ]
+
+        class DevicePartitioner(Partitioner):  # tags add partitions for cuda:0
+            def __init__(self):
+                super().__init__()
+                self.delegation_spec = DelegationSpec(
+                    BackendWithCompilerDemo.__name__,
+                    [
+                        CompileSpec("max_value", bytes([4])),
+                        CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
+                    ],
+                )
+
+            def partition(self, exported_program) -> PartitionResult:
+                partition_tags = {}
+                partition_list = generate_pattern_op_partitions(
+                    exported_program.graph_module,
+                    op_support=any_chain(AddSupport()),
+                )
+                for partition in partition_list:
+                    for node in partition.nodes:
+                        tag = f"tag{partition.id}"  # one tag per partition
+                        node.meta["delegation_tag"] = tag
+                        partition_tags[tag] = self.delegation_spec
+                return PartitionResult(
+                    tagged_exported_program=exported_program,
+                    partition_tags=partition_tags,
+                )
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        lowered = edge.to_backend(DevicePartitioner())
+        et_prog = lowered.to_executorch(
+            config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
+        )
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]  # only the first plan is checked
+        self.assertIsNotNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be set when device tensors are present "
+            "and enable_non_cpu_memory_planning is True",
+        )
+        self.assertGreater(len(plan.non_const_buffer_device), 0)  # non-empty
+        for entry in plan.non_const_buffer_device:  # each entry must be cuda:0
+            self.assertEqual(entry.device_type, schema.DeviceType.CUDA)
+            self.assertEqual(entry.device_index, 0)
+
+    def test_emit_non_const_buffer_device_none_for_cpu_only(self) -> None:
+        """When all tensors are on CPU, non_const_buffer_device should be None
+        even with enable_non_cpu_memory_planning=True."""
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        et_prog = edge.to_executorch(
+            config=ExecutorchBackendConfig(enable_non_cpu_memory_planning=True),
+        )
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]
+        self.assertIsNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be None for CPU-only programs",
+        )
+
+    def test_emit_non_const_buffer_device_none_when_flag_disabled(self) -> None:
+        """Even with device tensors, non_const_buffer_device should be None when
+        enable_non_cpu_memory_planning is False (default)."""
+        from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
+            generate_pattern_op_partitions,
+        )
+        from executorch.exir.backend.compile_spec_schema import CompileSpec
+        from executorch.exir.backend.partitioner import (
+            DelegationSpec,
+            Partitioner,
+            PartitionResult,
+        )
+        from executorch.exir.backend.test.backend_with_compiler_demo import (
+            BackendWithCompilerDemo,
+        )
+        from executorch.exir.passes.propagate_device_pass import (
+            TARGET_DEVICE_COMPILE_SPEC_KEY,
+        )
+        from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
+
+        class AddSupport(OperatorSupportBase):  # only edge add.Tensor is supported
+            def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+                return node.op == "call_function" and node.target in [
+                    exir_ops.edge.aten.add.Tensor,
+                ]
+
+        class DevicePartitioner(Partitioner):  # tags add partitions for cuda:0
+            def __init__(self):
+                super().__init__()
+                self.delegation_spec = DelegationSpec(
+                    BackendWithCompilerDemo.__name__,
+                    [
+                        CompileSpec("max_value", bytes([4])),
+                        CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
+                    ],
+                )
+
+            def partition(self, exported_program) -> PartitionResult:
+                partition_tags = {}
+                partition_list = generate_pattern_op_partitions(
+                    exported_program.graph_module,
+                    op_support=any_chain(AddSupport()),
+                )
+                for partition in partition_list:
+                    for node in partition.nodes:
+                        tag = f"tag{partition.id}"  # one tag per partition
+                        node.meta["delegation_tag"] = tag
+                        partition_tags[tag] = self.delegation_spec
+                return PartitionResult(
+                    tagged_exported_program=exported_program,
+                    partition_tags=partition_tags,
+                )
+
+        class Model(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.add(a, b)
+
+        model = Model()
+        inputs = (torch.randn(2, 2), torch.randn(2, 2))
+
+        edge = to_edge(
+            export(model, inputs),
+            compile_config=EdgeCompileConfig(_check_ir_validity=False),
+        )
+        lowered = edge.to_backend(DevicePartitioner())  # add is still delegated
+        # Default: enable_non_cpu_memory_planning=False
+        et_prog = lowered.to_executorch()
+        program = et_prog._emitter_output.program
+
+        plan = program.execution_plan[0]  # only the first plan is checked
+        self.assertIsNone(
+            plan.non_const_buffer_device,
+            "non_const_buffer_device should be None when "
+            "enable_non_cpu_memory_planning is False",
+        )
diff --git a/runtime/core/device_memory_buffer.cpp b/runtime/core/device_memory_buffer.cpp
new file mode 100644
index 00000000000..26faba39456
--- /dev/null
+++ b/runtime/core/device_memory_buffer.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_memory_buffer.h>
+
+namespace executorch::runtime {
+
+Result<DeviceMemoryBuffer> DeviceMemoryBuffer::create(
+    size_t size,
+    etensor::DeviceType type,
+    etensor::DeviceIndex index,
+    size_t alignment) {
+  DeviceAllocator* const registered = get_device_allocator(type);
+  if (!registered) {
+    ET_LOG(
+        Error,
+        "No device allocator registered for device type %d",
+        static_cast<int>(type));
+    return Error::NotFound;
+  }
+  // Hand allocation failures back to the caller unchanged.
+  auto allocation = registered->allocate(size, index, alignment);
+  if (!allocation.ok()) {
+    return allocation.error();
+  }
+  // On success the new buffer owns the pointer and frees it on destruction.
+  return DeviceMemoryBuffer(allocation.get(), size, registered, index);
+}
+
+} // namespace executorch::runtime
diff --git a/runtime/core/device_memory_buffer.h b/runtime/core/device_memory_buffer.h
new file mode 100644
index 00000000000..929460ce938
--- /dev/null
+++ b/runtime/core/device_memory_buffer.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+
+namespace executorch::runtime {
+
+/**
+ * RAII wrapper that owns a single device memory allocation.
+ *
+ * On destruction, calls DeviceAllocator::deallocate() to free the memory.
+ * This mirrors the role of std::vector<uint8_t> for CPU planned buffers,
+ * but for device memory (CUDA, etc.).
+ *
+ * Move-only: cannot be copied, but can be moved to transfer ownership.
+ */
+class DeviceMemoryBuffer final {
+ public:
+  /**
+   * Creates a DeviceMemoryBuffer by allocating device memory.
+   *
+   * Looks up the DeviceAllocator for the given device type via the
+   * DeviceAllocatorRegistry. If no allocator is registered for the type,
+   * returns Error::NotFound.
+   *
+   * @param size Number of bytes to allocate.
+   * @param type The device type (e.g., CUDA).
+   * @param index The device index (e.g., 0 for cuda:0).
+   * @param alignment Minimum alignment of the returned pointer in bytes.
+   *     Must be a power of 2. Defaults to DeviceAllocator::kDefaultAlignment.
+   * @return The buffer on success, Error::NotFound, or the allocator's error.
+   */
+  static Result<DeviceMemoryBuffer> create(
+      size_t size,
+      etensor::DeviceType type,
+      etensor::DeviceIndex index = 0,
+      size_t alignment = DeviceAllocator::kDefaultAlignment);
+
+  DeviceMemoryBuffer() = default; // Empty buffer: owns nothing.
+
+  ~DeviceMemoryBuffer() {
+    if (ptr_ != nullptr && allocator_ != nullptr) { // skip empty/moved-from
+      allocator_->deallocate(ptr_, device_index_);
+    }
+  }
+
+  // Move constructor: takes over other's allocation and leaves other empty.
+  DeviceMemoryBuffer(DeviceMemoryBuffer&& other) noexcept
+      : ptr_(other.ptr_),
+        size_(other.size_),
+        allocator_(other.allocator_),
+        device_index_(other.device_index_) {
+    other.ptr_ = nullptr; // other's destructor now frees nothing
+    other.size_ = 0;
+    other.allocator_ = nullptr;
+  }
+
+  // Move assignment: frees any allocation held here, then takes other's.
+  DeviceMemoryBuffer& operator=(DeviceMemoryBuffer&& other) noexcept {
+    if (this != &other) { // self-assignment leaves the buffer untouched
+      if (ptr_ != nullptr && allocator_ != nullptr) {
+        allocator_->deallocate(ptr_, device_index_);
+      }
+      ptr_ = other.ptr_;
+      size_ = other.size_;
+      allocator_ = other.allocator_;
+      device_index_ = other.device_index_;
+      other.ptr_ = nullptr; // other's destructor now frees nothing
+      other.size_ = 0;
+      other.allocator_ = nullptr;
+    }
+    return *this;
+  }
+
+  // Non-copyable.
+  DeviceMemoryBuffer(const DeviceMemoryBuffer&) = delete;
+  DeviceMemoryBuffer& operator=(const DeviceMemoryBuffer&) = delete;
+
+  /// Returns the device pointer, or nullptr if empty/moved-from.
+  void* data() const {
+    return ptr_;
+  }
+
+  /// Returns the allocation size in bytes, or 0 if empty/moved-from.
+  size_t size() const {
+    return size_;
+  }
+
+  /**
+   * Returns a Span<uint8_t> wrapping the device pointer.
+   *
+   * This is intended for use with HierarchicalAllocator, which only performs
+   * pointer arithmetic on the span data and never dereferences it. Device
+   * pointers are valid for pointer arithmetic from the CPU side.
+   */
+  Span<uint8_t> as_span() const {
+    return {static_cast<uint8_t*>(ptr_), size_};
+  }
+
+ private:
+  DeviceMemoryBuffer(
+      void* ptr,
+      size_t size,
+      DeviceAllocator* allocator,
+      etensor::DeviceIndex device_index)
+      : ptr_(ptr),
+        size_(size),
+        allocator_(allocator),
+        device_index_(device_index) {}
+
+  void* ptr_ = nullptr; // device pointer; nullptr when nothing is owned
+  size_t size_ = 0; // size of the allocation in bytes
+  DeviceAllocator* allocator_ = nullptr; // not owned; used to deallocate ptr_
+  etensor::DeviceIndex device_index_ = 0; // passed back to deallocate()
+};
+
+} // namespace executorch::runtime
diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl
index 78ffea3bdba..0a368bd6379 100644
--- a/runtime/core/portable_type/targets.bzl
+++ b/runtime/core/portable_type/targets.bzl
@@ -28,6 +28,7 @@ def define_common_targets():
             "//executorch/extension/fb/dynamic_shim/...",
             "//executorch/kernels/portable/cpu/...",
             "//executorch/runtime/core:device_allocator",
+            "//executorch/runtime/core/...",
             "//executorch/runtime/core/exec_aten/...",
             "//executorch/runtime/core/portable_type/test/...",
         ],
diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl
index 1a81c5af1f6..89c4dfa08c1 100644
--- a/runtime/core/targets.bzl
+++ b/runtime/core/targets.bzl
@@ -155,6 +155,17 @@ def define_common_targets():
             visibility = ["//executorch/..."],
         )
 
+    runtime.cxx_library(
+        name = "device_memory_buffer",
+        srcs = ["device_memory_buffer.cpp"],
+        exported_headers = ["device_memory_buffer.h"],
+        exported_deps = [
+            ":core",
+            ":device_allocator",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
     runtime.cxx_library(
         name = "tag",
         srcs = ["tag.cpp"],
diff --git a/runtime/core/test/device_memory_buffer_test.cpp b/runtime/core/test/device_memory_buffer_test.cpp
new file mode 100644
index 00000000000..36ecadfc2d2
--- /dev/null
+++ b/runtime/core/test/device_memory_buffer_test.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_memory_buffer.h>
+
+#include <gtest/gtest.h>
+
+#include <executorch/runtime/platform/runtime.h>
+
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
+using executorch::runtime::Error;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+/**
+ * A mock DeviceAllocator for testing DeviceMemoryBuffer.
+ * Always hands out its fixed 256-byte member buffer and records call data.
+ */
+class MockAllocator : public DeviceAllocator {
+ public:
+  explicit MockAllocator(DeviceType type) : type_(type) {}
+
+  Result<void*> allocate(
+      size_t nbytes,
+      DeviceIndex index,
+      size_t alignment = DeviceAllocator::kDefaultAlignment) override {
+    allocate_count_++;
+    last_allocate_size_ = nbytes;
+    last_allocate_alignment_ = alignment;
+    return static_cast<void*>(buffer_); // same pointer for every request
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {
+    deallocate_count_++;
+    last_deallocate_ptr_ = ptr; // recorded only; nothing is actually freed
+  }
+
+  Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    return Error::Ok; // no-op: nothing is copied
+  }
+
+  Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    return Error::Ok; // no-op: nothing is copied
+  }
+
+  DeviceType device_type() const override {
+    return type_;
+  }
+
+  int allocate_count_ = 0; // number of allocate() calls
+  int deallocate_count_ = 0; // number of deallocate() calls
+  size_t last_allocate_size_ = 0; // nbytes of the latest allocate()
+  size_t last_allocate_alignment_ = 0; // alignment of the latest allocate()
+  void* last_deallocate_ptr_ = nullptr; // ptr of the latest deallocate()
+  uint8_t buffer_[256] = {}; // backing storage returned by allocate()
+
+ private:
+  DeviceType type_;
+};
+
+// Global CUDA mock shared by all tests; registered in SetUpTestSuite().
+static MockAllocator g_mock_cuda(DeviceType::CUDA);
+
+class DeviceMemoryBufferTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(&g_mock_cuda);
+  }
+
+  void SetUp() override {
+    // Clear the mock's recorded call data so each test starts from zero.
+    g_mock_cuda.allocate_count_ = 0;
+    g_mock_cuda.deallocate_count_ = 0;
+    g_mock_cuda.last_allocate_size_ = 0;
+    g_mock_cuda.last_allocate_alignment_ = 0;
+    g_mock_cuda.last_deallocate_ptr_ = nullptr;
+  }
+};
+
+TEST_F(DeviceMemoryBufferTest, DefaultConstructedIsEmpty) {
+  DeviceMemoryBuffer empty;
+  EXPECT_EQ(empty.data(), nullptr);
+  EXPECT_EQ(empty.size(), 0);
+  // The span view of an empty buffer is empty as well.
+  auto view = empty.as_span();
+  EXPECT_EQ(view.data(), nullptr);
+  EXPECT_EQ(view.size(), 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateAllocatesAndDestructorDeallocates) {
+  {
+    auto created = DeviceMemoryBuffer::create(1024, DeviceType::CUDA, 0);
+    ASSERT_TRUE(created.ok());
+    // Take the buffer out of the Result; it lives until this scope ends.
+    DeviceMemoryBuffer buffer(std::move(created.get()));
+    EXPECT_NE(buffer.data(), nullptr);
+    EXPECT_EQ(buffer.size(), 1024);
+    EXPECT_EQ(g_mock_cuda.allocate_count_, 1);
+    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 1024);
+    EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+  }
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_deallocate_ptr_, g_mock_cuda.buffer_);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateFailsWithNoRegisteredAllocator) {
+  auto created = DeviceMemoryBuffer::create(512, DeviceType::CPU, 0);
+  EXPECT_FALSE(created.ok());
+  EXPECT_EQ(created.error(), Error::NotFound);
+}
+
+TEST_F(DeviceMemoryBufferTest, MoveConstructorTransfersOwnership) {
+  auto created = DeviceMemoryBuffer::create(256, DeviceType::CUDA, 0);
+  ASSERT_TRUE(created.ok());
+  DeviceMemoryBuffer source(std::move(created.get()));
+  void* const device_ptr = source.data();
+  // Move-construct a second buffer from the first.
+  DeviceMemoryBuffer destination(std::move(source));
+  // The source must be emptied and nothing freed by the transfer.
+  EXPECT_EQ(source.data(), nullptr);
+  EXPECT_EQ(source.size(), 0);
+  EXPECT_EQ(destination.data(), device_ptr);
+  EXPECT_EQ(destination.size(), 256);
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, MoveAssignmentTransfersOwnership) {
+  auto created = DeviceMemoryBuffer::create(128, DeviceType::CUDA, 0);
+  ASSERT_TRUE(created.ok());
+  DeviceMemoryBuffer source(std::move(created.get()));
+  void* const device_ptr = source.data();
+  // Assign into an empty buffer, so there is nothing to release first.
+  DeviceMemoryBuffer destination;
+  destination = std::move(source);
+  // Ownership moved: source is empty and no deallocation happened.
+  EXPECT_EQ(source.data(), nullptr);
+  EXPECT_EQ(destination.data(), device_ptr);
+  EXPECT_EQ(destination.size(), 128);
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, DestructorNoOpForDefaultConstructed) {
+  { DeviceMemoryBuffer never_allocated; }
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, AsSpanWrapsDevicePointer) {
+  auto created = DeviceMemoryBuffer::create(2048, DeviceType::CUDA, 0);
+  ASSERT_TRUE(created.ok());
+  DeviceMemoryBuffer buffer(std::move(created.get()));
+  // The span must cover exactly the allocation: same pointer, same length.
+  auto view = buffer.as_span();
+  EXPECT_EQ(view.data(), static_cast<uint8_t*>(buffer.data()));
+  EXPECT_EQ(view.size(), 2048);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateUsesDefaultAlignmentWhenUnspecified) {
+  auto created = DeviceMemoryBuffer::create(1024, DeviceType::CUDA, 0);
+  ASSERT_TRUE(created.ok());
+  EXPECT_EQ(
+      g_mock_cuda.last_allocate_alignment_, DeviceAllocator::kDefaultAlignment);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateForwardsCustomAlignmentToAllocator) {
+  constexpr size_t kRequestedAlignment = 512;
+  auto created = DeviceMemoryBuffer::create(
+      1024, DeviceType::CUDA, 0, kRequestedAlignment);
+  ASSERT_TRUE(created.ok());
+  EXPECT_EQ(g_mock_cuda.last_allocate_alignment_, kRequestedAlignment);
+}
diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl
index 1adb75f6e82..87019909a9f 100644
--- a/runtime/core/test/targets.bzl
+++ b/runtime/core/test/targets.bzl
@@ -7,6 +7,14 @@ def define_common_targets():
     TARGETS and BUCK files that call this function.
     """
 
+    runtime.cxx_test(
+        name = "device_memory_buffer_test",
+        srcs = ["device_memory_buffer_test.cpp"],
+        deps = [
+            "//executorch/runtime/core:device_memory_buffer",
+        ],
+    )
+
     runtime.cxx_test(
         name = "span_test",
         srcs = ["span_test.cpp"],
diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp
index ba73828378a..0af97b3fdf3 100644
--- a/runtime/executor/method_meta.cpp
+++ b/runtime/executor/method_meta.cpp
@@ -364,6 +364,42 @@ Result<int64_t> MethodMeta::memory_planned_buffer_size(size_t index) const {
   return size;
 }
 
+Result<etensor::Device> MethodMeta::memory_planned_buffer_device(
+    size_t index) const {
+  const size_t buffer_count = num_memory_planned_buffers();
+  ET_CHECK_OR_RETURN_ERROR(
+      index < buffer_count,
+      InvalidArgument,
+      "index %zu out of range. num_buffers: %zu",
+      index,
+      buffer_count);
+
+  // non_const_buffer_device is an optional field that is only written when
+  // the program has non-CPU buffers. For CPU-only programs (or legacy PTE
+  // files) it is null, and every planned buffer is reported as CPU.
+  const auto* device_entries = s_plan_->non_const_buffer_device();
+  if (!device_entries) {
+    return etensor::Device{etensor::DeviceType::CPU, 0};
+  }
+
+  // Entries exist only for non-CPU buffers, and each buffer_idx follows the
+  // numbering of non_const_buffer_sizes, where slot 0 is reserved. Callers
+  // pass a 0-based index, so shift it by one before matching.
+  const int32_t planned_idx = static_cast<int32_t>(index + 1);
+  for (size_t pos = 0; pos < device_entries->size(); ++pos) {
+    const auto* device_entry = device_entries->Get(pos);
+    if (device_entry->buffer_idx() != planned_idx) {
+      continue;
+    }
+    return etensor::Device{
+        static_cast<etensor::DeviceType>(device_entry->device_type()),
+        static_cast<etensor::DeviceIndex>(device_entry->device_index())};
+  }
+
+  // No entry matched, so this buffer is on CPU.
+  return etensor::Device{etensor::DeviceType::CPU, 0};
+}
+
 bool MethodMeta::uses_backend(const char* backend_name) const {
   ET_CHECK_MSG(backend_name, "backend name is null");
   const auto delegates = s_plan_->delegates();
diff --git a/runtime/executor/method_meta.h b/runtime/executor/method_meta.h
index 79fd05c28ee..e0fa16cda22 100644
--- a/runtime/executor/method_meta.h
+++ b/runtime/executor/method_meta.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/portable_type/device.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/core/span.h>
 #include <executorch/runtime/core/tag.h>
@@ -234,6 +235,19 @@ class MethodMeta final {
    */
   Result<int64_t> memory_planned_buffer_size(size_t index) const;
 
+  /**
+   * Get the device placement for the specified memory-planned buffer.
+   *
+   * For CPU-only programs (no non_const_buffer_device in the PTE), all buffers
+   * default to Device{CPU, 0}. For programs with device annotations, returns
+   * the annotated device type and index, or Device{CPU, 0} if not annotated.
+   *
+   * @param[in] index The index of the buffer to look up (0-based, same
+   *     indexing as memory_planned_buffer_size()).
+   * @returns The Device, or Error::InvalidArgument if index is out of range.
+   */
+  Result<etensor::Device> memory_planned_buffer_device(size_t index) const;
+
   /**
    * Check to see if a backend is used in this method.
    *
diff --git a/runtime/executor/test/method_meta_test.cpp b/runtime/executor/test/method_meta_test.cpp
index e4ef2e72a85..3e6e09cc8c3 100644
--- a/runtime/executor/test/method_meta_test.cpp
+++ b/runtime/executor/test/method_meta_test.cpp
@@ -74,6 +74,10 @@ class MethodMetaTest : public ::testing::Test {
   void SetUp() override {
     load_program(std::getenv("ET_MODULE_ADD_PATH"), "add");
     load_program(std::getenv("ET_MODULE_STATEFUL_PATH"), "stateful");
+    const char* device_path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
+    if (device_path != nullptr) { // optional program: loaded only when set
+      load_program(device_path, "add_with_device");
+    }
   }
 
  private:
@@ -192,6 +196,27 @@ TEST_F(MethodMetaTest, MethodMetaAttribute) {
   ASSERT_EQ(bad_access.error(), Error::InvalidArgument);
 }
 
+TEST_F(MethodMetaTest, MemoryPlannedBufferDeviceDefaultsCpu) {
+  Result<MethodMeta> meta = programs_["add"]->method_meta("forward");
+  ASSERT_EQ(meta.error(), Error::Ok);
+
+  // The "add" program is CPU-only, so every planned buffer must report CPU.
+  const size_t buffer_count = meta->num_memory_planned_buffers();
+  ASSERT_GT(buffer_count, 0);
+
+  for (size_t idx = 0; idx < buffer_count; ++idx) {
+    auto placed = meta->memory_planned_buffer_device(idx);
+    ASSERT_TRUE(placed.ok());
+    EXPECT_EQ(placed->type(), executorch::runtime::etensor::DeviceType::CPU);
+    EXPECT_EQ(placed->index(), 0);
+  }
+
+  // One past the last valid index must be rejected.
+  EXPECT_EQ(
+      meta->memory_planned_buffer_device(buffer_count).error(),
+      Error::InvalidArgument);
+}
+
 TEST_F(MethodMetaTest, TensorInfoSizeOverflow) {
   // Create sizes that will cause overflow when multiplied
   std::vector<int32_t> overflow_sizes = {
@@ -214,3 +239,30 @@ TEST_F(MethodMetaTest, TensorInfoSizeOverflow) {
           executorch::aten::string_view{nullptr, 0}),
       "");
 }
+
+TEST_F(MethodMetaTest, MethodMetaBufferDeviceReturnsCudaForDeviceBuffer) {
+  ASSERT_NE(programs_.find("add_with_device"), programs_.end())
+      << "ET_MODULE_ADD_WITH_DEVICE_PATH env var not set";
+  Result<MethodMeta> method_meta =
+      programs_["add_with_device"]->method_meta("forward");
+  ASSERT_EQ(method_meta.error(), Error::Ok);
+
+  // ModuleAddWithDevice exports with enable_non_cpu_memory_planning=True.
+  // The model delegates add(a,b) to CUDA, producing:
+  //   non_const_buffer_sizes: [0, 48]  (index 0 reserved)
+  //   non_const_buffer_device:
+  //       [{buffer_idx=1, device_type=CUDA, device_index=0}]
+  // So there is exactly 1 planned buffer (user-facing index 0), on CUDA.
+  ASSERT_EQ(method_meta->num_memory_planned_buffers(), 1);
+
+  // User-facing buffer 0 (internal buffer_idx 1) should be on CUDA device 0.
+  auto device = method_meta->memory_planned_buffer_device(0);
+  ASSERT_TRUE(device.ok());
+  EXPECT_EQ(device->type(), executorch::runtime::etensor::DeviceType::CUDA);
+  EXPECT_EQ(device->index(), 0);
+
+  // Index 1 is past the only planned buffer, so the lookup must fail.
+  EXPECT_EQ(
+      method_meta->memory_planned_buffer_device(1).error(),
+      Error::InvalidArgument);
+}
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
index f4534aefdea..74ea9a8262d 100644
--- a/runtime/executor/test/targets.bzl
+++ b/runtime/executor/test/targets.bzl
@@ -178,7 +178,12 @@ def define_common_targets(is_fbcode = False):
                 "//executorch/runtime/executor:program",
                 "//executorch/extension/data_loader:file_data_loader",
             ],
-            env = modules_env,
+            env = dict(
+                modules_env,
+                **{
+                    "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])",
+                }
+            ),
         )
 
         runtime.cxx_test(
diff --git a/test/models/export_program_with_device_info.py b/test/models/export_program_with_device_info.py
index 8666f298640..3b6af55c6e8 100644
--- a/test/models/export_program_with_device_info.py
+++ b/test/models/export_program_with_device_info.py
@@ -99,7 +99,12 @@ def main() -> None:
         compile_config=EdgeCompileConfig(_check_ir_validity=False),
     )
     lowered = edge.to_backend(_DeviceAwarePartitioner())
-    et_prog = lowered.to_executorch(ExecutorchBackendConfig(emit_stacktrace=False))
+    et_prog = lowered.to_executorch(
+        ExecutorchBackendConfig(  # type: ignore[call-arg]
+            emit_stacktrace=False,
+            enable_non_cpu_memory_planning=True,
+        )
+    )
 
     os.makedirs(args.outdir, exist_ok=True)
     outfile = os.path.join(args.outdir, "ModuleAddWithDevice.pte")