diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl
index c8bf2847dcf..6a5c40f9857 100644
--- a/extension/tensor/targets.bzl
+++ b/extension/tensor/targets.bzl
@@ -24,6 +24,7 @@ def define_common_targets():
             ],
             visibility = ["PUBLIC"],
             deps = [
+                "//executorch/runtime/core:device_allocator",
                 "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix,
                 "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
             ],
diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp
index a6ba6018333..a1c5c3c6fe7 100644
--- a/extension/tensor/tensor_ptr.cpp
+++ b/extension/tensor/tensor_ptr.cpp
@@ -12,6 +12,7 @@
 
 #include <c10/util/safe_numerics.h>
 
+#include <executorch/runtime/core/device_allocator.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 
 namespace executorch {
@@ -25,6 +26,9 @@ namespace {
  * ensures that they are managed together and have the same lifetime as the
  * Tensor. When the Tensor is destroyed, the Storage structure ensures
  * proper cleanup of the associated metadata and data if needed.
+ *
+ * For device tensors, the data pointer points to device memory; the deleter
+ * is responsible for freeing it through the appropriate DeviceAllocator.
  */
 struct Storage final {
   executorch::aten::TensorImpl tensor_impl;
@@ -47,6 +51,11 @@ struct Storage final {
         strides(std::move(strides)),
         deleter(std::move(deleter)) {}
 
+  Storage(const Storage&) = delete;
+  Storage& operator=(const Storage&) = delete;
+  Storage(Storage&&) = delete;
+  Storage& operator=(Storage&&) = delete;
+
   ~Storage() {
     if (deleter) {
       deleter(tensor_impl.mutable_data());
@@ -63,7 +72,9 @@ TensorPtr make_tensor_ptr(
     std::vector<executorch::aten::StridesType> strides,
     executorch::aten::ScalarType type,
     executorch::aten::TensorShapeDynamism dynamism,
-    std::function<void(void*)> deleter) {
+    std::function<void(void*)> deleter,
+    runtime::etensor::DeviceType device_type,
+    runtime::etensor::DeviceIndex device_index) {
   const auto dim = sizes.size();
   ET_CHECK_MSG(
       dim_order.empty() || dim_order.size() == dim,
@@ -111,17 +122,25 @@ TensorPtr make_tensor_ptr(
       data,
       dim_order.data(),
       strides.data(),
-      dim > 0 ? dynamism : executorch::aten::TensorShapeDynamism::STATIC);
+      dim > 0 ? dynamism : executorch::aten::TensorShapeDynamism::STATIC,
+      device_type,
+      device_index);
   auto storage = std::make_shared<Storage>(
       std::move(tensor_impl),
       std::move(sizes),
       std::move(dim_order),
       std::move(strides),
       std::move(deleter));
-  const auto tensor_ptr = &storage->tensor;
+  const auto raw_tensor_ptr = &storage->tensor;
   return std::shared_ptr<executorch::aten::Tensor>(
-      std::move(storage), tensor_ptr);
+      std::move(storage), raw_tensor_ptr);
 #else
+  ET_CHECK_MSG(
+      device_type == runtime::etensor::DeviceType::CPU,
+      "USE_ATEN_LIB build does not support non-CPU device tensors via make_tensor_ptr; "
+      "got device_type=%d. Use the ExecuTorch portable build for device tensor support.",
+      static_cast<int>(device_type));
+  (void)device_index;
   auto options = c10::TensorOptions()
                      .dtype(c10::scalarTypeToTypeMeta(type))
                      .device(c10::kCPU);
@@ -271,5 +290,120 @@ runtime::Error resize_tensor_ptr(
           sizes.data(), sizes.size()));
 }
 
+// ---- Device tensor helpers ----
+//
+// These helpers are only meaningful in the ExecuTorch portable build.
+// USE_ATEN_LIB cannot create on-device tensors via make_tensor_ptr, so cloning
+// to/from a device tensor is intentionally unsupported in that build.
+
+#ifndef USE_ATEN_LIB
+
+TensorPtr clone_tensor_ptr_to_device(
+    const TensorPtr& cpu_tensor,
+    runtime::etensor::DeviceType device_type,
+    runtime::etensor::DeviceIndex device_index) {
+  ET_CHECK_MSG(
+      device_type != runtime::etensor::DeviceType::CPU,
+      "Target device must not be CPU; use clone_tensor_ptr for CPU-to-CPU copies.");
+  // The copy reads the source via a host pointer, so it must be on CPU.
+  ET_CHECK_MSG(
+      cpu_tensor->device_type() == runtime::etensor::DeviceType::CPU,
+      "Source tensor must reside on CPU.");
+  auto* allocator = runtime::get_device_allocator(device_type);
+  ET_CHECK_MSG(
+      allocator != nullptr,
+      "No device allocator registered for device type %d",
+      static_cast<int>(device_type));
+  const auto nbytes = cpu_tensor->nbytes();
+  const auto* cpu_data = cpu_tensor->const_data_ptr();
+  ET_CHECK_MSG(cpu_data != nullptr, "Source tensor has no data.");
+
+  auto result = allocator->allocate(nbytes, device_index);
+  ET_CHECK_MSG(result.ok(), "Failed to allocate device memory.");
+  void* device_data = result.get();
+  auto err = allocator->copy_host_to_device(
+      device_data, cpu_data, nbytes, device_index);
+  ET_CHECK_MSG(err == runtime::Error::Ok, "Host-to-device copy failed.");
+  std::vector<executorch::aten::SizesType> sizes(
+      cpu_tensor->sizes().begin(), cpu_tensor->sizes().end());
+  std::vector<executorch::aten::DimOrderType> dim_order(
+      cpu_tensor->dim_order().begin(), cpu_tensor->dim_order().end());
+  std::vector<executorch::aten::StridesType> strides(
+      cpu_tensor->strides().begin(), cpu_tensor->strides().end());
+  // The deleter returns the buffer to the same allocator and device index.
+  return make_tensor_ptr(
+      std::move(sizes),
+      device_data,
+      std::move(dim_order),
+      std::move(strides),
+      cpu_tensor->scalar_type(),
+      cpu_tensor->shape_dynamism(),
+      [allocator, device_index](void* ptr) {
+        allocator->deallocate(ptr, device_index);
+      },
+      device_type,
+      device_index);
+}
+
+TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor) {
+  const auto nbytes = device_tensor->nbytes();
+  const auto* device_data = device_tensor->const_data_ptr();
+  ET_CHECK_MSG(device_data != nullptr, "Source device tensor has no data.");
+
+  // The source's own device metadata selects the allocator for the copy.
+  const auto device_type = device_tensor->device_type();
+  const auto device_index = device_tensor->device_index();
+  ET_CHECK_MSG(
+      device_type != runtime::etensor::DeviceType::CPU,
+      "Source tensor is already on CPU.");
+  auto* allocator = runtime::get_device_allocator(device_type);
+  ET_CHECK_MSG(
+      allocator != nullptr,
+      "No device allocator registered for device type %d",
+      static_cast<int>(device_type));
+
+  // Stage the bytes in a host buffer that is moved into the returned tensor.
+  std::vector<uint8_t> cpu_data(nbytes);
+  auto err = allocator->copy_device_to_host(
+      cpu_data.data(), device_data, nbytes, device_index);
+  ET_CHECK_MSG(err == runtime::Error::Ok, "Device-to-host copy failed.");
+
+  std::vector<executorch::aten::SizesType> sizes(
+      device_tensor->sizes().begin(), device_tensor->sizes().end());
+  std::vector<executorch::aten::DimOrderType> dim_order(
+      device_tensor->dim_order().begin(), device_tensor->dim_order().end());
+  std::vector<executorch::aten::StridesType> strides(
+      device_tensor->strides().begin(), device_tensor->strides().end());
+  // Carry over shape dynamism too, matching clone_tensor_ptr_to_device.
+  return make_tensor_ptr(
+      std::move(sizes),
+      std::move(cpu_data),
+      std::move(dim_order),
+      std::move(strides),
+      device_tensor->scalar_type(),
+      device_tensor->shape_dynamism());
+}
+
+#else // USE_ATEN_LIB
+
+TensorPtr clone_tensor_ptr_to_device(
+    const TensorPtr& /*cpu_tensor*/,
+    runtime::etensor::DeviceType /*device_type*/,
+    runtime::etensor::DeviceIndex /*device_index*/) {
+  ET_CHECK_MSG(
+      false, // Unconditional failure: this build has no device tensor support.
+      "clone_tensor_ptr_to_device is not supported in USE_ATEN_LIB builds; "
+      "make_tensor_ptr cannot create on-device aten tensors.");
+}
+
+TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& /*device_tensor*/) {
+  ET_CHECK_MSG(
+      false, // Unconditional failure: this build has no device tensor support.
+      "clone_tensor_ptr_to_cpu is not supported in USE_ATEN_LIB builds; "
+      "make_tensor_ptr cannot create on-device aten tensors.");
+}
+
+#endif // USE_ATEN_LIB
+
 } // namespace extension
 } // namespace executorch
diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h
index 0ed06cbe021..1a534362108 100644
--- a/extension/tensor/tensor_ptr.h
+++ b/extension/tensor/tensor_ptr.h
@@ -18,6 +18,7 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/core/portable_type/device.h>
 
 C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
 
@@ -32,8 +33,13 @@ using TensorPtr = std::shared_ptr<executorch::aten::Tensor>;
 /**
  * Creates a TensorPtr that manages a Tensor with the specified properties.
  *
+ * The `device_type` and `device_index` parameters set the TensorImpl's device
+ * metadata only — no data is allocated or copied. The caller is responsible
+ * for ensuring `data` already lives on the requested device. To copy CPU data
+ * to a device, use `clone_tensor_ptr_to_device` instead.
+ *
  * @param sizes A vector specifying the size of each dimension.
- * @param data A pointer to the data buffer.
+ * @param data A pointer to the data buffer (CPU or device, see device_type).
  * @param dim_order A vector specifying the order of dimensions.
  * @param strides A vector specifying the strides of the tensor.
  * @param type The scalar type of the tensor elements.
@@ -41,6 +47,9 @@ using TensorPtr = std::shared_ptr<executorch::aten::Tensor>;
  * @param deleter A custom deleter function for managing the lifetime of the
  * data buffer. If provided, this deleter will be called when the managed Tensor
  * object is destroyed.
+ * @param device_type The device on which `data` resides (default CPU). In
+ * USE_ATEN_LIB builds this must be CPU.
+ * @param device_index The device index for multi-device scenarios (default 0).
  * @return A TensorPtr that manages the newly created Tensor.
  */
 TensorPtr make_tensor_ptr(
@@ -52,18 +61,25 @@ TensorPtr make_tensor_ptr(
         executorch::aten::ScalarType::Float,
     const executorch::aten::TensorShapeDynamism dynamism =
         executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
-    std::function<void(void*)> deleter = nullptr);
+    std::function<void(void*)> deleter = nullptr,
+    runtime::etensor::DeviceType device_type =
+        runtime::etensor::DeviceType::CPU,
+    runtime::etensor::DeviceIndex device_index = 0);
 
 /**
  * Creates a TensorPtr that manages a Tensor with the specified properties.
  *
+ * Convenience overload for the primary factory; see the primary overload for
+ * device semantics.
+ *
  * @param sizes A vector specifying the size of each dimension.
- * @param data A pointer to the data buffer.
+ * @param data A pointer to the data buffer (CPU or device, see device_type).
  * @param type The scalar type of the tensor elements.
  * @param dynamism Specifies the mutability of the tensor's shape.
  * @param deleter A custom deleter function for managing the lifetime of the
- * data buffer. If provided, this deleter will be called when the managed Tensor
- * object is destroyed.
+ * data buffer.
+ * @param device_type The device on which `data` resides (default CPU).
+ * @param device_index The device index for multi-device scenarios (default 0).
  * @return A TensorPtr that manages the newly created Tensor.
  */
 inline TensorPtr make_tensor_ptr(
@@ -73,9 +89,20 @@ inline TensorPtr make_tensor_ptr(
         executorch::aten::ScalarType::Float,
     const executorch::aten::TensorShapeDynamism dynamism =
         executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
-    std::function<void(void*)> deleter = nullptr) {
+    std::function<void(void*)> deleter = nullptr,
+    runtime::etensor::DeviceType device_type =
+        runtime::etensor::DeviceType::CPU,
+    runtime::etensor::DeviceIndex device_index = 0) {
   return make_tensor_ptr(
-      std::move(sizes), data, {}, {}, type, dynamism, std::move(deleter));
+      std::move(sizes),
+      data,
+      {},
+      {},
+      type,
+      dynamism,
+      std::move(deleter),
+      device_type,
+      device_index);
 }
 
 /**
@@ -88,6 +115,9 @@ inline TensorPtr make_tensor_ptr(
  * specified `type`. This allows for flexible creation of tensors with data
  * vectors of one type and a different scalar type.
  *
+ * The result is always a CPU tensor. To move it to a device, use
+ * `clone_tensor_ptr_to_device`.
+ *
  * @tparam T The C++ type of the tensor elements, deduced from the vector.
  * @param sizes A vector specifying the size of each dimension.
  * @param data A vector containing the tensor's data.
@@ -177,10 +207,10 @@ inline TensorPtr make_tensor_ptr(
  *
  * This template overload is specialized for cases where the tensor data is
  * provided as a vector. The scalar type is automatically deduced from the
- * vector's data type. If the specified `type` differs from the deduced type of
- * the vector's elements, and casting is allowed, the data will be cast to the
- * specified `type`. This allows for flexible creation of tensors with data
- * vectors of one type and a different scalar type.
+ * vector's data type.
+ *
+ * The result is always a CPU tensor. To move it to a device, use
+ * `clone_tensor_ptr_to_device`.
  *
  * @tparam T The C++ type of the tensor elements, deduced from the vector.
  * @param data A vector containing the tensor's data.
@@ -209,11 +239,10 @@ inline TensorPtr make_tensor_ptr(
  *
  * This template overload is specialized for cases where the tensor data is
  * provided as an initializer list. The scalar type is automatically deduced
- * from the initializer list's data type. If the specified `type` differs from
- * the deduced type of the initializer list's elements, and casting is allowed,
- * the data will be cast to the specified `type`. This allows for flexible
- * creation of tensors with data vectors of one type and a different scalar
- * type.
+ * from the initializer list's data type.
+ *
+ * The result is always a CPU tensor. To move it to a device, use
+ * `clone_tensor_ptr_to_device`.
  *
  * @tparam T The C++ type of the tensor elements, deduced from the initializer
  * list.
@@ -252,11 +281,10 @@ inline TensorPtr make_tensor_ptr(
  *
  * This template overload allows creating a Tensor from an initializer list
  * of data. The scalar type is automatically deduced from the type of the
- * initializer list's elements. If the specified `type` differs from
- * the deduced type of the initializer list's elements, and casting is allowed,
- * the data will be cast to the specified `type`. This allows for flexible
- * creation of tensors with data vectors of one type and a different scalar
- * type.
+ * initializer list's elements.
+ *
+ * The result is always a CPU tensor. To move it to a device, use
+ * `clone_tensor_ptr_to_device`.
  *
  * @tparam T The C++ type of the tensor elements, deduced from the initializer
  * list.
@@ -299,7 +327,8 @@ inline TensorPtr make_tensor_ptr(T value) {
  *
  * This overload accepts a raw memory buffer stored in a std::vector<uint8_t>
  * and a scalar type to interpret the data. The vector is managed, and the
- * memory's lifetime is tied to the TensorImpl.
+ * memory's lifetime is tied to the TensorImpl. The result is always a CPU
+ * tensor.
  *
  * @param sizes A vector specifying the size of each dimension.
  * @param data A vector containing the raw memory for the tensor's data.
@@ -321,9 +350,8 @@ TensorPtr make_tensor_ptr(
 /**
  * Creates a TensorPtr that manages a Tensor with the specified properties.
  *
- * This overload accepts a raw memory buffer stored in a std::vector<uint8_t>
- * and a scalar type to interpret the data. The vector is managed, and the
- * memory's lifetime is tied to the TensorImpl.
+ * Convenience overload for the raw-buffer factory; see above. The result is
+ * always a CPU tensor.
  *
  * @param sizes A vector specifying the size of each dimension.
  * @param data A vector containing the raw memory for the tensor's data.
@@ -352,6 +380,9 @@ inline TensorPtr make_tensor_ptr(
  * configuration. If `dim_order` is empty but `strides` is provided, `dim_order`
  * is left empty so the core may infer it from the provided strides.
  *
+ * This overload always aliases — it never copies. To copy a tensor's data to
+ * a device, use `clone_tensor_ptr_to_device`.
+ *
  * @param tensor The source tensor to alias.
  * @param sizes Optional sizes override.
  * @param dim_order Optional dimension order override.
@@ -411,6 +442,9 @@ inline TensorPtr make_tensor_ptr(
  * Convenience overload identical to make_tensor_ptr(*tensor_ptr, ...).
  * Keeps the original TensorPtr alive until the returned TensorPtr is destroyed.
  *
+ * This overload always aliases — it never copies. To copy a tensor's data to
+ * a device, use `clone_tensor_ptr_to_device`.
+ *
  * @param tensor_ptr The source tensor pointer to alias.
  * @param sizes Optional sizes override.
  * @param dim_order Optional dimension order override.
@@ -498,6 +532,37 @@ runtime::Error resize_tensor_ptr(
     TensorPtr& tensor,
     const std::vector<executorch::aten::SizesType>& sizes);
 
+/**
+ * Clones a CPU TensorPtr to a device TensorPtr.
+ *
+ * Allocates memory on the specified device and copies the tensor data from
+ * host to device using the DeviceAllocator registered for the given device
+ * type. The returned TensorPtr owns the device memory and frees it via the
+ * allocator when destroyed. Aborts (ET_CHECK_MSG) on any failure; in
+ * USE_ATEN_LIB builds the call is unsupported and always aborts.
+ *
+ * @param cpu_tensor The source CPU tensor whose data will be copied.
+ * @param device_type The target device type (e.g., CUDA); must not be CPU.
+ * @param device_index The target device index (default 0).
+ * @return A TensorPtr backed by device memory containing the copied data.
+ */
+TensorPtr clone_tensor_ptr_to_device(
+    const TensorPtr& cpu_tensor,
+    runtime::etensor::DeviceType device_type,
+    runtime::etensor::DeviceIndex device_index = 0);
+
+/**
+ * Clones a device TensorPtr to a CPU TensorPtr.
+ *
+ * Copies the data into new host memory via the DeviceAllocator registered for
+ * the source tensor's device type (taken from its metadata). Aborts if the
+ * source is on CPU or any step fails; always aborts in USE_ATEN_LIB builds.
+ *
+ * @param device_tensor The source device tensor whose data will be copied.
+ * @return A TensorPtr backed by CPU memory containing the copied data.
+ */
+TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor);
+
 } // namespace extension
 } // namespace executorch
 
diff --git a/extension/tensor/test/targets.bzl b/extension/tensor/test/targets.bzl
index 5bf8c7019b8..807e16ec8c1 100644
--- a/extension/tensor/test/targets.bzl
+++ b/extension/tensor/test/targets.bzl
@@ -21,3 +21,14 @@ def define_common_targets():
                 "//executorch/extension/tensor:tensor" + aten_suffix,
             ],
         )
+
+        runtime.cxx_test(
+            name = "tensor_ptr_device_test" + aten_suffix,
+            srcs = [
+                "tensor_ptr_device_test.cpp",
+            ],
+            deps = [
+                "//executorch/extension/tensor:tensor" + aten_suffix,
+                "//executorch/runtime/core:device_allocator",
+            ],
+        )
diff --git a/extension/tensor/test/tensor_ptr_device_test.cpp b/extension/tensor/test/tensor_ptr_device_test.cpp
new file mode 100644
index 00000000000..0aa9e5f1b88
--- /dev/null
+++ b/extension/tensor/test/tensor_ptr_device_test.cpp
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/tensor/tensor_ptr.h>
+
+#include <gtest/gtest.h>
+
+#include <array>
+#include <cstdlib>
+#include <cstring>
+
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <executorch/test/utils/DeathTest.h>
+
+using namespace ::executorch::extension;
+using namespace ::executorch::runtime;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+#ifndef USE_ATEN_LIB
+// All device tensor helpers are intentionally unsupported in USE_ATEN_LIB
+// builds (make_tensor_ptr cannot create on-device aten tensors), so the entire
+// test fixture is gated to the portable build.
+
+namespace {
+
+// A fake device allocator that uses host memory (malloc/free/memcpy) to
+// simulate device memory operations, enabling end-to-end data roundtrip
+// verification without requiring actual device hardware.
+class FakeDeviceAllocator : public DeviceAllocator {
+ public:
+  explicit FakeDeviceAllocator(DeviceType type) : type_(type) {}
+
+  Result<void*> allocate(
+      size_t nbytes,
+      DeviceIndex /*index*/,
+      size_t /*alignment*/ = kDefaultAlignment) override {
+    // malloc(0) may return nullptr; ask for one byte so empty tensors work.
+    void* ptr = std::malloc(nbytes > 0 ? nbytes : 1);
+    if (ptr == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
+    ++allocate_count_;
+    return ptr;
+  }
+
+  void deallocate(void* ptr, DeviceIndex /*index*/) override {
+    std::free(ptr);
+    ++deallocate_count_;
+  }
+
+  Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex /*index*/) override {
+    std::memcpy(dst, src, nbytes);
+    ++h2d_count_;
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex /*index*/) override {
+    std::memcpy(dst, src, nbytes);
+    ++d2h_count_;
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return type_;
+  }
+
+  // Zeroes every call counter; the fixture calls this before each test.
+  void reset_counters() {
+    allocate_count_ = deallocate_count_ = h2d_count_ = d2h_count_ = 0;
+  }
+
+  // Call counters, public so tests can assert on allocator traffic.
+  int allocate_count_ = 0;
+  int deallocate_count_ = 0;
+  int h2d_count_ = 0;
+  int d2h_count_ = 0;
+
+ private:
+  DeviceType type_;
+};
+
+// Lazily constructed singleton; avoids a mutable namespace-scope allocator.
+FakeDeviceAllocator& fake_cuda_allocator() {
+  static FakeDeviceAllocator instance(DeviceType::CUDA);
+  return instance;
+}
+
+// Registers the fake CUDA allocator exactly once: constructing the const
+// namespace-scope instance below runs the registration during static init.
+struct RegisterFakeAllocator {
+  RegisterFakeAllocator() {
+    register_device_allocator(&fake_cuda_allocator());
+  }
+};
+const RegisterFakeAllocator s_register;
+
+} // namespace
+
+class TensorPtrDeviceTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    runtime_init(); // Runs once for the whole suite, before any test.
+  }
+
+  void SetUp() override {
+    fake_cuda_allocator().reset_counters(); // Each test starts from zero.
+  }
+};
+
+TEST_F(TensorPtrDeviceTest, CpuToDeviceTensor) {
+  const auto host =
+      make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  const auto& fake = fake_cuda_allocator();
+  // One allocation and one upload were issued.
+  EXPECT_EQ(fake.allocate_count_, 1);
+  EXPECT_EQ(fake.h2d_count_, 1);
+  // Device metadata reflects the requested target (default index 0).
+  EXPECT_EQ(on_device->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA);
+  EXPECT_EQ(on_device->unsafeGetTensorImpl()->device_index(), 0);
+  // Shape and dtype are preserved; storage is distinct from the source.
+  EXPECT_EQ(on_device->dim(), 2);
+  EXPECT_EQ(on_device->size(0), 2);
+  EXPECT_EQ(on_device->size(1), 3);
+  EXPECT_EQ(on_device->scalar_type(), executorch::aten::ScalarType::Float);
+  EXPECT_NE(on_device->const_data_ptr(), nullptr);
+  EXPECT_NE(on_device->const_data_ptr(), host->const_data_ptr());
+}
+
+TEST_F(TensorPtrDeviceTest, CpuToDeviceFromRawData) {
+  // Mutable backing store: the tensor receives a non-const pointer, so the
+  // buffer must not be a const object hidden behind a const_cast.
+  std::array<float, 4> data{10.0f, 20.0f, 30.0f, 40.0f};
+  auto cpu_tensor = make_tensor_ptr({2, 2}, data.data());
+  auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA);
+
+  EXPECT_EQ(device_tensor->dim(), 2);
+  EXPECT_EQ(device_tensor->size(0), 2);
+  EXPECT_EQ(device_tensor->size(1), 2);
+  EXPECT_EQ(device_tensor->scalar_type(), executorch::aten::ScalarType::Float);
+  EXPECT_NE(device_tensor->const_data_ptr(), nullptr);
+  EXPECT_NE(
+      device_tensor->const_data_ptr(),
+      static_cast<const void*>(data.data()));
+  EXPECT_EQ(
+      device_tensor->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA);
+
+  EXPECT_EQ(fake_cuda_allocator().allocate_count_, 1);
+  EXPECT_EQ(fake_cuda_allocator().h2d_count_, 1);
+}
+
+// clone_tensor_ptr_to_cpu relies on TensorImpl device metadata which is only
+// available in the non-ATen (ExecuTorch portable) path.
+TEST_F(TensorPtrDeviceTest, DeviceToCpuTensor) {
+  const auto host =
+      make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  const auto back = clone_tensor_ptr_to_cpu(on_device);
+
+  // Exactly one device-to-host copy was issued.
+  EXPECT_EQ(fake_cuda_allocator().d2h_count_, 1);
+  EXPECT_EQ(back->scalar_type(), executorch::aten::ScalarType::Float);
+  EXPECT_EQ(back->dim(), 2);
+  EXPECT_EQ(back->size(0), 2);
+  EXPECT_EQ(back->size(1), 3);
+  // Element values match the source tensor.
+  const float* expected = host->const_data_ptr<float>();
+  const float* actual = back->const_data_ptr<float>();
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_FLOAT_EQ(actual[i], expected[i]);
+  }
+}
+
+TEST_F(TensorPtrDeviceTest, RoundtripCpuDeviceCpu) {
+  const std::vector<float> original = {1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f};
+  const auto host = make_tensor_ptr({2, 3}, original);
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  const auto back = clone_tensor_ptr_to_cpu(on_device);
+
+  // Metadata of the round-tripped tensor mirrors the source.
+  EXPECT_EQ(back->scalar_type(), host->scalar_type());
+  EXPECT_EQ(back->dim(), host->dim());
+  EXPECT_EQ(back->size(0), host->size(0));
+  EXPECT_EQ(back->size(1), host->size(1));
+  // Each clone owns separate storage.
+  const void* back_ptr = back->const_data_ptr();
+  EXPECT_NE(back_ptr, host->const_data_ptr());
+  EXPECT_NE(back_ptr, on_device->const_data_ptr());
+  // Values match the original vector element by element.
+  const float* values = back->const_data_ptr<float>();
+  for (size_t i = 0; i < original.size(); ++i) {
+    EXPECT_FLOAT_EQ(values[i], original[i]);
+  }
+}
+
+TEST_F(TensorPtrDeviceTest, RoundtripInt32) {
+  const std::vector<int32_t> expected = {10, 20, 30, 40};
+  const auto host = make_tensor_ptr({4}, expected);
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  const auto back = clone_tensor_ptr_to_cpu(on_device);
+
+  // 32-bit integers keep both their dtype and their values.
+  EXPECT_EQ(back->scalar_type(), executorch::aten::ScalarType::Int);
+  const int32_t* actual = back->const_data_ptr<int32_t>();
+  for (size_t i = 0; i != expected.size(); ++i) {
+    EXPECT_EQ(actual[i], expected[i]);
+  }
+}
+
+TEST_F(TensorPtrDeviceTest, DeviceIndexPropagation) {
+  const auto host = make_tensor_ptr({2}, {1.0f, 2.0f});
+  const auto on_device = clone_tensor_ptr_to_device(
+      host, DeviceType::CUDA, /*device_index=*/1);
+  EXPECT_EQ(on_device->unsafeGetTensorImpl()->device_index(), 1);
+  // The data must also come back intact from a non-default device index.
+  const auto back = clone_tensor_ptr_to_cpu(on_device);
+  const float* values = back->const_data_ptr<float>();
+  EXPECT_FLOAT_EQ(values[0], 1.0f);
+  EXPECT_FLOAT_EQ(values[1], 2.0f);
+}
+
+TEST_F(TensorPtrDeviceTest, DeviceMemoryCleanup) {
+  const auto& fake = fake_cuda_allocator();
+  const auto host = make_tensor_ptr({2}, {1.0f, 2.0f});
+  auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  EXPECT_EQ(fake.allocate_count_, 1);
+  EXPECT_EQ(fake.deallocate_count_, 0);
+  // Dropping the last owner must hand the buffer back to the allocator.
+  on_device.reset();
+  EXPECT_EQ(fake.deallocate_count_, 1);
+}
+
+TEST_F(TensorPtrDeviceTest, ScalarTensorRoundtrip) {
+  // A zero-dimensional tensor still carries exactly one element.
+  const auto host = make_tensor_ptr({}, {42.0f});
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  EXPECT_EQ(on_device->numel(), 1);
+  EXPECT_EQ(on_device->dim(), 0);
+
+  const auto back = clone_tensor_ptr_to_cpu(on_device);
+  EXPECT_EQ(back->numel(), 1);
+  EXPECT_EQ(back->dim(), 0);
+  EXPECT_FLOAT_EQ(*back->const_data_ptr<float>(), 42.0f);
+}
+
+TEST_F(TensorPtrDeviceTest, RawDataRoundtrip) {
+  // Mutable backing store, so no const_cast is needed for the raw pointer.
+  std::array<float, 3> raw_data{100.0f, 200.0f, 300.0f};
+  auto cpu_tensor = make_tensor_ptr({3}, raw_data.data());
+  auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA);
+  auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor);
+
+  EXPECT_EQ(roundtrip->dim(), 1);
+  EXPECT_EQ(roundtrip->size(0), 3);
+  auto* data = roundtrip->const_data_ptr<float>();
+  EXPECT_FLOAT_EQ(data[0], 100.0f);
+  EXPECT_FLOAT_EQ(data[1], 200.0f);
+  EXPECT_FLOAT_EQ(data[2], 300.0f);
+}
+
+TEST_F(TensorPtrDeviceTest, ErrorCpuTargetDevice) {
+  const auto host = make_tensor_ptr({2}, {1.0f, 2.0f});
+  ET_EXPECT_DEATH(clone_tensor_ptr_to_device(host, DeviceType::CPU), "");
+}
+
+TEST_F(TensorPtrDeviceTest, ErrorNullCpuTensorData) {
+  // A tensor with no backing buffer cannot be cloned to a device.
+  const auto dataless = make_tensor_ptr({2, 2}, nullptr);
+  ET_EXPECT_DEATH(clone_tensor_ptr_to_device(dataless, DeviceType::CUDA), "");
+}
+
+TEST_F(TensorPtrDeviceTest, ErrorCpuTensorToCpu) {
+  const auto host = make_tensor_ptr({2}, {1.0f, 2.0f});
+  ET_EXPECT_DEATH(clone_tensor_ptr_to_cpu(host), "");
+}
+
+TEST_F(TensorPtrDeviceTest, MakeTensorPtrVectorToDevice) {
+  const std::vector<float> values{1.0f, 2.0f, 3.0f, 4.0f};
+  const auto host = make_tensor_ptr({2, 2}, values);
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+
+  // Upload: one allocation, one copy, CUDA metadata, unchanged shape/dtype.
+  EXPECT_EQ(fake_cuda_allocator().allocate_count_, 1);
+  EXPECT_EQ(fake_cuda_allocator().h2d_count_, 1);
+  EXPECT_EQ(on_device->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA);
+  EXPECT_EQ(on_device->scalar_type(), executorch::aten::ScalarType::Float);
+  EXPECT_EQ(on_device->dim(), 2);
+  EXPECT_EQ(on_device->size(0), 2);
+  EXPECT_EQ(on_device->size(1), 2);
+
+  // Download: every element comes back unchanged.
+  const auto back = clone_tensor_ptr_to_cpu(on_device);
+  const float* actual = back->const_data_ptr<float>();
+  for (size_t i = 0; i < values.size(); ++i) {
+    EXPECT_FLOAT_EQ(actual[i], values[i]);
+  }
+}
+
+TEST_F(TensorPtrDeviceTest, MakeTensorPtrRawPointerToDevice) {
+  std::array<float, 3> raw{5.0f, 6.0f, 7.0f}; // mutable: no const_cast needed
+  auto cpu_tensor = make_tensor_ptr({3}, raw.data());
+  auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA);
+
+  EXPECT_EQ(device_tensor->dim(), 1);
+  EXPECT_EQ(device_tensor->size(0), 3);
+  EXPECT_EQ(
+      device_tensor->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA);
+  EXPECT_NE(
+      device_tensor->const_data_ptr(),
+      static_cast<const void*>(raw.data()));
+  EXPECT_EQ(fake_cuda_allocator().allocate_count_, 1);
+  EXPECT_EQ(fake_cuda_allocator().h2d_count_, 1);
+
+  auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor);
+  auto* data = roundtrip->const_data_ptr<float>();
+  EXPECT_FLOAT_EQ(data[0], 5.0f);
+  EXPECT_FLOAT_EQ(data[1], 6.0f);
+  EXPECT_FLOAT_EQ(data[2], 7.0f);
+}
+
+TEST_F(TensorPtrDeviceTest, CloneToCpuVerifiesCpuDeviceMetadata) {
+  const auto host = make_tensor_ptr({3}, {1.0f, 2.0f, 3.0f});
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  const auto back = clone_tensor_ptr_to_cpu(on_device);
+  auto* impl = back->unsafeGetTensorImpl();
+  EXPECT_EQ(impl->device_type(), DeviceType::CPU);
+  EXPECT_EQ(impl->device_index(), 0);
+}
+
+TEST_F(TensorPtrDeviceTest, MultipleClonesFromSameSource) {
+  const auto host = make_tensor_ptr({3}, {1.0f, 2.0f, 3.0f});
+  const auto first = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  const auto second = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  // Every clone performs its own allocation and upload.
+  EXPECT_EQ(fake_cuda_allocator().allocate_count_, 2);
+  EXPECT_EQ(fake_cuda_allocator().h2d_count_, 2);
+  EXPECT_NE(first->const_data_ptr(), second->const_data_ptr());
+}
+
+TEST_F(TensorPtrDeviceTest, HighDimensionalTensorRoundtrip) {
+  std::vector<float> ramp(24);
+  for (size_t i = 0; i < ramp.size(); ++i) {
+    ramp[i] = static_cast<float>(i);
+  }
+  const auto host = make_tensor_ptr({2, 3, 4}, ramp);
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  // The rank-3 shape is reproduced on the device.
+  EXPECT_EQ(on_device->dim(), 3);
+  EXPECT_EQ(on_device->size(0), 2);
+  EXPECT_EQ(on_device->size(1), 3);
+  EXPECT_EQ(on_device->size(2), 4);
+
+  const auto back = clone_tensor_ptr_to_cpu(on_device);
+  const float* actual = back->const_data_ptr<float>();
+  for (size_t i = 0; i < ramp.size(); ++i) {
+    EXPECT_FLOAT_EQ(actual[i], static_cast<float>(i));
+  }
+}
+
+TEST_F(TensorPtrDeviceTest, RoundtripDouble) {
+  const std::vector<double> expected{1.1, 2.2, 3.3};
+  const auto host = make_tensor_ptr({3}, expected);
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  const auto back = clone_tensor_ptr_to_cpu(on_device);
+  EXPECT_EQ(back->scalar_type(), executorch::aten::ScalarType::Double);
+  const double* actual = back->const_data_ptr<double>();
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_DOUBLE_EQ(actual[i], expected[i]);
+  }
+}
+
+TEST_F(TensorPtrDeviceTest, RoundtripInt64) {
+  const std::vector<int64_t> expected{100, 200, 300};
+  const auto host = make_tensor_ptr({3}, expected);
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  const auto back = clone_tensor_ptr_to_cpu(on_device);
+  EXPECT_EQ(back->scalar_type(), executorch::aten::ScalarType::Long);
+  const int64_t* actual = back->const_data_ptr<int64_t>();
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(actual[i], expected[i]);
+  }
+}
+
+TEST_F(TensorPtrDeviceTest, LargeTensorRoundtrip) {
+  constexpr size_t kCount = 10000;
+  std::vector<float> payload(kCount);
+  for (size_t i = 0; i < kCount; ++i) {
+    payload[i] = static_cast<float>(i) * 0.1f;
+  }
+  // A buffer of tens of kilobytes must round-trip unchanged.
+  const auto host = make_tensor_ptr({static_cast<int32_t>(kCount)}, payload);
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  const auto back = clone_tensor_ptr_to_cpu(on_device);
+  const float* actual = back->const_data_ptr<float>();
+  for (size_t i = 0; i < kCount; ++i) {
+    EXPECT_FLOAT_EQ(actual[i], payload[i]);
+  }
+}
+
+#endif // USE_ATEN_LIB
diff --git a/runtime/core/portable_type/tensor.h b/runtime/core/portable_type/tensor.h
index 775bccc1b52..f4ee2aef1f5 100644
--- a/runtime/core/portable_type/tensor.h
+++ b/runtime/core/portable_type/tensor.h
@@ -115,6 +115,21 @@ class Tensor {
     return impl_->shape_dynamism();
   }
 
+  /// Returns the device (type + index) where the tensor data resides.
+  Device device() const {
+    return impl_->device();
+  }
+
+  /// Returns the device type (e.g. CPU, CUDA) where the tensor data resides.
+  DeviceType device_type() const {
+    return impl_->device_type();
+  }
+
+  /// Returns the index of the device holding the data; 0 when unspecified.
+  DeviceIndex device_index() const {
+    return impl_->device_index();
+  }
+
   /// Returns a pointer of type T to the constant underlying data blob.
   template <typename T>
   inline const T* const_data_ptr() const {
diff --git a/runtime/core/portable_type/test/tensor_test.cpp b/runtime/core/portable_type/test/tensor_test.cpp
index 714cdc25661..ba14644d71e 100644
--- a/runtime/core/portable_type/test/tensor_test.cpp
+++ b/runtime/core/portable_type/test/tensor_test.cpp
@@ -13,6 +13,9 @@
 #include <executorch/runtime/platform/runtime.h>
 #include <executorch/test/utils/DeathTest.h>
 
+using executorch::runtime::etensor::Device;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
 using executorch::runtime::etensor::ScalarType;
 using executorch::runtime::etensor::Tensor;
 using executorch::runtime::etensor::TensorImpl;
@@ -78,3 +81,41 @@ TEST_F(TensorTest, ModifyDataOfConstTensor) {
   EXPECT_EQ(a.scalar_type(), ScalarType::Int);
   EXPECT_EQ(a.const_data_ptr<int32_t>()[0], 0);
 }
+
+TEST_F(TensorTest, DeviceForwardersDefaultCpu) {
+  // No device arguments are passed, so the impl falls back to CPU, index 0.
+  int32_t storage[1] = {0};
+  TensorImpl::SizesType shape[1] = {1};
+  TensorImpl::DimOrderType order[1] = {0};
+  TensorImpl impl(ScalarType::Int, 1, shape, storage, order);
+  Tensor tensor(&impl);
+
+  EXPECT_EQ(tensor.device(), Device(DeviceType::CPU, 0));
+  EXPECT_EQ(tensor.device_type(), DeviceType::CPU);
+  EXPECT_EQ(tensor.device_index(), DeviceIndex(0));
+}
+
+TEST_F(TensorTest, DeviceForwardersNonCpu) {
+  int32_t storage[1] = {0};
+  TensorImpl::SizesType shape[1] = {1};
+  TensorImpl::DimOrderType order[1] = {0};
+  TensorImpl impl(
+      ScalarType::Int,
+      1,
+      shape,
+      storage,
+      order,
+      /*strides=*/nullptr,
+      executorch::runtime::TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      /*device_index=*/3);
+  Tensor tensor(&impl);
+
+  // The wrapper must report the explicitly requested CUDA device, index 3 ...
+  EXPECT_EQ(tensor.device_type(), DeviceType::CUDA);
+  EXPECT_EQ(tensor.device_index(), DeviceIndex(3));
+  // ... and every forwarder must match what the TensorImpl itself returns.
+  EXPECT_EQ(tensor.device(), impl.device());
+  EXPECT_EQ(tensor.device_type(), impl.device_type());
+  EXPECT_EQ(tensor.device_index(), impl.device_index());
+}