diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
index c8449a95718..3547423d8c9 100644
--- a/backends/cuda/runtime/TARGETS
+++ b/backends/cuda/runtime/TARGETS
@@ -101,6 +101,28 @@ runtime.cxx_library(
     ],
 )
 
+runtime.cxx_library(
+    name = "op__device_copy",
+    srcs = [
+        "op__device_copy.cpp",
+    ],
+    # Op registration (file-scope EXECUTORCH_LIBRARY calls) needs a global constructor.
+    compiler_flags = ["-Wno-global-constructors"],
+    # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+    link_whole = True,  # presumably keeps the otherwise-unreferenced registration code; confirm
+    supports_python_dlopen = True,
+    visibility = ["PUBLIC"],
+    deps = [
+        ":cuda_allocator",
+        "//executorch/extension/kernel_util:kernel_util",
+        "//executorch/runtime/core/exec_aten:lib",
+        "//executorch/runtime/kernel:kernel_includes",
+    ],
+    external_deps = [
+        ("cuda", None, "cuda-lazy"),
+    ],
+)
+
 runtime.cxx_library(
     name = "cuda_backend",
     srcs = [
@@ -120,6 +142,7 @@ runtime.cxx_library(
         ":cuda_platform",
         ":runtime_shims",
         ":cuda_allocator",
+        ":op__device_copy",
         ":cuda_platform",
         "//executorch/backends/aoti:aoti_common_slim",
         "//executorch/backends/aoti/slim/core:slimtensor",
diff --git a/backends/cuda/runtime/op__device_copy.cpp b/backends/cuda/runtime/op__device_copy.cpp
new file mode 100644
index 00000000000..cb6cc2753fb
--- /dev/null
+++ b/backends/cuda/runtime/op__device_copy.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cuda/runtime/cuda_allocator.h>
+#include <executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace executorch::backends::cuda {
+
+using executorch::aten::Tensor;
+using executorch::runtime::Error;
+using executorch::runtime::KernelRuntimeContext;
+using executorch::runtime::etensor::DeviceType;
+
+// et_copy._h2d_copy.out for CUDA: copies the bytes of the CPU tensor `self`
+// into the CUDA tensor `out` via CudaAllocator, then hands `out` back.
+// Device placement and byte size are validated before any data is moved.
+Tensor&
+_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
+  const auto src_device = self.unsafeGetTensorImpl()->device_type();
+  const auto dst_device = out.unsafeGetTensorImpl()->device_type();
+  const auto dst_index = out.unsafeGetTensorImpl()->device_index();
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      src_device == DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_h2d_copy: source tensor must be on CPU, got device_type=%d",
+      static_cast<int>(src_device));
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      dst_device == DeviceType::CUDA,
+      InvalidArgument,
+      out,
+      "_h2d_copy: destination tensor must be on CUDA, got device_type=%d",
+      static_cast<int>(dst_device));
+
+  const size_t byte_count = self.nbytes();
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      byte_count == out.nbytes(),
+      InvalidArgument,
+      out,
+      "_h2d_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
+      byte_count,
+      out.nbytes());
+
+  const Error copy_status = CudaAllocator::instance().copy_host_to_device(
+      out.mutable_data_ptr(), self.const_data_ptr(), byte_count, dst_index);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      copy_status == Error::Ok,
+      Internal,
+      out,
+      "_h2d_copy: copy_host_to_device failed");
+  return out;
+}
+
+// et_copy._d2h_copy.out for CUDA: copies the bytes of the CUDA tensor `self`
+// into the CPU tensor `out` via CudaAllocator, then hands `out` back.
+// Device placement and byte size are validated before any data is moved.
+Tensor&
+_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
+  const auto src_device = self.unsafeGetTensorImpl()->device_type();
+  const auto src_index = self.unsafeGetTensorImpl()->device_index();
+  const auto dst_device = out.unsafeGetTensorImpl()->device_type();
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      src_device == DeviceType::CUDA,
+      InvalidArgument,
+      out,
+      "_d2h_copy: source tensor must be on CUDA, got device_type=%d",
+      static_cast<int>(src_device));
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      dst_device == DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_d2h_copy: destination tensor must be on CPU, got device_type=%d",
+      static_cast<int>(dst_device));
+
+  const size_t byte_count = self.nbytes();
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      byte_count == out.nbytes(),
+      InvalidArgument,
+      out,
+      "_d2h_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
+      byte_count,
+      out.nbytes());
+
+  const Error copy_status = CudaAllocator::instance().copy_device_to_host(
+      out.mutable_data_ptr(), self.const_data_ptr(), byte_count, src_index);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      copy_status == Error::Ok,
+      Internal,
+      out,
+      "_d2h_copy: copy_device_to_host failed");
+  return out;
+}
+
+} // namespace executorch::backends::cuda
+
+// Registers both CUDA kernels under the `et_copy` op namespace.
+// NOTE(review): kernels/portable/cpu/op__device_copy.cpp registers the same
+// two op names ("_h2d_copy.out", "_d2h_copy.out"); presumably the two
+// libraries must never be linked into one binary - confirm.
+EXECUTORCH_LIBRARY(
+    et_copy, "_h2d_copy.out", executorch::backends::cuda::_h2d_copy_out);
+EXECUTORCH_LIBRARY(
+    et_copy, "_d2h_copy.out", executorch::backends::cuda::_d2h_copy_out);
diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl
index b68043f7feb..8d915499b10 100644
--- a/backends/cuda/runtime/shims/tests/targets.bzl
+++ b/backends/cuda/runtime/shims/tests/targets.bzl
@@ -42,3 +42,24 @@ def define_common_targets():
     cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle")
     cuda_shim_cpp_unittest("aoti_torch_item_bool")
     cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out")
+
+    # Exercises the CUDA et_copy kernels against real device allocations.
+    cpp_unittest(
+        name = "test_op__device_copy",
+        srcs = ["test_op__device_copy.cpp"],
+        deps = [
+            "//executorch/backends/cuda/runtime:cuda_allocator",
+            "//executorch/backends/cuda/runtime:op__device_copy",
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/portable_type:portable_type",
+            "//executorch/runtime/kernel:kernel_runtime_context",
+            "//executorch/runtime/platform:platform",
+        ],
+        external_deps = [("cuda", None, "cuda-lazy")],
+        preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
+        keep_gpu_sections = True,
+        # Requests the "gpu-remote-execution" RE platform for this test.
+        remote_execution = re_test_utils.remote_execution(
+            platform = "gpu-remote-execution",
+        ),
+    )
diff --git a/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp
new file mode 100644
index 00000000000..1c4456c27e6
--- /dev/null
+++ b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda_runtime.h>
+#include <executorch/backends/cuda/runtime/cuda_allocator.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/portable_type/tensor_impl.h>
+#include <executorch/runtime/kernel/kernel_runtime_context.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <gtest/gtest.h>
+#ifndef __has_feature // Clang extension; GCC only provides it from GCC 14.
+#define __has_feature(x) 0 // Lets the check below parse on every compiler.
+#endif
+#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#include <sanitizer/lsan_interface.h>
+#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 1
+#else
+#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 0
+#endif
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::aten::TensorImpl;
+using executorch::backends::cuda::CudaAllocator;
+using executorch::runtime::KernelRuntimeContext;
+using executorch::runtime::TensorShapeDynamism;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+// Forward declarations of the kernels under test, which live in
+// backends/cuda/runtime/op__device_copy.cpp; no header is included for them.
+namespace executorch::backends::cuda {
+// Host -> device copy kernel.
+Tensor&
+_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out);
+// Device -> host copy kernel.
+Tensor&
+_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out);
+} // namespace executorch::backends::cuda
+
+namespace {
+
+// unique_ptr deleter that returns the buffer to CudaAllocator.
+struct CudaDeleter {
+  DeviceIndex device_index = 0;
+  void operator()(void* ptr) const {
+    CudaAllocator::instance().deallocate(ptr, device_index);
+  }
+};
+using CudaPtr = std::unique_ptr<void, CudaDeleter>;
+
+// Allocates via CudaAllocator; on failure records a gtest error, returns null.
+CudaPtr allocate_cuda(size_t nbytes, DeviceIndex device_index = 0) {
+  auto allocation = CudaAllocator::instance().allocate(nbytes, device_index);
+  EXPECT_TRUE(allocation.ok()) << "CudaAllocator::allocate failed";
+  void* const raw = allocation.ok() ? allocation.get() : nullptr;
+  return CudaPtr(raw, CudaDeleter{device_index});
+}
+
+// True when cudaGetDeviceCount() succeeds and reports at least one device.
+bool is_cuda_available() {
+  int device_count = 0;
+#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE
+  __lsan_disable();
+#endif
+  const cudaError_t status = cudaGetDeviceCount(&device_count);
+#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE
+  __lsan_enable();
+#endif
+  return status == cudaSuccess && device_count > 0;
+}
+
+// Raw cudaMemcpy helpers, so checks do not depend on the kernels under test.
+std::vector<float> copy_cuda_to_host(const void* device_ptr, size_t numel) {
+  std::vector<float> host(numel);
+  const cudaError_t status = cudaMemcpy(
+      host.data(), device_ptr, numel * sizeof(float), cudaMemcpyDeviceToHost);
+  EXPECT_EQ(status, cudaSuccess) << "cudaMemcpy D2H failed";
+  return host;
+}
+
+void copy_host_to_cuda(const std::vector<float>& host, void* device_ptr) {
+  const size_t nbytes = host.size() * sizeof(float);
+  const cudaError_t status =
+      cudaMemcpy(device_ptr, host.data(), nbytes, cudaMemcpyHostToDevice);
+  EXPECT_EQ(status, cudaSuccess) << "cudaMemcpy H2D failed";
+}
+
+class CudaDeviceCopyOpTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+  }
+  // Skip, rather than fail, on machines without a usable CUDA device.
+  void SetUp() override {
+    if (!is_cuda_available()) {
+      GTEST_SKIP() << "CUDA not available, skipping CUDA device copy op tests";
+    }
+  }
+};
+
+} // namespace
+
+TEST_F(CudaDeviceCopyOpTest, H2dCopyUsesCudaAllocatorCopy) {
+  std::vector<float> payload = {1.0f, 2.0f, 3.0f, 4.0f};
+  auto device_buf = allocate_cuda(payload.size() * sizeof(float));
+  ASSERT_NE(device_buf.get(), nullptr);
+
+  int32_t sizes[] = {static_cast<int32_t>(payload.size())};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+  // CPU source tensor viewing the host vector.
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      payload.data(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+  // CUDA destination tensor viewing the device allocation.
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      device_buf.get(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor dst(&dst_impl);
+
+  KernelRuntimeContext ctx;
+  Tensor& returned = executorch::backends::cuda::_h2d_copy_out(ctx, src, dst);
+  // `out` is returned, and a raw cudaMemcpy read-back sees the payload.
+  EXPECT_EQ(&returned, &dst);
+  EXPECT_EQ(copy_cuda_to_host(device_buf.get(), payload.size()), payload);
+}
+
+TEST_F(CudaDeviceCopyOpTest, D2hCopyUsesCudaAllocatorCopy) {
+  const std::vector<float> expected = {5.0f, 6.0f, 7.0f, 8.0f};
+  auto device_buf = allocate_cuda(expected.size() * sizeof(float));
+  ASSERT_NE(device_buf.get(), nullptr);
+  copy_host_to_cuda(expected, device_buf.get());
+
+  std::vector<float> host_out(expected.size(), 0.0f);
+  int32_t sizes[] = {static_cast<int32_t>(expected.size())};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+  // CUDA source tensor viewing the pre-filled device allocation.
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      device_buf.get(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor src(&src_impl);
+  // CPU destination tensor viewing the zero-initialized host vector.
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      host_out.data(),
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor dst(&dst_impl);
+
+  KernelRuntimeContext ctx;
+  Tensor& returned = executorch::backends::cuda::_d2h_copy_out(ctx, src, dst);
+  // `out` is returned and the host vector now holds the device contents.
+  EXPECT_EQ(&returned, &dst);
+  EXPECT_EQ(host_out, expected);
+}
diff --git a/kernels/portable/cpu/op__device_copy.cpp b/kernels/portable/cpu/op__device_copy.cpp
new file mode 100644
index 00000000000..dd5d16cd204
--- /dev/null
+++ b/kernels/portable/cpu/op__device_copy.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * Runtime kernels for et_copy._h2d_copy and et_copy._d2h_copy ops.
+ *
+ * These ops transfer tensor data between CPU and device memory using
+ * the DeviceAllocator interface. The device type is inferred from the
+ * tensor metadata (out.device_type() for H2D, self.device_type() for D2H),
+ * which was set during AOT serialization by PropagateDevicePass.
+ */
+
+#include <executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h>
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace executorch::runtime::native {
+
+using executorch::aten::Tensor;
+using executorch::runtime::KernelRuntimeContext;
+
+/**
+ * Host-to-device kernel behind `et_copy._h2d_copy.out`.
+ *
+ * `self` must be a CPU tensor and `out` a tensor on a non-CPU device with the
+ * same nbytes(). The DeviceAllocator registered for out's device type does
+ * the copy and is passed out's device index. Each precondition is enforced
+ * with ET_KERNEL_CHECK_MSG; when all of them hold, `out` is returned.
+ */
+Tensor&
+_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
+  const auto src_device = self.unsafeGetTensorImpl()->device_type();
+  const auto dst_device = out.unsafeGetTensorImpl()->device_type();
+  const auto dst_index = out.unsafeGetTensorImpl()->device_index();
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      src_device == etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_h2d_copy: source tensor must be on CPU, got device_type=%d",
+      static_cast<int>(src_device));
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      dst_device != etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_h2d_copy: destination tensor must be on a non-CPU device");
+
+  const auto byte_count = self.nbytes();
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      byte_count == out.nbytes(),
+      InvalidArgument,
+      out,
+      "_h2d_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
+      byte_count,
+      out.nbytes());
+
+  DeviceAllocator* const device_allocator = get_device_allocator(dst_device);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      device_allocator != nullptr,
+      NotFound,
+      out,
+      "_h2d_copy: no device allocator registered for device_type=%d",
+      static_cast<int>(dst_device));
+
+  const Error copy_status = device_allocator->copy_host_to_device(
+      out.mutable_data_ptr(), self.const_data_ptr(), byte_count, dst_index);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      copy_status == Error::Ok,
+      Internal,
+      out,
+      "_h2d_copy: copy_host_to_device failed");
+  return out;
+}
+
+/**
+ * Device-to-host kernel behind `et_copy._d2h_copy.out`.
+ *
+ * `self` must be a tensor on a non-CPU device and `out` a CPU tensor with the
+ * same nbytes(). The DeviceAllocator registered for self's device type does
+ * the copy and is passed self's device index. Each precondition is enforced
+ * with ET_KERNEL_CHECK_MSG; when all of them hold, `out` is returned.
+ */
+Tensor&
+_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
+  const auto src_device = self.unsafeGetTensorImpl()->device_type();
+  const auto src_index = self.unsafeGetTensorImpl()->device_index();
+  const auto dst_device = out.unsafeGetTensorImpl()->device_type();
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      src_device != etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_d2h_copy: source tensor must be on a non-CPU device");
+
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      dst_device == etensor::DeviceType::CPU,
+      InvalidArgument,
+      out,
+      "_d2h_copy: destination tensor must be on CPU, got device_type=%d",
+      static_cast<int>(dst_device));
+
+  const auto byte_count = self.nbytes();
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      byte_count == out.nbytes(),
+      InvalidArgument,
+      out,
+      "_d2h_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
+      byte_count,
+      out.nbytes());
+
+  DeviceAllocator* const device_allocator = get_device_allocator(src_device);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      device_allocator != nullptr,
+      NotFound,
+      out,
+      "_d2h_copy: no device allocator registered for device_type=%d",
+      static_cast<int>(src_device));
+
+  const Error copy_status = device_allocator->copy_device_to_host(
+      out.mutable_data_ptr(), self.const_data_ptr(), byte_count, src_index);
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      copy_status == Error::Ok,
+      Internal,
+      out,
+      "_d2h_copy: copy_device_to_host failed");
+  return out;
+}
+
+} // namespace executorch::runtime::native
+
+// Registers both portable kernels under the `et_copy` op namespace.
+// NOTE(review): backends/cuda/runtime/op__device_copy.cpp registers the same
+// two op names ("_h2d_copy.out", "_d2h_copy.out"); presumably the two
+// libraries must never be linked into one binary - confirm.
+EXECUTORCH_LIBRARY(
+    et_copy, "_h2d_copy.out", executorch::runtime::native::_h2d_copy_out);
+EXECUTORCH_LIBRARY(
+    et_copy, "_d2h_copy.out", executorch::runtime::native::_d2h_copy_out);
diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl
index 7df77570450..0034d345719 100644
--- a/kernels/portable/cpu/targets.bzl
+++ b/kernels/portable/cpu/targets.bzl
@@ -75,6 +75,24 @@ def define_common_targets():
         ],
     )
 
+    # Kernels for the et_copy ops `_h2d_copy.out` / `_d2h_copy.out`, which move
+    # tensor bytes between CPU and device memory through DeviceAllocator.
+    runtime.cxx_library(
+        name = "op__device_copy",
+        srcs = ["op__device_copy.cpp"],
+        visibility = ["PUBLIC"],
+        # Op registration (file-scope EXECUTORCH_LIBRARY calls) needs a global constructor.
+        compiler_flags = ["-Wno-global-constructors"],
+        deps = [
+            "//executorch/runtime/core:device_allocator",
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/kernel:kernel_includes",
+            "//executorch/extension/kernel_util:kernel_util",
+        ],
+        # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+        link_whole = True,
+    )
+
     # Used for dtype selective build. Collect source and header files.
     runtime.filegroup(
         name = "portable_source_files",
diff --git a/kernels/portable/cpu/test/op__device_copy_test.cpp b/kernels/portable/cpu/test/op__device_copy_test.cpp
new file mode 100644
index 00000000000..78cf06ae934
--- /dev/null
+++ b/kernels/portable/cpu/test/op__device_copy_test.cpp
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * Tests for et_copy._h2d_copy.out and et_copy._d2h_copy.out runtime kernels.
+ *
+ * Uses a MockDeviceAllocator to verify that the kernels correctly call
+ * copy_host_to_device / copy_device_to_host via the DeviceAllocator interface,
+ * and that device type is inferred from tensor metadata.
+ */
+
+#include <cstring>
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/portable_type/tensor_impl.h>
+#include <executorch/runtime/kernel/kernel_runtime_context.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <gtest/gtest.h>
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::aten::TensorImpl;
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::Error;
+using executorch::runtime::KernelRuntimeContext;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+using TensorShapeDynamism = executorch::runtime::TensorShapeDynamism;
+
+// Kernels under test: declared here, defined in op__device_copy.cpp.
+namespace executorch::runtime::native {
+Tensor&
+_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out);
+Tensor&
+_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out);
+} // namespace executorch::runtime::native
+
+namespace {
+
+// Stand-in for a CUDA DeviceAllocator. "Device" buffers are ordinary host
+// memory, so copies are plain std::memcpy calls whose arguments are recorded.
+class MockDeviceAllocator final : public DeviceAllocator {
+ public:
+  Result<void*> allocate(size_t /*nbytes*/, DeviceIndex /*index*/) override {
+    return Error::NotSupported;
+  }
+  void deallocate(void* /*ptr*/, DeviceIndex /*index*/) override {}
+
+  Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    ++h2d_call_count_;
+    last_h2d_nbytes_ = nbytes;
+    last_h2d_device_index_ = index;
+    std::memcpy(dst, src, nbytes); // real copy, so tests can check the data
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    ++d2h_call_count_;
+    last_d2h_nbytes_ = nbytes;
+    last_d2h_device_index_ = index;
+    std::memcpy(dst, src, nbytes); // real copy, so tests can check the data
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return DeviceType::CUDA;
+  }
+
+  int h2d_call_count_ = 0;
+  int d2h_call_count_ = 0;
+  size_t last_h2d_nbytes_ = 0;
+  size_t last_d2h_nbytes_ = 0;
+  DeviceIndex last_h2d_device_index_ = -1;
+  DeviceIndex last_d2h_device_index_ = -1;
+};
+
+} // namespace
+
+// One mock shared by every test; SetUpTestSuite() registers it once.
+static MockDeviceAllocator g_mock_cuda;
+
+class OpDeviceCopyTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(&g_mock_cuda);
+  }
+
+  // Wipe the mock's recorded calls so tests cannot observe each other.
+  void SetUp() override {
+    MockDeviceAllocator& mock = g_mock_cuda;
+    mock.h2d_call_count_ = mock.d2h_call_count_ = 0;
+    mock.last_h2d_nbytes_ = mock.last_d2h_nbytes_ = 0;
+    mock.last_h2d_device_index_ = mock.last_d2h_device_index_ = -1;
+  }
+};
+
+TEST_F(OpDeviceCopyTest, H2dCopyCopiesDataAndCallsAllocator) {
+  // Layout shared by both tensors: four contiguous floats in one dimension.
+  int32_t sizes[] = {4};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+
+  // The source is a CPU tensor carrying the values to transfer.
+  float host_values[] = {1.0f, 2.0f, 3.0f, 4.0f};
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      host_values,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+
+  // The destination is tagged CUDA but backed by zeroed host memory.
+  float device_values[] = {0.0f, 0.0f, 0.0f, 0.0f};
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      device_values,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor dst(&dst_impl);
+
+  KernelRuntimeContext ctx{};
+  Tensor& returned = executorch::runtime::native::_h2d_copy_out(ctx, src, dst);
+
+  // Exactly one H2D call, sized to the whole tensor, aimed at device 0.
+  EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 4 * sizeof(float));
+  EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 0);
+
+  // The mock performs a real memcpy, so every value must have arrived.
+  for (size_t i = 0; i < 4; ++i) {
+    EXPECT_EQ(device_values[i], host_values[i]);
+  }
+
+  EXPECT_EQ(&returned, &dst); // the kernel returns `out` itself
+}
+
+TEST_F(OpDeviceCopyTest, D2hCopyCopiesDataAndCallsAllocator) {
+  // Layout shared by both tensors: four contiguous floats in one dimension.
+  int32_t sizes[] = {4};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+
+  // The source is tagged CUDA but backed by host memory holding the values.
+  float device_values[] = {5.0f, 6.0f, 7.0f, 8.0f};
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      device_values,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor src(&src_impl);
+
+  // The destination is a zero-filled CPU tensor.
+  float host_values[] = {0.0f, 0.0f, 0.0f, 0.0f};
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      host_values,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor dst(&dst_impl);
+
+  KernelRuntimeContext ctx{};
+  Tensor& returned = executorch::runtime::native::_d2h_copy_out(ctx, src, dst);
+
+  // Exactly one D2H call, sized to the whole tensor, reading from device 0.
+  EXPECT_EQ(g_mock_cuda.d2h_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_d2h_nbytes_, 4 * sizeof(float));
+  EXPECT_EQ(g_mock_cuda.last_d2h_device_index_, 0);
+
+  // The mock performs a real memcpy, so every value must have arrived.
+  for (size_t i = 0; i < 4; ++i) {
+    EXPECT_EQ(host_values[i], device_values[i]);
+  }
+  EXPECT_EQ(&returned, &dst); // the kernel returns `out` itself
+}
+
+TEST_F(OpDeviceCopyTest, H2dCopyWithDeviceIndex1) {
+  // The destination's device index must reach the allocator unchanged.
+  int32_t sizes[] = {1};
+  uint8_t dim_order[] = {0};
+  int32_t strides[] = {1};
+  float host_value[] = {1.0f};
+  float device_value[] = {0.0f};
+
+  TensorImpl src_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      host_value,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+
+  // Destination placed on device index 1 (e.g., cuda:1).
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      1,
+      sizes,
+      device_value,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      1);
+  Tensor dst(&dst_impl);
+
+  KernelRuntimeContext ctx{};
+  executorch::runtime::native::_h2d_copy_out(ctx, src, dst);
+
+  EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 1);
+}
+
+TEST_F(OpDeviceCopyTest, H2dCopyMultidimensionalTensor) {
+  // A contiguous 2-D tensor of shape [2, 3], i.e. six floats in total.
+  int32_t sizes[] = {2, 3};
+  uint8_t dim_order[] = {0, 1};
+  int32_t strides[] = {3, 1};
+  float host_values[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  float device_values[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+
+  TensorImpl src_impl(
+      ScalarType::Float,
+      2,
+      sizes,
+      host_values,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CPU,
+      0);
+  Tensor src(&src_impl);
+  // Destination is tagged CUDA but backed by the zeroed host array.
+  TensorImpl dst_impl(
+      ScalarType::Float,
+      2,
+      sizes,
+      device_values,
+      dim_order,
+      strides,
+      TensorShapeDynamism::STATIC,
+      DeviceType::CUDA,
+      0);
+  Tensor dst(&dst_impl);
+
+  KernelRuntimeContext ctx{};
+  executorch::runtime::native::_h2d_copy_out(ctx, src, dst);
+
+  EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 6 * sizeof(float));
+  // All six elements must have been transferred in order.
+  for (size_t i = 0; i < 6; ++i) {
+    EXPECT_EQ(device_values[i], host_values[i]);
+  }
+}
diff --git a/kernels/portable/cpu/test/targets.bzl b/kernels/portable/cpu/test/targets.bzl
index a40f6da1931..962616c0785 100644
--- a/kernels/portable/cpu/test/targets.bzl
+++ b/kernels/portable/cpu/test/targets.bzl
@@ -27,3 +27,16 @@ def define_common_targets():
         srcs = ["vec_ops_test.cpp"],
         deps = ["//executorch/kernels/portable/cpu:vec_ops"],
     )
+
+    runtime.cxx_test(
+        name = "op__device_copy_test",
+        srcs = ["op__device_copy_test.cpp"],  # drives the kernels through a mock DeviceAllocator
+        deps = [
+            "//executorch/kernels/portable/cpu:op__device_copy",  # kernels under test
+            "//executorch/runtime/core:device_allocator",
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/portable_type:portable_type",
+            "//executorch/runtime/kernel:kernel_runtime_context",
+            "//executorch/runtime/platform:platform",
+        ],
+    )