diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index c8449a95718..3547423d8c9 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -101,6 +101,28 @@ runtime.cxx_library( ], ) +runtime.cxx_library( + name = "op__device_copy", + srcs = [ + "op__device_copy.cpp", + ], + # Constructor needed for op registration. + compiler_flags = ["-Wno-global-constructors"], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + visibility = ["PUBLIC"], + deps = [ + ":cuda_allocator", + "//executorch/extension/kernel_util:kernel_util", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/kernel:kernel_includes", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], +) + runtime.cxx_library( name = "cuda_backend", srcs = [ @@ -120,6 +142,7 @@ runtime.cxx_library( ":cuda_platform", ":runtime_shims", ":cuda_allocator", + ":op__device_copy", ":cuda_platform", "//executorch/backends/aoti:aoti_common_slim", "//executorch/backends/aoti/slim/core:slimtensor", diff --git a/backends/cuda/runtime/op__device_copy.cpp b/backends/cuda/runtime/op__device_copy.cpp new file mode 100644 index 00000000000..cb6cc2753fb --- /dev/null +++ b/backends/cuda/runtime/op__device_copy.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +using executorch::aten::Tensor; +using executorch::runtime::Error; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::etensor::DeviceType; + +Tensor& _h2d_copy_out( + KernelRuntimeContext& ctx, + const Tensor& self, + Tensor& out) { + const auto* self_impl = self.unsafeGetTensorImpl(); + const auto* out_impl = out.unsafeGetTensorImpl(); + const auto device_index = out_impl->device_index(); + + ET_KERNEL_CHECK_MSG( + ctx, + self_impl->device_type() == DeviceType::CPU, + InvalidArgument, + out, + "_h2d_copy: source tensor must be on CPU, got device_type=%d", + static_cast(self_impl->device_type())); + + ET_KERNEL_CHECK_MSG( + ctx, + out_impl->device_type() == DeviceType::CUDA, + InvalidArgument, + out, + "_h2d_copy: destination tensor must be on CUDA, got device_type=%d", + static_cast(out_impl->device_type())); + + const size_t nbytes = self.nbytes(); + ET_KERNEL_CHECK_MSG( + ctx, + nbytes == out.nbytes(), + InvalidArgument, + out, + "_h2d_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu", + nbytes, + out.nbytes()); + + const Error err = CudaAllocator::instance().copy_host_to_device( + out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index); + ET_KERNEL_CHECK_MSG( + ctx, + err == Error::Ok, + Internal, + out, + "_h2d_copy: copy_host_to_device failed"); + + return out; +} + +Tensor& _d2h_copy_out( + KernelRuntimeContext& ctx, + const Tensor& self, + Tensor& out) { + const auto* self_impl = self.unsafeGetTensorImpl(); + const auto* out_impl = out.unsafeGetTensorImpl(); + const auto device_index = self_impl->device_index(); + + ET_KERNEL_CHECK_MSG( + ctx, + self_impl->device_type() == DeviceType::CUDA, + InvalidArgument, + out, + "_d2h_copy: source tensor must be on CUDA, got device_type=%d", + static_cast(self_impl->device_type())); + + ET_KERNEL_CHECK_MSG( + ctx, + out_impl->device_type() == DeviceType::CPU, + InvalidArgument, + out, + "_d2h_copy: destination tensor must be on CPU, got device_type=%d", + static_cast(out_impl->device_type())); + + const size_t nbytes = self.nbytes(); + ET_KERNEL_CHECK_MSG( + ctx, + nbytes == out.nbytes(), + InvalidArgument, + out, + "_d2h_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu", + nbytes, + out.nbytes()); + + const Error err = CudaAllocator::instance().copy_device_to_host( + out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index); + ET_KERNEL_CHECK_MSG( + ctx, + err == Error::Ok, + Internal, + out, + "_d2h_copy: copy_device_to_host failed"); + + return out; +} + +} // namespace executorch::backends::cuda + +EXECUTORCH_LIBRARY( + et_copy, + "_h2d_copy.out", + executorch::backends::cuda::_h2d_copy_out); +EXECUTORCH_LIBRARY( + et_copy, + "_d2h_copy.out", + executorch::backends::cuda::_d2h_copy_out); diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl index b68043f7feb..8d915499b10 100644 --- a/backends/cuda/runtime/shims/tests/targets.bzl +++ b/backends/cuda/runtime/shims/tests/targets.bzl @@ -42,3 +42,24 @@ def define_common_targets(): cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle") cuda_shim_cpp_unittest("aoti_torch_item_bool") cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out") + + cpp_unittest( + name = "test_op__device_copy", + srcs = ["test_op__device_copy.cpp"], + deps = [ + "//executorch/backends/cuda/runtime:cuda_allocator", + "//executorch/backends/cuda/runtime:op__device_copy", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/portable_type:portable_type", + "//executorch/runtime/kernel:kernel_runtime_context", + "//executorch/runtime/platform:platform", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], + keep_gpu_sections = True, + remote_execution = re_test_utils.remote_execution( + platform = "gpu-remote-execution", + ), + ) diff --git a/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp new file mode 100644 index 00000000000..1c4456c27e6 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp @@ -0,0 +1,192 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include + +#if (defined(__has_feature) && __has_feature(address_sanitizer)) || \ + defined(__SANITIZE_ADDRESS__) +#include +#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 1 +#else +#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 0 +#endif + +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::backends::cuda::CudaAllocator; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::TensorShapeDynamism; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +namespace executorch::backends::cuda { +Tensor& _h2d_copy_out( + KernelRuntimeContext& ctx, + const Tensor& self, + Tensor& out); +Tensor& _d2h_copy_out( + KernelRuntimeContext& ctx, + const Tensor& self, + Tensor& out); +} // namespace executorch::backends::cuda + +namespace { + +struct CudaDeleter { + void operator()(void* ptr) const { + CudaAllocator::instance().deallocate(ptr, device_index); + } + + DeviceIndex device_index = 0; +}; + +using CudaPtr = std::unique_ptr; + +CudaPtr allocate_cuda(size_t nbytes, DeviceIndex device_index = 0) { + auto result = CudaAllocator::instance().allocate(nbytes, device_index); + EXPECT_TRUE(result.ok()) << "CudaAllocator::allocate failed"; + return CudaPtr( + result.ok() ? result.get() : nullptr, CudaDeleter{device_index}); +} + +bool is_cuda_available() { +#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE + __lsan_disable(); +#endif + int device_count = 0; + const cudaError_t err = cudaGetDeviceCount(&device_count); +#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE + __lsan_enable(); +#endif + return err == cudaSuccess && device_count > 0; +} + +std::vector copy_cuda_to_host(const void* device_ptr, size_t numel) { + std::vector host(numel); + const cudaError_t err = cudaMemcpy( + host.data(), device_ptr, numel * sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy D2H failed"; + return host; +} + +void copy_host_to_cuda(const std::vector& host, void* device_ptr) { + const cudaError_t err = cudaMemcpy( + device_ptr, + host.data(), + host.size() * sizeof(float), + cudaMemcpyHostToDevice); + EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy H2D failed"; +} + +class CudaDeviceCopyOpTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + } + + void SetUp() override { + if (!is_cuda_available()) { + GTEST_SKIP() << "CUDA not available, skipping CUDA device copy op tests"; + } + } +}; + +} // namespace + +TEST_F(CudaDeviceCopyOpTest, H2dCopyUsesCudaAllocatorCopy) { + std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f}; + auto device_data = allocate_cuda(src_data.size() * sizeof(float)); + ASSERT_NE(device_data.get(), nullptr); + + int32_t sizes[] = {static_cast(src_data.size())}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data.data(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + device_data.get(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + KernelRuntimeContext ctx; + Tensor& result = executorch::backends::cuda::_h2d_copy_out(ctx, src, dst); + + EXPECT_EQ(&result, &dst); + EXPECT_EQ(copy_cuda_to_host(device_data.get(), src_data.size()), src_data); +} + +TEST_F(CudaDeviceCopyOpTest, D2hCopyUsesCudaAllocatorCopy) { + const std::vector expected = {5.0f, 6.0f, 7.0f, 8.0f}; + auto device_data = allocate_cuda(expected.size() * sizeof(float)); + ASSERT_NE(device_data.get(), nullptr); + copy_host_to_cuda(expected, device_data.get()); + + std::vector dst_data(expected.size(), 0.0f); + int32_t sizes[] = {static_cast(expected.size())}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + device_data.get(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data.data(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor dst(&dst_impl); + + KernelRuntimeContext ctx; + Tensor& result = executorch::backends::cuda::_d2h_copy_out(ctx, src, dst); + + EXPECT_EQ(&result, &dst); + EXPECT_EQ(dst_data, expected); +} diff --git a/kernels/portable/cpu/op__device_copy.cpp b/kernels/portable/cpu/op__device_copy.cpp new file mode 100644 index 00000000000..dd5d16cd204 --- /dev/null +++ b/kernels/portable/cpu/op__device_copy.cpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Runtime kernels for et_copy._h2d_copy and et_copy._d2h_copy ops. + * + * These ops transfer tensor data between CPU and device memory using + * the DeviceAllocator interface. The device type is inferred from the + * tensor metadata (out.device_type() for H2D, self.device_type() for D2H), + * which was set during AOT serialization by PropagateDevicePass. + */ + +#include +#include +#include +#include + +namespace executorch::runtime::native { + +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; + +/** + * Copies tensor data from host (CPU) memory to device memory. + * + * self: source tensor on CPU + * out: destination tensor on device (memory-planned by runtime) + * + * The device type and index are inferred from out's TensorImpl metadata. + */ +Tensor& +_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) { + auto device_type = out.unsafeGetTensorImpl()->device_type(); + auto device_index = out.unsafeGetTensorImpl()->device_index(); + + ET_KERNEL_CHECK_MSG( + ctx, + self.unsafeGetTensorImpl()->device_type() == etensor::DeviceType::CPU, + InvalidArgument, + out, + "_h2d_copy: source tensor must be on CPU, got device_type=%d", + static_cast(self.unsafeGetTensorImpl()->device_type())); + + ET_KERNEL_CHECK_MSG( + ctx, + device_type != etensor::DeviceType::CPU, + InvalidArgument, + out, + "_h2d_copy: destination tensor must be on a non-CPU device"); + + auto nbytes = self.nbytes(); + ET_KERNEL_CHECK_MSG( + ctx, + nbytes == out.nbytes(), + InvalidArgument, + out, + "_h2d_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu", + nbytes, + out.nbytes()); + + DeviceAllocator* allocator = get_device_allocator(device_type); + ET_KERNEL_CHECK_MSG( + ctx, + allocator != nullptr, + NotFound, + out, + "_h2d_copy: no device allocator registered for device_type=%d", + static_cast(device_type)); + + Error err = allocator->copy_host_to_device( + out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index); + ET_KERNEL_CHECK_MSG( + ctx, + err == Error::Ok, + Internal, + out, + "_h2d_copy: copy_host_to_device failed"); + + return out; +} + +/** + * Copies tensor data from device memory to host (CPU) memory. + * + * self: source tensor on device + * out: destination tensor on CPU (memory-planned by runtime) + * + * The device type and index are inferred from self's TensorImpl metadata. + */ +Tensor& +_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) { + auto device_type = self.unsafeGetTensorImpl()->device_type(); + auto device_index = self.unsafeGetTensorImpl()->device_index(); + + ET_KERNEL_CHECK_MSG( + ctx, + device_type != etensor::DeviceType::CPU, + InvalidArgument, + out, + "_d2h_copy: source tensor must be on a non-CPU device"); + + ET_KERNEL_CHECK_MSG( + ctx, + out.unsafeGetTensorImpl()->device_type() == etensor::DeviceType::CPU, + InvalidArgument, + out, + "_d2h_copy: destination tensor must be on CPU, got device_type=%d", + static_cast(out.unsafeGetTensorImpl()->device_type())); + + auto nbytes = self.nbytes(); + ET_KERNEL_CHECK_MSG( + ctx, + nbytes == out.nbytes(), + InvalidArgument, + out, + "_d2h_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu", + nbytes, + out.nbytes()); + + DeviceAllocator* allocator = get_device_allocator(device_type); + ET_KERNEL_CHECK_MSG( + ctx, + allocator != nullptr, + NotFound, + out, + "_d2h_copy: no device allocator registered for device_type=%d", + static_cast(device_type)); + + Error err = allocator->copy_device_to_host( + out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index); + ET_KERNEL_CHECK_MSG( + ctx, + err == Error::Ok, + Internal, + out, + "_d2h_copy: copy_device_to_host failed"); + + return out; +} + +} // namespace executorch::runtime::native + +EXECUTORCH_LIBRARY( + et_copy, + "_h2d_copy.out", + executorch::runtime::native::_h2d_copy_out); +EXECUTORCH_LIBRARY( + et_copy, + "_d2h_copy.out", + executorch::runtime::native::_d2h_copy_out); diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl index 7df77570450..0034d345719 100644 --- a/kernels/portable/cpu/targets.bzl +++ b/kernels/portable/cpu/targets.bzl @@ -75,6 +75,24 @@ def define_common_targets(): ], ) + # Device copy ops (h2d_copy, d2h_copy) for transferring data between + # CPU and device memory. Uses DeviceAllocator interface. + runtime.cxx_library( + name = "op__device_copy", + srcs = ["op__device_copy.cpp"], + visibility = ["PUBLIC"], + # Constructor needed for op registration. + compiler_flags = ["-Wno-global-constructors"], + deps = [ + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/extension/kernel_util:kernel_util", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + ) + # Used for dtype selective build. Collect source and header files. runtime.filegroup( name = "portable_source_files", diff --git a/kernels/portable/cpu/test/op__device_copy_test.cpp b/kernels/portable/cpu/test/op__device_copy_test.cpp new file mode 100644 index 00000000000..78cf06ae934 --- /dev/null +++ b/kernels/portable/cpu/test/op__device_copy_test.cpp @@ -0,0 +1,294 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Tests for et_copy._h2d_copy.out and et_copy._d2h_copy.out runtime kernels. + * + * Uses a MockDeviceAllocator to verify that the kernels correctly call + * copy_host_to_device / copy_device_to_host via the DeviceAllocator interface, + * and that device type is inferred from tensor metadata. + */ + +#include + +#include +#include +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::runtime::DeviceAllocator; +using executorch::runtime::Error; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::register_device_allocator; +using executorch::runtime::Result; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +using TensorShapeDynamism = executorch::runtime::TensorShapeDynamism; + +// Forward declare the kernel functions from op__device_copy.cpp +namespace executorch::runtime::native { +Tensor& +_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out); +Tensor& +_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out); +} // namespace executorch::runtime::native + +namespace { + +class MockDeviceAllocator : public DeviceAllocator { + public: + Result allocate(size_t nbytes, DeviceIndex index) override { + return Error::NotSupported; + } + + void deallocate(void* ptr, DeviceIndex index) override {} + + Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + h2d_call_count_++; + last_h2d_nbytes_ = nbytes; + last_h2d_device_index_ = index; + // Actually copy so we can verify data + std::memcpy(dst, src, nbytes); + return Error::Ok; + } + + Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + d2h_call_count_++; + last_d2h_nbytes_ = nbytes; + last_d2h_device_index_ = index; + std::memcpy(dst, src, nbytes); + return Error::Ok; + } + + DeviceType device_type() const override { + return DeviceType::CUDA; + } + + int h2d_call_count_ = 0; + int d2h_call_count_ = 0; + size_t last_h2d_nbytes_ = 0; + size_t last_d2h_nbytes_ = 0; + DeviceIndex last_h2d_device_index_ = -1; + DeviceIndex last_d2h_device_index_ = -1; +}; + +} // namespace + +static MockDeviceAllocator g_mock_cuda; + +class OpDeviceCopyTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + register_device_allocator(&g_mock_cuda); + } + + void SetUp() override { + g_mock_cuda.h2d_call_count_ = 0; + g_mock_cuda.d2h_call_count_ = 0; + g_mock_cuda.last_h2d_nbytes_ = 0; + g_mock_cuda.last_d2h_nbytes_ = 0; + g_mock_cuda.last_h2d_device_index_ = -1; + g_mock_cuda.last_d2h_device_index_ = -1; + } +}; + +TEST_F(OpDeviceCopyTest, H2dCopyCopiesDataAndCallsAllocator) { + // Set up a CPU source tensor with known data. + float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int32_t sizes[] = {4}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + // Set up a CUDA destination tensor (simulated with host memory). + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f}; + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + KernelRuntimeContext ctx{}; + Tensor& result = executorch::runtime::native::_h2d_copy_out(ctx, src, dst); + + // Verify the allocator was called correctly. + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 4 * sizeof(float)); + EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 0); + + // Verify data was copied (mock does a real memcpy). + EXPECT_EQ(dst_data[0], 1.0f); + EXPECT_EQ(dst_data[1], 2.0f); + EXPECT_EQ(dst_data[2], 3.0f); + EXPECT_EQ(dst_data[3], 4.0f); + + // Verify return value is the out tensor. + EXPECT_EQ(&result, &dst); +} + +TEST_F(OpDeviceCopyTest, D2hCopyCopiesDataAndCallsAllocator) { + // Set up a CUDA source tensor with known data. + float src_data[] = {5.0f, 6.0f, 7.0f, 8.0f}; + int32_t sizes[] = {4}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor src(&src_impl); + + // Set up a CPU destination tensor. + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f}; + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor dst(&dst_impl); + + KernelRuntimeContext ctx{}; + Tensor& result = executorch::runtime::native::_d2h_copy_out(ctx, src, dst); + + // Verify the allocator was called correctly. + EXPECT_EQ(g_mock_cuda.d2h_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_d2h_nbytes_, 4 * sizeof(float)); + EXPECT_EQ(g_mock_cuda.last_d2h_device_index_, 0); + + // Verify data was copied. + EXPECT_EQ(dst_data[0], 5.0f); + EXPECT_EQ(dst_data[1], 6.0f); + EXPECT_EQ(dst_data[2], 7.0f); + EXPECT_EQ(dst_data[3], 8.0f); + + EXPECT_EQ(&result, &dst); +} + +TEST_F(OpDeviceCopyTest, H2dCopyWithDeviceIndex1) { + // Verify device_index is correctly forwarded to the allocator. + float src_data[] = {1.0f}; + float dst_data[] = {0.0f}; + int32_t sizes[] = {1}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + // Device index = 1 (e.g., cuda:1) + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 1); + Tensor dst(&dst_impl); + + KernelRuntimeContext ctx{}; + executorch::runtime::native::_h2d_copy_out(ctx, src, dst); + + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 1); +} + +TEST_F(OpDeviceCopyTest, H2dCopyMultidimensionalTensor) { + // Test with a 2D tensor [2, 3]. + float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + int32_t sizes[] = {2, 3}; + uint8_t dim_order[] = {0, 1}; + int32_t strides[] = {3, 1}; + + TensorImpl src_impl( + ScalarType::Float, + 2, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 2, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + KernelRuntimeContext ctx{}; + executorch::runtime::native::_h2d_copy_out(ctx, src, dst); + + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 6 * sizeof(float)); + + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(dst_data[i], src_data[i]); + } +} diff --git a/kernels/portable/cpu/test/targets.bzl b/kernels/portable/cpu/test/targets.bzl index a40f6da1931..962616c0785 100644 --- a/kernels/portable/cpu/test/targets.bzl +++ b/kernels/portable/cpu/test/targets.bzl @@ -27,3 +27,16 @@ def define_common_targets(): srcs = ["vec_ops_test.cpp"], deps = ["//executorch/kernels/portable/cpu:vec_ops"], ) + + runtime.cxx_test( + name = "op__device_copy_test", + srcs = ["op__device_copy_test.cpp"], + deps = [ + "//executorch/kernels/portable/cpu:op__device_copy", + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/portable_type:portable_type", + "//executorch/runtime/kernel:kernel_runtime_context", + "//executorch/runtime/platform:platform", + ], + )