diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl index c8bf2847dcf..6a5c40f9857 100644 --- a/extension/tensor/targets.bzl +++ b/extension/tensor/targets.bzl @@ -24,6 +24,7 @@ def define_common_targets(): ], visibility = ["PUBLIC"], deps = [ + "//executorch/runtime/core:device_allocator", "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, ], diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp index a6ba6018333..a1c5c3c6fe7 100644 --- a/extension/tensor/tensor_ptr.cpp +++ b/extension/tensor/tensor_ptr.cpp @@ -12,6 +12,7 @@ #include +#include #include namespace executorch { @@ -25,6 +26,9 @@ namespace { * ensures that they are managed together and have the same lifetime as the * Tensor. When the Tensor is destroyed, the Storage structure ensures * proper cleanup of the associated metadata and data if needed. + * + * For device tensors, the data pointer points to device memory; the deleter + * is responsible for freeing it through the appropriate DeviceAllocator. */ struct Storage final { executorch::aten::TensorImpl tensor_impl; @@ -47,6 +51,11 @@ struct Storage final { strides(std::move(strides)), deleter(std::move(deleter)) {} + Storage(const Storage&) = delete; + Storage& operator=(const Storage&) = delete; + Storage(Storage&&) = delete; + Storage& operator=(Storage&&) = delete; + ~Storage() { if (deleter) { deleter(tensor_impl.mutable_data()); @@ -63,7 +72,9 @@ TensorPtr make_tensor_ptr( std::vector strides, executorch::aten::ScalarType type, executorch::aten::TensorShapeDynamism dynamism, - std::function deleter) { + std::function deleter, + runtime::etensor::DeviceType device_type, + runtime::etensor::DeviceIndex device_index) { const auto dim = sizes.size(); ET_CHECK_MSG( dim_order.empty() || dim_order.size() == dim, @@ -111,17 +122,25 @@ TensorPtr make_tensor_ptr( data, dim_order.data(), strides.data(), - dim > 0 ? 
dynamism : executorch::aten::TensorShapeDynamism::STATIC); + dim > 0 ? dynamism : executorch::aten::TensorShapeDynamism::STATIC, + device_type, + device_index); auto storage = std::make_shared( std::move(tensor_impl), std::move(sizes), std::move(dim_order), std::move(strides), std::move(deleter)); - const auto tensor_ptr = &storage->tensor; + const auto raw_tensor_ptr = &storage->tensor; return std::shared_ptr( - std::move(storage), tensor_ptr); + std::move(storage), raw_tensor_ptr); #else + ET_CHECK_MSG( + device_type == runtime::etensor::DeviceType::CPU, + "USE_ATEN_LIB build does not support non-CPU device tensors via make_tensor_ptr; " + "got device_type=%d. Use the ExecuTorch portable build for device tensor support.", + static_cast(device_type)); + (void)device_index; auto options = c10::TensorOptions() .dtype(c10::scalarTypeToTypeMeta(type)) .device(c10::kCPU); @@ -271,5 +290,120 @@ runtime::Error resize_tensor_ptr( sizes.data(), sizes.size())); } +// ---- Device tensor helpers ---- +// +// These helpers are only meaningful in the ExecuTorch portable build. +// USE_ATEN_LIB cannot create on-device tensors via make_tensor_ptr, so cloning +// to/from a device tensor is intentionally unsupported in that build. 
+
+#ifndef USE_ATEN_LIB
+
+TensorPtr clone_tensor_ptr_to_device(
+    const TensorPtr& cpu_tensor,
+    runtime::etensor::DeviceType device_type,
+    runtime::etensor::DeviceIndex device_index) {
+  ET_CHECK_MSG(
+      device_type != runtime::etensor::DeviceType::CPU,
+      "Target device must not be CPU; use clone_tensor_ptr for CPU-to-CPU copies.");
+
+  auto* allocator = runtime::get_device_allocator(device_type);
+  ET_CHECK_MSG(
+      allocator != nullptr,
+      "No device allocator registered for device type %d",
+      static_cast<int>(device_type));
+
+  const auto nbytes = cpu_tensor->nbytes();
+  const auto* cpu_data = cpu_tensor->const_data_ptr();
+  ET_CHECK_MSG(cpu_data != nullptr, "Source tensor has no data.");
+
+  auto result = allocator->allocate(nbytes, device_index);
+  ET_CHECK_MSG(result.ok(), "Failed to allocate device memory.");
+  void* device_data = result.get();
+
+  auto err = allocator->copy_host_to_device(
+      device_data, cpu_data, nbytes, device_index);
+  ET_CHECK_MSG(err == runtime::Error::Ok, "Host-to-device copy failed.");
+
+  std::vector<executorch::aten::SizesType> sizes(
+      cpu_tensor->sizes().begin(), cpu_tensor->sizes().end());
+  std::vector<executorch::aten::DimOrderType> dim_order(
+      cpu_tensor->dim_order().begin(), cpu_tensor->dim_order().end());
+  std::vector<executorch::aten::StridesType> strides(
+      cpu_tensor->strides().begin(), cpu_tensor->strides().end());
+
+  return make_tensor_ptr(
+      std::move(sizes),
+      device_data,
+      std::move(dim_order),
+      std::move(strides),
+      cpu_tensor->scalar_type(),
+      cpu_tensor->shape_dynamism(),
+      [allocator, device_index](void* ptr) {
+        allocator->deallocate(ptr, device_index);
+      },
+      device_type,
+      device_index);
+}
+
+TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor) {
+  const auto nbytes = device_tensor->nbytes();
+  const auto* device_data = device_tensor->const_data_ptr();
+  ET_CHECK_MSG(device_data != nullptr, "Source device tensor has no data.");
+
+  const auto device_type = device_tensor->device_type();
+  const auto device_index = device_tensor->device_index();
+  ET_CHECK_MSG(
+      device_type != runtime::etensor::DeviceType::CPU,
+      "Source tensor is already on CPU.");
+
+  auto* allocator = runtime::get_device_allocator(device_type);
+  ET_CHECK_MSG(
+      allocator != nullptr,
+      "No device allocator registered for device type %d",
+      static_cast<int>(device_type));
+
+  std::vector<uint8_t> cpu_data(nbytes);
+
+  auto err = allocator->copy_device_to_host(
+      cpu_data.data(), device_data, nbytes, device_index);
+  ET_CHECK_MSG(err == runtime::Error::Ok, "Device-to-host copy failed.");
+
+  std::vector<executorch::aten::SizesType> sizes(
+      device_tensor->sizes().begin(), device_tensor->sizes().end());
+  std::vector<executorch::aten::DimOrderType> dim_order(
+      device_tensor->dim_order().begin(), device_tensor->dim_order().end());
+  std::vector<executorch::aten::StridesType> strides(
+      device_tensor->strides().begin(), device_tensor->strides().end());
+  // Preserve the source's shape dynamism, as clone_tensor_ptr_to_device does.
+  return make_tensor_ptr(
+      std::move(sizes),
+      std::move(cpu_data),
+      std::move(dim_order),
+      std::move(strides),
+      device_tensor->scalar_type(),
+      device_tensor->shape_dynamism());
+}
+
+#else // USE_ATEN_LIB
+
+TensorPtr clone_tensor_ptr_to_device(
+    const TensorPtr& /*cpu_tensor*/,
+    runtime::etensor::DeviceType /*device_type*/,
+    runtime::etensor::DeviceIndex /*device_index*/) {
+  ET_CHECK_MSG(
+      false,
+      "clone_tensor_ptr_to_device is not supported in USE_ATEN_LIB builds; "
+      "make_tensor_ptr cannot create on-device aten tensors.");
+}
+
+TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& /*device_tensor*/) {
+  ET_CHECK_MSG(
+      false,
+      "clone_tensor_ptr_to_cpu is not supported in USE_ATEN_LIB builds; "
+      "make_tensor_ptr cannot create on-device aten tensors.");
+}
+
+#endif // USE_ATEN_LIB
+
 } // namespace extension
 } // namespace executorch
diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h
index 0ed06cbe021..1a534362108 100644
--- a/extension/tensor/tensor_ptr.h
+++ b/extension/tensor/tensor_ptr.h
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 
 C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
 
@@ -32,8 +33,13 @@ using
TensorPtr = std::shared_ptr; /** * Creates a TensorPtr that manages a Tensor with the specified properties. * + * The `device_type` and `device_index` parameters set the TensorImpl's device + * metadata only — no data is allocated or copied. The caller is responsible + * for ensuring `data` already lives on the requested device. To copy CPU data + * to a device, use `clone_tensor_ptr_to_device` instead. + * * @param sizes A vector specifying the size of each dimension. - * @param data A pointer to the data buffer. + * @param data A pointer to the data buffer (CPU or device, see device_type). * @param dim_order A vector specifying the order of dimensions. * @param strides A vector specifying the strides of the tensor. * @param type The scalar type of the tensor elements. @@ -41,6 +47,9 @@ using TensorPtr = std::shared_ptr; * @param deleter A custom deleter function for managing the lifetime of the * data buffer. If provided, this deleter will be called when the managed Tensor * object is destroyed. + * @param device_type The device on which `data` resides (default CPU). In + * USE_ATEN_LIB builds this must be CPU. + * @param device_index The device index for multi-device scenarios (default 0). * @return A TensorPtr that manages the newly created Tensor. */ TensorPtr make_tensor_ptr( @@ -52,18 +61,25 @@ TensorPtr make_tensor_ptr( executorch::aten::ScalarType::Float, const executorch::aten::TensorShapeDynamism dynamism = executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, - std::function deleter = nullptr); + std::function deleter = nullptr, + runtime::etensor::DeviceType device_type = + runtime::etensor::DeviceType::CPU, + runtime::etensor::DeviceIndex device_index = 0); /** * Creates a TensorPtr that manages a Tensor with the specified properties. * + * Convenience overload for the primary factory; see the primary overload for + * device semantics. + * * @param sizes A vector specifying the size of each dimension. - * @param data A pointer to the data buffer. 
+ * @param data A pointer to the data buffer (CPU or device, see device_type). * @param type The scalar type of the tensor elements. * @param dynamism Specifies the mutability of the tensor's shape. * @param deleter A custom deleter function for managing the lifetime of the - * data buffer. If provided, this deleter will be called when the managed Tensor - * object is destroyed. + * data buffer. + * @param device_type The device on which `data` resides (default CPU). + * @param device_index The device index for multi-device scenarios (default 0). * @return A TensorPtr that manages the newly created Tensor. */ inline TensorPtr make_tensor_ptr( @@ -73,9 +89,20 @@ inline TensorPtr make_tensor_ptr( executorch::aten::ScalarType::Float, const executorch::aten::TensorShapeDynamism dynamism = executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, - std::function deleter = nullptr) { + std::function deleter = nullptr, + runtime::etensor::DeviceType device_type = + runtime::etensor::DeviceType::CPU, + runtime::etensor::DeviceIndex device_index = 0) { return make_tensor_ptr( - std::move(sizes), data, {}, {}, type, dynamism, std::move(deleter)); + std::move(sizes), + data, + {}, + {}, + type, + dynamism, + std::move(deleter), + device_type, + device_index); } /** @@ -88,6 +115,9 @@ inline TensorPtr make_tensor_ptr( * specified `type`. This allows for flexible creation of tensors with data * vectors of one type and a different scalar type. * + * The result is always a CPU tensor. To move it to a device, use + * `clone_tensor_ptr_to_device`. + * * @tparam T The C++ type of the tensor elements, deduced from the vector. * @param sizes A vector specifying the size of each dimension. * @param data A vector containing the tensor's data. @@ -177,10 +207,10 @@ inline TensorPtr make_tensor_ptr( * * This template overload is specialized for cases where the tensor data is * provided as a vector. The scalar type is automatically deduced from the - * vector's data type. 
If the specified `type` differs from the deduced type of - * the vector's elements, and casting is allowed, the data will be cast to the - * specified `type`. This allows for flexible creation of tensors with data - * vectors of one type and a different scalar type. + * vector's data type. + * + * The result is always a CPU tensor. To move it to a device, use + * `clone_tensor_ptr_to_device`. * * @tparam T The C++ type of the tensor elements, deduced from the vector. * @param data A vector containing the tensor's data. @@ -209,11 +239,10 @@ inline TensorPtr make_tensor_ptr( * * This template overload is specialized for cases where the tensor data is * provided as an initializer list. The scalar type is automatically deduced - * from the initializer list's data type. If the specified `type` differs from - * the deduced type of the initializer list's elements, and casting is allowed, - * the data will be cast to the specified `type`. This allows for flexible - * creation of tensors with data vectors of one type and a different scalar - * type. + * from the initializer list's data type. + * + * The result is always a CPU tensor. To move it to a device, use + * `clone_tensor_ptr_to_device`. * * @tparam T The C++ type of the tensor elements, deduced from the initializer * list. @@ -252,11 +281,10 @@ inline TensorPtr make_tensor_ptr( * * This template overload allows creating a Tensor from an initializer list * of data. The scalar type is automatically deduced from the type of the - * initializer list's elements. If the specified `type` differs from - * the deduced type of the initializer list's elements, and casting is allowed, - * the data will be cast to the specified `type`. This allows for flexible - * creation of tensors with data vectors of one type and a different scalar - * type. + * initializer list's elements. + * + * The result is always a CPU tensor. To move it to a device, use + * `clone_tensor_ptr_to_device`. 
* * @tparam T The C++ type of the tensor elements, deduced from the initializer * list. @@ -299,7 +327,8 @@ inline TensorPtr make_tensor_ptr(T value) { * * This overload accepts a raw memory buffer stored in a std::vector * and a scalar type to interpret the data. The vector is managed, and the - * memory's lifetime is tied to the TensorImpl. + * memory's lifetime is tied to the TensorImpl. The result is always a CPU + * tensor. * * @param sizes A vector specifying the size of each dimension. * @param data A vector containing the raw memory for the tensor's data. @@ -321,9 +350,8 @@ TensorPtr make_tensor_ptr( /** * Creates a TensorPtr that manages a Tensor with the specified properties. * - * This overload accepts a raw memory buffer stored in a std::vector - * and a scalar type to interpret the data. The vector is managed, and the - * memory's lifetime is tied to the TensorImpl. + * Convenience overload for the raw-buffer factory; see above. The result is + * always a CPU tensor. * * @param sizes A vector specifying the size of each dimension. * @param data A vector containing the raw memory for the tensor's data. @@ -352,6 +380,9 @@ inline TensorPtr make_tensor_ptr( * configuration. If `dim_order` is empty but `strides` is provided, `dim_order` * is left empty so the core may infer it from the provided strides. * + * This overload always aliases — it never copies. To copy a tensor's data to + * a device, use `clone_tensor_ptr_to_device`. + * * @param tensor The source tensor to alias. * @param sizes Optional sizes override. * @param dim_order Optional dimension order override. @@ -411,6 +442,9 @@ inline TensorPtr make_tensor_ptr( * Convenience overload identical to make_tensor_ptr(*tensor_ptr, ...). * Keeps the original TensorPtr alive until the returned TensorPtr is destroyed. * + * This overload always aliases — it never copies. To copy a tensor's data to + * a device, use `clone_tensor_ptr_to_device`. + * * @param tensor_ptr The source tensor pointer to alias. 
* @param sizes Optional sizes override. * @param dim_order Optional dimension order override. @@ -498,6 +532,37 @@ runtime::Error resize_tensor_ptr( TensorPtr& tensor, const std::vector& sizes); +/** + * Clones a CPU TensorPtr to a device TensorPtr. + * + * Allocates device memory and copies the tensor data host-to-device via the + * DeviceAllocator registered for device_type. The returned TensorPtr owns the + * memory and frees it via that allocator on destruction. Portable build only: + * in USE_ATEN_LIB builds this always aborts. Any failure is a fatal ET_CHECK. + * + * @param cpu_tensor The source CPU tensor whose data will be copied. + * @param device_type The target device type (e.g., DeviceType::CUDA). Must not + * be CPU. + * @param device_index The target device index (default 0). + * @return A TensorPtr backed by device memory containing the copied data. + */ +TensorPtr clone_tensor_ptr_to_device( + const TensorPtr& cpu_tensor, + runtime::etensor::DeviceType device_type, + runtime::etensor::DeviceIndex device_index = 0); + +/** + * Clones a device TensorPtr to a CPU TensorPtr. + * + * Allocates host memory and copies the data device-to-host via the + * DeviceAllocator registered for the source tensor's own device type. Portable + * build only: aborts in USE_ATEN_LIB builds. Any failure is a fatal ET_CHECK. + * + * @param device_tensor The source device tensor whose data will be copied. + * @return A TensorPtr backed by CPU memory containing the copied data. 
+ */ +TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor); + } // namespace extension } // namespace executorch diff --git a/extension/tensor/test/targets.bzl b/extension/tensor/test/targets.bzl index 5bf8c7019b8..807e16ec8c1 100644 --- a/extension/tensor/test/targets.bzl +++ b/extension/tensor/test/targets.bzl @@ -21,3 +21,14 @@ def define_common_targets(): "//executorch/extension/tensor:tensor" + aten_suffix, ], ) + + runtime.cxx_test( + name = "tensor_ptr_device_test" + aten_suffix, + srcs = [ + "tensor_ptr_device_test.cpp", + ], + deps = [ + "//executorch/extension/tensor:tensor" + aten_suffix, + "//executorch/runtime/core:device_allocator", + ], + ) diff --git a/extension/tensor/test/tensor_ptr_device_test.cpp b/extension/tensor/test/tensor_ptr_device_test.cpp new file mode 100644 index 00000000000..0aa9e5f1b88 --- /dev/null +++ b/extension/tensor/test/tensor_ptr_device_test.cpp @@ -0,0 +1,415 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include +#include + +#include +#include +#include + +using namespace ::executorch::extension; +using namespace ::executorch::runtime; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +#ifndef USE_ATEN_LIB +// All device tensor helpers are intentionally unsupported in USE_ATEN_LIB +// builds (make_tensor_ptr cannot create on-device aten tensors), so the entire +// test fixture is gated to the portable build. + +namespace { + +// A fake device allocator that uses host memory (malloc/free/memcpy) to +// simulate device memory operations, enabling end-to-end data roundtrip +// verification without requiring actual device hardware. 
+class FakeDeviceAllocator : public DeviceAllocator { + public: + explicit FakeDeviceAllocator(DeviceType type) : type_(type) {} + + Result allocate( + size_t nbytes, + DeviceIndex /*index*/, + size_t /*alignment*/ = kDefaultAlignment) override { + void* ptr = std::malloc(nbytes); + if (!ptr) { + return Error::MemoryAllocationFailed; + } + allocate_count_++; + return ptr; + } + + void deallocate(void* ptr, DeviceIndex /*index*/) override { + std::free(ptr); + deallocate_count_++; + } + + Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex /*index*/) override { + std::memcpy(dst, src, nbytes); + h2d_count_++; + return Error::Ok; + } + + Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex /*index*/) override { + std::memcpy(dst, src, nbytes); + d2h_count_++; + return Error::Ok; + } + + DeviceType device_type() const override { + return type_; + } + + void reset_counters() { + allocate_count_ = 0; + deallocate_count_ = 0; + h2d_count_ = 0; + d2h_count_ = 0; + } + + int allocate_count_ = 0; + int deallocate_count_ = 0; + int h2d_count_ = 0; + int d2h_count_ = 0; + + private: + DeviceType type_; +}; + +// Function-static singleton avoids non-const global allocator state. +FakeDeviceAllocator& fake_cuda_allocator() { + static FakeDeviceAllocator allocator(DeviceType::CUDA); + return allocator; +} + +// One-shot registration; the constructor runs at static init time and the +// instance itself is immutable afterwards. 
+struct RegisterFakeAllocator { + RegisterFakeAllocator() { + register_device_allocator(&fake_cuda_allocator()); + } +}; +const RegisterFakeAllocator s_register; + +} // namespace + +class TensorPtrDeviceTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + runtime_init(); + } + + void SetUp() override { + fake_cuda_allocator().reset_counters(); + } +}; + +TEST_F(TensorPtrDeviceTest, CpuToDeviceTensor) { + auto cpu_tensor = + make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 2); + EXPECT_EQ(device_tensor->size(0), 2); + EXPECT_EQ(device_tensor->size(1), 3); + EXPECT_EQ(device_tensor->scalar_type(), executorch::aten::ScalarType::Float); + EXPECT_NE(device_tensor->const_data_ptr(), nullptr); + EXPECT_NE(device_tensor->const_data_ptr(), cpu_tensor->const_data_ptr()); + + EXPECT_EQ( + device_tensor->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA); + EXPECT_EQ(device_tensor->unsafeGetTensorImpl()->device_index(), 0); + + EXPECT_EQ(fake_cuda_allocator().allocate_count_, 1); + EXPECT_EQ(fake_cuda_allocator().h2d_count_, 1); +} + +TEST_F(TensorPtrDeviceTest, CpuToDeviceFromRawData) { + constexpr std::array data{10.0f, 20.0f, 30.0f, 40.0f}; + auto cpu_tensor = + make_tensor_ptr({2, 2}, const_cast(data.data())); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 2); + EXPECT_EQ(device_tensor->size(0), 2); + EXPECT_EQ(device_tensor->size(1), 2); + EXPECT_EQ(device_tensor->scalar_type(), executorch::aten::ScalarType::Float); + EXPECT_NE(device_tensor->const_data_ptr(), nullptr); + EXPECT_NE( + device_tensor->const_data_ptr(), + static_cast(data.data())); + + EXPECT_EQ( + device_tensor->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA); + + EXPECT_EQ(fake_cuda_allocator().allocate_count_, 1); + EXPECT_EQ(fake_cuda_allocator().h2d_count_, 1); +} + 
+// clone_tensor_ptr_to_cpu relies on TensorImpl device metadata which is only +// available in the non-ATen (ExecuTorch portable) path. +TEST_F(TensorPtrDeviceTest, DeviceToCpuTensor) { + auto cpu_tensor = + make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto result_tensor = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(result_tensor->dim(), 2); + EXPECT_EQ(result_tensor->size(0), 2); + EXPECT_EQ(result_tensor->size(1), 3); + EXPECT_EQ(result_tensor->scalar_type(), executorch::aten::ScalarType::Float); + + auto* result_data = result_tensor->const_data_ptr(); + auto* original_data = cpu_tensor->const_data_ptr(); + for (int i = 0; i < 6; ++i) { + EXPECT_FLOAT_EQ(result_data[i], original_data[i]); + } + + EXPECT_EQ(fake_cuda_allocator().d2h_count_, 1); +} + +TEST_F(TensorPtrDeviceTest, RoundtripCpuDeviceCpu) { + const std::vector original = {1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f}; + auto cpu_tensor = make_tensor_ptr({2, 3}, original); + + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto roundtrip_tensor = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_NE(roundtrip_tensor->const_data_ptr(), cpu_tensor->const_data_ptr()); + EXPECT_NE( + roundtrip_tensor->const_data_ptr(), device_tensor->const_data_ptr()); + + auto* result_data = roundtrip_tensor->const_data_ptr(); + for (size_t i = 0; i < original.size(); ++i) { + EXPECT_FLOAT_EQ(result_data[i], original[i]); + } + + EXPECT_EQ(roundtrip_tensor->dim(), cpu_tensor->dim()); + EXPECT_EQ(roundtrip_tensor->size(0), cpu_tensor->size(0)); + EXPECT_EQ(roundtrip_tensor->size(1), cpu_tensor->size(1)); + EXPECT_EQ(roundtrip_tensor->scalar_type(), cpu_tensor->scalar_type()); +} + +TEST_F(TensorPtrDeviceTest, RoundtripInt32) { + auto cpu_tensor = make_tensor_ptr({4}, std::vector{10, 20, 30, 40}); + + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto 
roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(roundtrip->scalar_type(), executorch::aten::ScalarType::Int); + const std::vector expected = {10, 20, 30, 40}; + auto* data = roundtrip->const_data_ptr(); + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_EQ(data[i], expected[i]); + } +} + +TEST_F(TensorPtrDeviceTest, DeviceIndexPropagation) { + auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); + auto device_tensor = clone_tensor_ptr_to_device( + cpu_tensor, DeviceType::CUDA, /*device_index=*/1); + + EXPECT_EQ(device_tensor->unsafeGetTensorImpl()->device_index(), 1); + + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + EXPECT_FLOAT_EQ(roundtrip->const_data_ptr()[0], 1.0f); + EXPECT_FLOAT_EQ(roundtrip->const_data_ptr()[1], 2.0f); +} + +TEST_F(TensorPtrDeviceTest, DeviceMemoryCleanup) { + { + auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); + auto device_tensor = + clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + EXPECT_EQ(fake_cuda_allocator().allocate_count_, 1); + EXPECT_EQ(fake_cuda_allocator().deallocate_count_, 0); + } + EXPECT_EQ(fake_cuda_allocator().deallocate_count_, 1); +} + +TEST_F(TensorPtrDeviceTest, ScalarTensorRoundtrip) { + auto cpu_tensor = make_tensor_ptr({}, {42.0f}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 0); + EXPECT_EQ(device_tensor->numel(), 1); + + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + EXPECT_EQ(roundtrip->dim(), 0); + EXPECT_EQ(roundtrip->numel(), 1); + EXPECT_FLOAT_EQ(roundtrip->const_data_ptr()[0], 42.0f); +} + +TEST_F(TensorPtrDeviceTest, RawDataRoundtrip) { + constexpr std::array raw_data{100.0f, 200.0f, 300.0f}; + auto cpu_tensor = + make_tensor_ptr({3}, const_cast(raw_data.data())); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(roundtrip->dim(), 1); + EXPECT_EQ(roundtrip->size(0), 
3); + auto* data = roundtrip->const_data_ptr(); + EXPECT_FLOAT_EQ(data[0], 100.0f); + EXPECT_FLOAT_EQ(data[1], 200.0f); + EXPECT_FLOAT_EQ(data[2], 300.0f); +} + +TEST_F(TensorPtrDeviceTest, ErrorCpuTargetDevice) { + auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); + ET_EXPECT_DEATH(clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CPU), ""); +} + +TEST_F(TensorPtrDeviceTest, ErrorNullCpuTensorData) { + auto null_tensor = make_tensor_ptr({2, 2}, nullptr); + ET_EXPECT_DEATH( + clone_tensor_ptr_to_device(null_tensor, DeviceType::CUDA), ""); +} + +TEST_F(TensorPtrDeviceTest, ErrorCpuTensorToCpu) { + auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); + ET_EXPECT_DEATH(clone_tensor_ptr_to_cpu(cpu_tensor), ""); +} + +TEST_F(TensorPtrDeviceTest, MakeTensorPtrVectorToDevice) { + auto cpu_tensor = + make_tensor_ptr({2, 2}, std::vector{1.0f, 2.0f, 3.0f, 4.0f}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 2); + EXPECT_EQ(device_tensor->size(0), 2); + EXPECT_EQ(device_tensor->size(1), 2); + EXPECT_EQ(device_tensor->scalar_type(), executorch::aten::ScalarType::Float); + EXPECT_EQ( + device_tensor->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA); + EXPECT_EQ(fake_cuda_allocator().allocate_count_, 1); + EXPECT_EQ(fake_cuda_allocator().h2d_count_, 1); + + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto* data = roundtrip->const_data_ptr(); + EXPECT_FLOAT_EQ(data[0], 1.0f); + EXPECT_FLOAT_EQ(data[1], 2.0f); + EXPECT_FLOAT_EQ(data[2], 3.0f); + EXPECT_FLOAT_EQ(data[3], 4.0f); +} + +TEST_F(TensorPtrDeviceTest, MakeTensorPtrRawPointerToDevice) { + constexpr std::array raw{5.0f, 6.0f, 7.0f}; + auto cpu_tensor = make_tensor_ptr({3}, const_cast(raw.data())); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 1); + EXPECT_EQ(device_tensor->size(0), 3); + EXPECT_EQ( + device_tensor->unsafeGetTensorImpl()->device_type(), 
DeviceType::CUDA); + EXPECT_NE( + device_tensor->const_data_ptr(), + static_cast(raw.data())); + EXPECT_EQ(fake_cuda_allocator().allocate_count_, 1); + EXPECT_EQ(fake_cuda_allocator().h2d_count_, 1); + + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto* data = roundtrip->const_data_ptr(); + EXPECT_FLOAT_EQ(data[0], 5.0f); + EXPECT_FLOAT_EQ(data[1], 6.0f); + EXPECT_FLOAT_EQ(data[2], 7.0f); +} + +TEST_F(TensorPtrDeviceTest, CloneToCpuVerifiesCpuDeviceMetadata) { + auto cpu_tensor = make_tensor_ptr({3}, {1.0f, 2.0f, 3.0f}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto result = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(result->unsafeGetTensorImpl()->device_type(), DeviceType::CPU); + EXPECT_EQ(result->unsafeGetTensorImpl()->device_index(), 0); +} + +TEST_F(TensorPtrDeviceTest, MultipleClonesFromSameSource) { + auto cpu_tensor = make_tensor_ptr({3}, {1.0f, 2.0f, 3.0f}); + auto device1 = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto device2 = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + + EXPECT_NE(device1->const_data_ptr(), device2->const_data_ptr()); + EXPECT_EQ(fake_cuda_allocator().allocate_count_, 2); + EXPECT_EQ(fake_cuda_allocator().h2d_count_, 2); +} + +TEST_F(TensorPtrDeviceTest, HighDimensionalTensorRoundtrip) { + std::vector data(24); + for (size_t i = 0; i < 24; ++i) { + data[i] = static_cast(i); + } + auto cpu_tensor = make_tensor_ptr({2, 3, 4}, data); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 3); + EXPECT_EQ(device_tensor->size(0), 2); + EXPECT_EQ(device_tensor->size(1), 3); + EXPECT_EQ(device_tensor->size(2), 4); + + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto* result = roundtrip->const_data_ptr(); + for (size_t i = 0; i < 24; ++i) { + EXPECT_FLOAT_EQ(result[i], static_cast(i)); + } +} + +TEST_F(TensorPtrDeviceTest, RoundtripDouble) { + auto cpu_tensor = 
make_tensor_ptr({3}, std::vector{1.1, 2.2, 3.3}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(roundtrip->scalar_type(), executorch::aten::ScalarType::Double); + auto* data = roundtrip->const_data_ptr(); + EXPECT_DOUBLE_EQ(data[0], 1.1); + EXPECT_DOUBLE_EQ(data[1], 2.2); + EXPECT_DOUBLE_EQ(data[2], 3.3); +} + +TEST_F(TensorPtrDeviceTest, RoundtripInt64) { + auto cpu_tensor = make_tensor_ptr({3}, std::vector{100, 200, 300}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(roundtrip->scalar_type(), executorch::aten::ScalarType::Long); + auto* data = roundtrip->const_data_ptr(); + EXPECT_EQ(data[0], 100); + EXPECT_EQ(data[1], 200); + EXPECT_EQ(data[2], 300); +} + +TEST_F(TensorPtrDeviceTest, LargeTensorRoundtrip) { + const size_t n = 10000; + std::vector data(n); + for (size_t i = 0; i < n; ++i) { + data[i] = static_cast(i) * 0.1f; + } + auto cpu_tensor = make_tensor_ptr({static_cast(n)}, data); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + + auto* result = roundtrip->const_data_ptr(); + for (size_t i = 0; i < n; ++i) { + EXPECT_FLOAT_EQ(result[i], data[i]); + } +} + +#endif // USE_ATEN_LIB diff --git a/runtime/core/portable_type/tensor.h b/runtime/core/portable_type/tensor.h index 775bccc1b52..f4ee2aef1f5 100644 --- a/runtime/core/portable_type/tensor.h +++ b/runtime/core/portable_type/tensor.h @@ -115,6 +115,21 @@ class Tensor { return impl_->shape_dynamism(); } + /// Returns the device where tensor data resides. + Device device() const { + return impl_->device(); + } + + /// Returns the type of device where tensor data resides. + DeviceType device_type() const { + return impl_->device_type(); + } + + /// Returns the device index, or 0 if default/unspecified. 
+ DeviceIndex device_index() const { + return impl_->device_index(); + } + /// Returns a pointer of type T to the constant underlying data blob. template inline const T* const_data_ptr() const { diff --git a/runtime/core/portable_type/test/tensor_test.cpp b/runtime/core/portable_type/test/tensor_test.cpp index 714cdc25661..ba14644d71e 100644 --- a/runtime/core/portable_type/test/tensor_test.cpp +++ b/runtime/core/portable_type/test/tensor_test.cpp @@ -13,6 +13,9 @@ #include #include +using executorch::runtime::etensor::Device; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; using executorch::runtime::etensor::ScalarType; using executorch::runtime::etensor::Tensor; using executorch::runtime::etensor::TensorImpl; @@ -78,3 +81,41 @@ TEST_F(TensorTest, ModifyDataOfConstTensor) { EXPECT_EQ(a.scalar_type(), ScalarType::Int); EXPECT_EQ(a.const_data_ptr()[0], 0); } + +TEST_F(TensorTest, DeviceForwardersDefaultCpu) { + TensorImpl::SizesType sizes[1] = {1}; + TensorImpl::DimOrderType dim_order[1] = {0}; + int32_t data[1] = {0}; + // TensorImpl ctor defaults device to CPU/0 when not specified. + auto a_impl = TensorImpl(ScalarType::Int, 1, sizes, data, dim_order); + Tensor a(&a_impl); + + EXPECT_EQ(a.device_type(), DeviceType::CPU); + EXPECT_EQ(a.device_index(), DeviceIndex(0)); + EXPECT_EQ(a.device(), Device(DeviceType::CPU, 0)); +} + +TEST_F(TensorTest, DeviceForwardersNonCpu) { + TensorImpl::SizesType sizes[1] = {1}; + TensorImpl::DimOrderType dim_order[1] = {0}; + int32_t data[1] = {0}; + auto a_impl = TensorImpl( + ScalarType::Int, + 1, + sizes, + data, + dim_order, + /*strides=*/nullptr, + executorch::runtime::TensorShapeDynamism::STATIC, + DeviceType::CUDA, + /*device_index=*/3); + Tensor a(&a_impl); + + // Each forwarder must agree with the underlying TensorImpl. 
+ EXPECT_EQ(a.device_type(), a_impl.device_type()); + EXPECT_EQ(a.device_index(), a_impl.device_index()); + EXPECT_EQ(a.device(), a_impl.device()); + + EXPECT_EQ(a.device_type(), DeviceType::CUDA); + EXPECT_EQ(a.device_index(), DeviceIndex(3)); +}