20 commits
0b42608
Remove unnecessary cuda sync for better perf
Gasoonjia Feb 9, 2026
0014dcb
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 9, 2026
75c0995
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
28a0792
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
3b38ed7
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
1d27edd
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
f4672cd
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
3a29c04
Qualcomm AI Engine Direct - Optimize UT execution time re-submit (#17…
chenweng-quic Feb 10, 2026
a5c5f70
fix to allocate samsung device issue
Jiseong-oh Feb 10, 2026
5cd3504
Add death test matchers to verify assertion messages (#16543)
rascani Feb 10, 2026
b7e063a
Validate dim_order is a permutation in dim_order_to_stride (#17314)
rascani Feb 10, 2026
23de893
[ET-VK] Fix missing memory barrier for first-use writes on aliased te…
Feb 10, 2026
aed50d1
consolidate cuda stream
Gasoonjia Feb 11, 2026
bddcb88
reformat
Gasoonjia Feb 11, 2026
6916192
rename for better clarification
Gasoonjia Feb 11, 2026
7d06ab5
rebase to latest main
Gasoonjia Feb 11, 2026
55cb555
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 11, 2026
8dcbc3e
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 11, 2026
69be4bb
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 12, 2026
7d83c14
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 12, 2026
2 changes: 0 additions & 2 deletions backends/aoti/aoti_delegate_handle.h
@@ -84,8 +84,6 @@ struct AOTIDelegateHandle {
   void* so_handle;
   std::string so_path;
   AOTInductorModelContainerHandle container_handle;
-  void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header
-                     // dependency
   std::string method_name;

   // Function pointers specific to this handle's shared library
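With the cuda_stream member gone from the handle, code that previously read a cached cudaStream_t has to look the stream up at the point of use instead. A minimal sketch of that call-site pattern, assuming the Result-style getCurrentCUDAStream() helper that appears in the storage.h hunk below (where -1 is taken to mean "current device"); the function name and error handling here are illustrative, not the PR's actual code:

#include <cuda_runtime.h>
// Assumed available from the ExecuTorch CUDA backend headers:
// executorch::backends::cuda::getCurrentCUDAStream(), per the hunk below.

// Hypothetical call site: fetch the per-device stream on demand instead of
// reading a cudaStream_t cached on AOTIDelegateHandle.
void enqueue_copy(void* dst, const void* src, size_t nbytes) {
  auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
  if (!stream_result.ok()) {
    return; // real code would propagate an Error instead
  }
  cudaStream_t stream = stream_result.get();
  // Work is queued on the shared per-device stream; no per-handle state.
  cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, stream);
}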
41 changes: 36 additions & 5 deletions backends/aoti/slim/core/storage.h
@@ -127,16 +127,47 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
   /// @param ptr Pointer to device memory to free.
   static void free(void* ptr) {
     // Get the current stream for the current device
+    // Currently all cuda slimtensors should be on the same device same stream,
+    // so we can just use the stream on current device.
+    // TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to
+    // support multiple devices.
     auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
-    if (stream_result.ok()) {
-      ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
-    } else {
-      // Fallback to synchronous free if we can't get the stream
-      ET_CUDA_LOG_WARN(cudaFree(ptr));
-    }
+    ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream");
+    ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
+  }
+
+  /// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously.
+  /// @param dst Destination pointer.
+  /// @param src Source pointer.
+  /// @param nbytes Number of bytes to copy.
+  /// @param dst_device Destination device.
+  /// @param src_device Source device.
+  /// @param stream CUDA stream for async copy.
+  static void memcpy_async(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      const c10::Device& dst_device,
+      const c10::Device& src_device,
+      cudaStream_t stream) {
+    cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
+
+    if (src_device.is_cpu()) {
+      direction = cudaMemcpyHostToDevice;
+    } else if (dst_device.is_cpu()) {
+      direction = cudaMemcpyDeviceToHost;
+    } else {
+      ET_CHECK_MSG(
+          src_device.index() == dst_device.index(),
+          "CUDA memcpy across different device indices not supported: %d != %d",
+          static_cast<int>(src_device.index()),
+          static_cast<int>(dst_device.index()));
+    }
+
+    ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream));
   }

-  /// Copies memory between CPU and CUDA or CUDA and CUDA.
+  /// Copies memory between CPU and CUDA or CUDA and CUDA synchronously.
   /// @param dst Destination pointer.
   /// @param src Source pointer.
   /// @param nbytes Number of bytes to copy.
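For readers unfamiliar with the CUDA copy API, here is a self-contained sketch of the direction-selection logic that memcpy_async implements, written against the plain CUDA runtime API. The src_is_cpu/dst_is_cpu flags stand in for c10::Device::is_cpu(); everything else is standard CUDA. Unlike the delegate code, which leaves synchronization to the caller, this toy version synchronizes the stream itself so the host can safely observe the result:

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  const size_t nbytes = 1024;
  std::vector<char> host(nbytes, 7); // source buffer on the CPU
  void* dev = nullptr;
  cudaStream_t stream;
  cudaMalloc(&dev, nbytes);
  cudaStreamCreate(&stream);

  // Mirror of memcpy_async's dispatch: default to device-to-device, then
  // override based on which side lives on the host.
  bool src_is_cpu = true;  // stands in for src_device.is_cpu()
  bool dst_is_cpu = false; // stands in for dst_device.is_cpu()
  cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
  if (src_is_cpu) {
    direction = cudaMemcpyHostToDevice;
  } else if (dst_is_cpu) {
    direction = cudaMemcpyDeviceToHost;
  }

  // Enqueue the copy; as in memcpy_async, this call does not block the host.
  cudaMemcpyAsync(dev, host.data(), nbytes, direction, stream);
  // The delegate leaves synchronization to the caller; this demo does it here.
  cudaStreamSynchronize(stream);

  cudaStreamDestroy(stream);
  cudaFree(dev);
  printf("async copy of %zu bytes complete\n", nbytes);
  return 0;
}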
3 changes: 3 additions & 0 deletions backends/cuda/runtime/TARGETS
@@ -95,6 +95,9 @@ runtime.cxx_library(
     srcs = [
         "cuda_backend.cpp",
     ],
+    headers = [
+        "cuda_delegate_handle.h",
+    ],
     # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
     link_whole = True,
     supports_python_dlopen = True,