20 commits
0b42608
Remove unnecessary cuda sync for better perf
Gasoonjia Feb 9, 2026
0014dcb
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 9, 2026
75c0995
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
28a0792
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
3b38ed7
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
1d27edd
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
f4672cd
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
3a29c04
Qualcomm AI Engine Direct - Optimize UT execution time re-submit (#17…
chenweng-quic Feb 10, 2026
a5c5f70
fix to allocate samsung device issue
Jiseong-oh Feb 10, 2026
5cd3504
Add death test matchers to verify assertion messages (#16543)
rascani Feb 10, 2026
b7e063a
Validate dim_order is a permutation in dim_order_to_stride (#17314)
rascani Feb 10, 2026
23de893
[ET-VK] Fix missing memory barrier for first-use writes on aliased te…
Feb 10, 2026
aed50d1
consolidate cuda stream
Gasoonjia Feb 11, 2026
bddcb88
reformat
Gasoonjia Feb 11, 2026
6916192
rename for better clarification
Gasoonjia Feb 11, 2026
7d06ab5
rebase to latest main
Gasoonjia Feb 11, 2026
55cb555
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 11, 2026
8dcbc3e
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 11, 2026
69be4bb
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 12, 2026
7d83c14
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 12, 2026
2 changes: 0 additions & 2 deletions backends/aoti/aoti_delegate_handle.h
@@ -84,8 +84,6 @@ struct AOTIDelegateHandle {
   void* so_handle;
   std::string so_path;
   AOTInductorModelContainerHandle container_handle;
-  void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header
-                     // dependency
   std::string method_name;

   // Function pointers specific to this handle's shared library
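With the cuda_stream member gone from the handle, code that previously read a cached cudaStream_t has to look the stream up at the point of use instead. A minimal sketch of that call-site pattern, assuming the Result-style getCurrentCUDAStream() helper that appears in the storage.h hunk below (where -1 is taken to mean "current device"); the function name and error handling here are illustrative, not the PR's actual code:

#include <cuda_runtime.h>
// Assumed available from the ExecuTorch CUDA backend headers:
// executorch::backends::cuda::getCurrentCUDAStream(), per the hunk below.

// Hypothetical call site: fetch the per-device stream on demand instead of
// reading a cudaStream_t cached on AOTIDelegateHandle.
void enqueue_copy(void* dst, const void* src, size_t nbytes) {
  auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
  if (!stream_result.ok()) {
    return; // real code would propagate an Error instead
  }
  cudaStream_t stream = stream_result.get();
  // Work is queued on the shared per-device stream; no per-handle state.
  cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, stream);
}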
41 changes: 36 additions & 5 deletions backends/aoti/slim/core/storage.h
@@ -127,16 +127,47 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
   /// @param ptr Pointer to device memory to free.
   static void free(void* ptr) {
     // Get the current stream for the current device
+    // Currently all cuda slimtensors should be on the same device same stream,
+    // so we can just use the stream on current device.
+    // TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to
+    // support multiple devices.
     auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
-    if (stream_result.ok()) {
-      ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
-    } else {
-      // Fallback to synchronous free if we can't get the stream
-      ET_CUDA_LOG_WARN(cudaFree(ptr));
-    }
+    ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream");
+    ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
+  }
+
+  /// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously.
+  /// @param dst Destination pointer.
+  /// @param src Source pointer.
+  /// @param nbytes Number of bytes to copy.
+  /// @param dst_device Destination device.
+  /// @param src_device Source device.
+  /// @param stream CUDA stream for async copy.
+  static void memcpy_async(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      const c10::Device& dst_device,
+      const c10::Device& src_device,
+      cudaStream_t stream) {
+    cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
+
+    if (src_device.is_cpu()) {
+      direction = cudaMemcpyHostToDevice;
+    } else if (dst_device.is_cpu()) {
+      direction = cudaMemcpyDeviceToHost;
+    } else {
+      ET_CHECK_MSG(
+          src_device.index() == dst_device.index(),
+          "CUDA memcpy across different device indices not supported: %d != %d",
+          static_cast<int>(src_device.index()),
+          static_cast<int>(dst_device.index()));
+    }
+
+    ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream));
   }

-  /// Copies memory between CPU and CUDA or CUDA and CUDA.
+  /// Copies memory between CPU and CUDA or CUDA and CUDA synchronously.
   /// @param dst Destination pointer.
   /// @param src Source pointer.
   /// @param nbytes Number of bytes to copy.
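For readers unfamiliar with the CUDA copy API, here is a self-contained sketch of the direction-selection logic that memcpy_async implements, written against the plain CUDA runtime API. The src_is_cpu/dst_is_cpu flags stand in for c10::Device::is_cpu(); everything else is standard CUDA. Unlike the delegate code, which leaves synchronization to the caller, this toy version synchronizes the stream itself so the host can safely observe the result:

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  const size_t nbytes = 1024;
  std::vector<char> host(nbytes, 7); // source buffer on the CPU
  void* dev = nullptr;
  cudaStream_t stream;
  cudaMalloc(&dev, nbytes);
  cudaStreamCreate(&stream);

  // Mirror of memcpy_async's dispatch: default to device-to-device, then
  // override based on which side lives on the host.
  bool src_is_cpu = true;  // stands in for src_device.is_cpu()
  bool dst_is_cpu = false; // stands in for dst_device.is_cpu()
  cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
  if (src_is_cpu) {
    direction = cudaMemcpyHostToDevice;
  } else if (dst_is_cpu) {
    direction = cudaMemcpyDeviceToHost;
  }

  // Enqueue the copy; as in memcpy_async, this call does not block the host.
  cudaMemcpyAsync(dev, host.data(), nbytes, direction, stream);
  // The delegate leaves synchronization to the caller; this demo does it here.
  cudaStreamSynchronize(stream);

  cudaStreamDestroy(stream);
  cudaFree(dev);
  printf("async copy of %zu bytes complete\n", nbytes);
  return 0;
}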
3 changes: 3 additions & 0 deletions backends/cuda/runtime/TARGETS
@@ -95,6 +95,9 @@ runtime.cxx_library(
     srcs = [
         "cuda_backend.cpp",
     ],
+    headers = [
+        "cuda_delegate_handle.h",
+    ],
     # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
     link_whole = True,
     supports_python_dlopen = True,