From 0b4260894570766550ef6786f9f0f2caf6f6c7eb Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 9 Feb 2026 12:02:51 -0800 Subject: [PATCH 01/16] Remove unnecessary cuda sync for better perf Right now we always do cudasync before existing cudabackend.execution(). However we only need that when copying data from gpu to cpu; any actions happen inside a same stream do not need explicit sync. Differential Revision: [D92193164](https://our.internmc.facebook.com/intern/diff/D92193164/) [ghstack-poisoned] --- backends/aoti/slim/core/storage.h | 12 ++++++------ backends/cuda/runtime/cuda_backend.cpp | 26 ++++++++++++++++---------- extension/asr/runner/runner.cpp | 11 ++++++++++- extension/asr/runner/runner.h | 2 ++ 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/backends/aoti/slim/core/storage.h b/backends/aoti/slim/core/storage.h index bd227dbb43a..cc3eb0e5c8b 100644 --- a/backends/aoti/slim/core/storage.h +++ b/backends/aoti/slim/core/storage.h @@ -127,13 +127,13 @@ struct DeviceTraits { /// @param ptr Pointer to device memory to free. static void free(void* ptr) { // Get the current stream for the current device + // Currently all cuda slimtensors should be on the same device same stream, + // so we can just use the stream on current device. + // TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to + // support multiple devices. auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1); - if (stream_result.ok()) { - ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get())); - } else { - // Fallback to synchronous free if we can't get the stream - ET_CUDA_LOG_WARN(cudaFree(ptr)); - } + ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream"); + ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get())); } /// Copies memory between CPU and CUDA or CUDA and CUDA. diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 8e1a871ad4c..3fe63fc836f 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -423,17 +423,17 @@ class ET_EXPERIMENTAL CudaBackend final const bool copy_outputs = !should_skip_copy_for_method(handle->method_name); - // Synchronize CUDA stream to ensure kernel execution is complete - // before accessing output data (either for copy or skip-copy path) - cudaStream_t cuda_stream = static_cast(handle->cuda_stream); - cudaError_t sync_err = cudaStreamSynchronize(cuda_stream); - ET_CHECK_OR_RETURN_ERROR( - sync_err == cudaSuccess, - Internal, - "cudaStreamSynchronize failed: %s", - cudaGetErrorString(sync_err)); - if (copy_outputs) { + // Synchronize CUDA stream before D2H copy. This is required because + // cudaMemcpy is not stream-ordered and needs the kernel to complete. + cudaStream_t cuda_stream = static_cast(handle->cuda_stream); + cudaError_t sync_err = cudaStreamSynchronize(cuda_stream); + ET_CHECK_OR_RETURN_ERROR( + sync_err == cudaSuccess, + Internal, + "cudaStreamSynchronize failed: %s", + cudaGetErrorString(sync_err)); + // Deep copy GPU SlimTensor results back to CPU ETensors for (size_t i = 0; i < n_outputs; i++) { auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor()); @@ -448,6 +448,12 @@ class ET_EXPERIMENTAL CudaBackend final // Skip-copy optimization: point ETensor directly to GPU data. // The caller is responsible for handling GPU data directly. // + // No cudaStreamSynchronize needed here because: + // 1. All operations (kernel, allocations, frees) are on the same stream + // 2. 
cudaFreeAsync is stream-ordered, so CUDA guarantees the kernel + // completes before any memory is freed + // 3. The next execution's operations will also be ordered on this stream + // // Lifetime management: We cache the newly created GPU tensors and delete // the previous round's tensors, since they are no longer needed. { diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index 21ff276bb82..58981f06862 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -46,6 +46,8 @@ AsrRunner::AsrRunner( } } +AsrRunner::~AsrRunner() = default; + bool AsrRunner::is_loaded() const { return module_ && encoder_method_loaded_ && decoder_method_loaded_ && (!sampler_method_present_ || sampler_method_loaded_) && tokenizer_ && @@ -121,13 +123,20 @@ Error AsrRunner::load() { #ifdef CUDA_AVAILABLE // Skip copying outputs to CPU. When a sampler exists, keep both encoder and // decoder outputs on device and pass decoder logits directly into sampler. - executorch::runtime::BackendOptions<1> backend_options; + // The backend will automatically create a shared CUDA stream for all methods + // when skip-copy is enabled to ensure proper ordering. + executorch::runtime::BackendOptions<2> backend_options; std::string skip_methods = kEncoderMethodName; if (sampler_method_present_) { skip_methods.append(",").append(kDecoderMethodName); } ET_CHECK_OK_OR_RETURN_ERROR(backend_options.set_option( "skip_copy_output_to_cpu_for_method", skip_methods.c_str())); + // Enable shared CUDA stream for all methods when skip-copy is used. + // This ensures proper ordering between encoder/decoder/sampler outputs. + ET_CHECK_OK_OR_RETURN_ERROR( + backend_options.set_option("use_shared_cuda_stream", true)); + const auto opt_err = executorch::runtime::set_option("CudaBackend", backend_options.view()); if (opt_err != ::executorch::runtime::Error::Ok) { diff --git a/extension/asr/runner/runner.h b/extension/asr/runner/runner.h index 077fdb69fe4..d8bb8f5c279 100644 --- a/extension/asr/runner/runner.h +++ b/extension/asr/runner/runner.h @@ -64,6 +64,8 @@ class ET_EXPERIMENTAL AsrRunner { std::optional data_path, const std::string& tokenizer_path); + ~AsrRunner(); + /** * Returns true when the module and tokenizer are ready for inference. */ From 0014dcb2a269a43a481cc509f91da3bbc7e961cf Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 9 Feb 2026 14:18:33 -0800 Subject: [PATCH 02/16] Update on "Remove unnecessary cuda sync for better perf" Right now we always do cudasync before existing cudabackend.execution(). However we only need that when copying data from gpu to cpu; any actions happen inside a same stream do not need explicit sync. Differential Revision: [D92193164](https://our.internmc.facebook.com/intern/diff/D92193164/) [ghstack-poisoned] --- backends/aoti/aoti_delegate_handle.h | 4 +- backends/cuda/runtime/cuda_backend.cpp | 98 ++++++++++++++++++++++++-- 2 files changed, 93 insertions(+), 9 deletions(-) diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h index b14e02da9ef..862bcb3bf02 100644 --- a/backends/aoti/aoti_delegate_handle.h +++ b/backends/aoti/aoti_delegate_handle.h @@ -84,8 +84,8 @@ struct AOTIDelegateHandle { void* so_handle; std::string so_path; AOTInductorModelContainerHandle container_handle; - void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header - // dependency + void* cuda_stream; // Per-handle CUDA stream. If nullptr, use backend's shared + // stream instead (for skip-copy optimization). 
std::string method_name; // Function pointers specific to this handle's shared library diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 3fe63fc836f..297994ccbe1 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -77,6 +77,7 @@ using slim::c10::DeviceType; namespace { constexpr char kSkipCopyOutputToCpuForMethod[] = "skip_copy_output_to_cpu_for_method"; +constexpr char kUseSharedCudaStream[] = "use_shared_cuda_stream"; } // anonymous namespace class ET_EXPERIMENTAL CudaBackend final @@ -143,6 +144,36 @@ class ET_EXPERIMENTAL CudaBackend final return method_in_csv(method_name, skip_copy_method_); } + // Create the shared CUDA stream. Called when use_shared_cuda_stream option + // is set to true. The presence of shared_cuda_stream_ indicates shared mode. + void create_shared_cuda_stream() { + std::lock_guard guard(cuda_stream_mutex_); + if (shared_cuda_stream_ != nullptr) { + return; // Already created + } + cudaError_t err = cudaStreamCreate(&shared_cuda_stream_); + if (err != cudaSuccess) { + ET_LOG( + Error, + "Failed to create shared CUDA stream: %s", + cudaGetErrorString(err)); + return; + } + ET_LOG(Info, "Created shared CUDA stream: %p", shared_cuda_stream_); + } + + // Get the shared CUDA stream. Returns nullptr if not in shared mode. + cudaStream_t get_shared_cuda_stream() const { + std::lock_guard guard(cuda_stream_mutex_); + return shared_cuda_stream_; + } + + // Check if we're using shared CUDA stream mode. + bool is_using_shared_cuda_stream() const { + std::lock_guard guard(cuda_stream_mutex_); + return shared_cuda_stream_ != nullptr; + } + Error load_function_pointers_into_handle( void* so_handle, AOTIDelegateHandle* handle) const { @@ -181,6 +212,19 @@ class ET_EXPERIMENTAL CudaBackend final } public: + // Destructor: clean up the shared CUDA stream if it was created. + ~CudaBackend() { + if (shared_cuda_stream_ != nullptr) { + cudaError_t err = cudaStreamDestroy(shared_cuda_stream_); + if (err != cudaSuccess) { + ET_LOG( + Error, + "Failed to destroy shared CUDA stream: %s", + cudaGetErrorString(err)); + } + } + } + bool is_available() const override { return 1; } @@ -201,6 +245,15 @@ class ET_EXPERIMENTAL CudaBackend final kSkipCopyOutputToCpuForMethod); return Error::InvalidArgument; } + } else if (std::strcmp(option.key, kUseSharedCudaStream) == 0) { + if (auto* val = std::get_if(&option.value)) { + if (*val) { + create_shared_cuda_stream(); + } + } else { + ET_LOG(Error, "Option %s must be a boolean.", kUseSharedCudaStream); + return Error::InvalidArgument; + } } } return Error::Ok; @@ -313,10 +366,27 @@ class ET_EXPERIMENTAL CudaBackend final handle->container_handle, static_cast(weights_blob))); buffer_res->Free(); } - // Create a CUDA stream for asynchronous execution - cudaStream_t cuda_stream; - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream)); - handle->cuda_stream = static_cast(cuda_stream); + + // Use shared CUDA stream if enabled via options, otherwise create one. + // A shared stream ensures proper ordering across multiple methods + // (e.g., encoder, decoder, sampler) when using skip-copy optimization. + if (is_using_shared_cuda_stream()) { + // Shared stream mode: set handle's stream to nullptr. + // The stream will be retrieved from backend in execute(). + handle->cuda_stream = nullptr; + ET_LOG( + Info, "Using shared CUDA stream for method %s", method_name.c_str()); + } else { + // Per-handle stream mode: each handle owns its own stream. 
+ cudaStream_t cuda_stream; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream)); + handle->cuda_stream = static_cast(cuda_stream); + ET_LOG( + Info, + "Created new CUDA stream %p for method %s", + handle->cuda_stream, + method_name.c_str()); + } return (DelegateHandle*)handle; // Return the handle post-processing } @@ -406,13 +476,19 @@ class ET_EXPERIMENTAL CudaBackend final // expects ETensor* as input/output. We avoid changing its signature since // it's shared with the Metal backend. Instead, we reinterpret_cast // SlimTensor* to Tensor* + // + // Get the CUDA stream: use handle's stream if set, otherwise get from + // backend's shared stream. + cudaStream_t cuda_stream = handle->cuda_stream != nullptr + ? static_cast(handle->cuda_stream) + : get_shared_cuda_stream(); AOTIRuntimeError error = handle->run( handle->container_handle, reinterpret_cast(gpu_inputs.data()), n_inputs, reinterpret_cast(gpu_outputs.data()), n_outputs, - handle->cuda_stream, + static_cast(cuda_stream), nullptr); ET_CHECK_OR_RETURN_ERROR( @@ -426,7 +502,6 @@ class ET_EXPERIMENTAL CudaBackend final if (copy_outputs) { // Synchronize CUDA stream before D2H copy. This is required because // cudaMemcpy is not stream-ordered and needs the kernel to complete. - cudaStream_t cuda_stream = static_cast(handle->cuda_stream); cudaError_t sync_err = cudaStreamSynchronize(cuda_stream); ET_CHECK_OR_RETURN_ERROR( sync_err == cudaSuccess, @@ -501,7 +576,9 @@ class ET_EXPERIMENTAL CudaBackend final } } - // Destroy the CUDA stream if it exists + // Destroy the CUDA stream only if this handle owns it (non-null). + // When cuda_stream is nullptr, the handle uses the backend's shared + // stream which is managed by the backend singleton via shared_ptr. if (handle->cuda_stream != nullptr) { cudaStream_t cuda_stream = static_cast(handle->cuda_stream); cudaError_t stream_err = cudaStreamDestroy(cuda_stream); @@ -547,6 +624,13 @@ class ET_EXPERIMENTAL CudaBackend final mutable std::mutex skip_copy_method_mutex_; std::string skip_copy_method_; + // Shared CUDA stream for all methods. When set (non-null), all methods use + // the same stream to ensure proper ordering (critical for skip-copy + // optimization). Created when use_shared_cuda_stream option is set to true. + // Cleaned up in destructor. + mutable std::mutex cuda_stream_mutex_; + cudaStream_t shared_cuda_stream_ = nullptr; + // Cached output tensors for skip-copy optimization. // When skip-copy is enabled, output SlimTensors are cached here to keep // the underlying GPU memory alive while the caller processes the results. From 75c099544e0e7d653b19ba78a20e753bdb59b63e Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 9 Feb 2026 21:35:30 -0800 Subject: [PATCH 03/16] Update base for Update on "Remove unnecessary cuda sync for better perf" Right now we always do cudasync before existing cudabackend.execution(). However we only need that when copying data from gpu to cpu; any actions happen inside a same stream do not need explicit sync. 
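
For reviewers, a minimal standalone sketch of the stream-ordering rule this change relies on (illustrative only, not part of the patch; the kernel launch is hypothetical and error handling is omitted): work enqueued on a single CUDA stream executes in submission order, so stream-ordered calls such as cudaFreeAsync need no explicit sync, while reading results on the host after a device-to-host copy still requires draining the stream first.

```cpp
// Sketch only: assumes a valid CUDA context; "my_kernel" is a placeholder.
#include <cuda_runtime.h>

void run_once(float* host_out, size_t bytes, cudaStream_t stream) {
  float* dev = nullptr;
  cudaMallocAsync(reinterpret_cast<void**>(&dev), bytes, stream);

  // my_kernel<<<grid, block, 0, stream>>>(dev);  // hypothetical kernel launch

  // The host needs the kernel's result, so the stream must be drained
  // before host_out is read.
  cudaMemcpyAsync(host_out, dev, bytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);

  // Stream-ordered free: guaranteed to run after the work enqueued above on
  // the same stream, so no additional synchronization is required.
  cudaFreeAsync(dev, stream);
}
```
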
Differential Revision: [D92193164](https://our.internmc.facebook.com/intern/diff/D92193164/) [ghstack-poisoned] --- .ci/scripts/test_model_e2e.sh | 2 +- examples/models/moshi/mimi/install_requirements.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index 711aff15111..5ec9e313326 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -167,7 +167,7 @@ if [ "$AUDIO_URL" != "" ]; then elif [[ "$MODEL_NAME" == *whisper* ]]; then conda install -y -c conda-forge "ffmpeg<8" pip install datasets soundfile - pip install torchcodec==0.10.0.dev20251211 --extra-index-url https://download.pytorch.org/whl/nightly/cpu + pip install torchcodec==0.11.0.dev20260209 --extra-index-url https://download.pytorch.org/whl/nightly/cpu python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])" fi diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh index 36dbb4316f5..1a472c8f8c1 100755 --- a/examples/models/moshi/mimi/install_requirements.sh +++ b/examples/models/moshi/mimi/install_requirements.sh @@ -8,7 +8,7 @@ set -x sudo apt install ffmpeg -y -pip install torchcodec==0.10.0.dev20251211 --extra-index-url https://download.pytorch.org/whl/nightly/cpu +pip install torchcodec==0.11.0.dev20260209 --extra-index-url https://download.pytorch.org/whl/nightly/cpu pip install moshi==0.2.11 pip install bitsandbytes soundfile einops # Run llama2/install requirements for torchao deps From 28a07921cc39b79801eabe13bb8057200c149e75 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 9 Feb 2026 22:15:10 -0800 Subject: [PATCH 04/16] Update base for Update on "Remove unnecessary cuda sync for better perf" Right now we always do cudasync before existing cudabackend.execution(). However we only need that when copying data from gpu to cpu; any actions happen inside a same stream do not need explicit sync. Differential Revision: [D92193164](https://our.internmc.facebook.com/intern/diff/D92193164/) [ghstack-poisoned] --- torch_pin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_pin.py b/torch_pin.py index 62a2572fd78..3be8cec87c6 100644 --- a/torch_pin.py +++ b/torch_pin.py @@ -1,2 +1,2 @@ TORCH_VERSION = "2.11.0" -NIGHTLY_VERSION = "dev20251222" +NIGHTLY_VERSION = "dev20260209" From 3b38ed7b330a7de149964477b4ac20b4e104f233 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 9 Feb 2026 22:41:24 -0800 Subject: [PATCH 05/16] Update base for Update on "Remove unnecessary cuda sync for better perf" Right now we always do cudasync before existing cudabackend.execution(). However we only need that when copying data from gpu to cpu; any actions happen inside a same stream do not need explicit sync. 
Differential Revision: [D92193164](https://our.internmc.facebook.com/intern/diff/D92193164/) [ghstack-poisoned] --- install_requirements.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install_requirements.py b/install_requirements.py index 8559c28fbf2..56ba40ff113 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -119,12 +119,12 @@ def install_optional_example_requirements(use_pytorch_nightly): print("Installing torch domain libraries") DOMAIN_LIBRARIES = [ ( - f"torchvision==0.25.0.{NIGHTLY_VERSION}" + f"torchvision==0.26.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchvision" ), ( - f"torchaudio==2.10.0.{NIGHTLY_VERSION}" + f"torchaudio==2.11.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchaudio" ), From 1d27edd1211bb55c97c318a888d7b65cc1344134 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 10 Feb 2026 00:08:02 -0800 Subject: [PATCH 06/16] Update base for Update on "Remove unnecessary cuda sync for better perf" Right now we always do cudasync before existing cudabackend.execution(). However we only need that when copying data from gpu to cpu; any actions happen inside a same stream do not need explicit sync. Differential Revision: [D92193164](https://our.internmc.facebook.com/intern/diff/D92193164/) [ghstack-poisoned] --- .ci/scripts/test_model_e2e.sh | 2 +- examples/models/moshi/mimi/install_requirements.sh | 2 +- install_requirements.py | 4 ++-- torch_pin.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index 5ec9e313326..15c44d0ef1e 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -167,7 +167,7 @@ if [ "$AUDIO_URL" != "" ]; then elif [[ "$MODEL_NAME" == *whisper* ]]; then conda install -y -c conda-forge "ffmpeg<8" pip install datasets soundfile - pip install torchcodec==0.11.0.dev20260209 --extra-index-url https://download.pytorch.org/whl/nightly/cpu + pip install torchcodec==0.10.0.dev20251222 --extra-index-url https://download.pytorch.org/whl/nightly/cpu python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])" fi diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh index 1a472c8f8c1..993aa65e137 100755 --- a/examples/models/moshi/mimi/install_requirements.sh +++ b/examples/models/moshi/mimi/install_requirements.sh @@ -8,7 +8,7 @@ set -x sudo apt install ffmpeg -y -pip install torchcodec==0.11.0.dev20260209 --extra-index-url https://download.pytorch.org/whl/nightly/cpu +pip install torchcodec==0.10.0.dev20251222 --extra-index-url https://download.pytorch.org/whl/nightly/cpu pip install moshi==0.2.11 pip install bitsandbytes soundfile einops # Run llama2/install requirements for torchao deps diff --git a/install_requirements.py b/install_requirements.py index 56ba40ff113..8559c28fbf2 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -119,12 +119,12 @@ def install_optional_example_requirements(use_pytorch_nightly): print("Installing torch domain libraries") DOMAIN_LIBRARIES = [ ( - f"torchvision==0.26.0.{NIGHTLY_VERSION}" + f"torchvision==0.25.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchvision" ), ( - f"torchaudio==2.11.0.{NIGHTLY_VERSION}" + f"torchaudio==2.10.0.{NIGHTLY_VERSION}" if 
use_pytorch_nightly else "torchaudio" ), diff --git a/torch_pin.py b/torch_pin.py index 3be8cec87c6..62a2572fd78 100644 --- a/torch_pin.py +++ b/torch_pin.py @@ -1,2 +1,2 @@ TORCH_VERSION = "2.11.0" -NIGHTLY_VERSION = "dev20260209" +NIGHTLY_VERSION = "dev20251222" From f4672cd3e974a686a72a4804ea806439828363bd Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 10 Feb 2026 09:12:37 -0800 Subject: [PATCH 07/16] Update base for Update on "Remove unnecessary cuda sync for better perf" Right now we always do cudasync before existing cudabackend.execution(). However we only need that when copying data from gpu to cpu; any actions happen inside a same stream do not need explicit sync. Differential Revision: [D92193164](https://our.internmc.facebook.com/intern/diff/D92193164/) [ghstack-poisoned] From 3a29c041137d50baa8016cee1e5ee1c5a74c6eda Mon Sep 17 00:00:00 2001 From: qti-chenweng <168707118+chenweng-quic@users.noreply.github.com> Date: Tue, 10 Feb 2026 09:38:49 -0800 Subject: [PATCH 08/16] Qualcomm AI Engine Direct - Optimize UT execution time re-submit (#17086) ### Summary - Currently all tests will push same libs, which is redundant. With this PR it only push once, and reduce execution time from two to three times for operator's tests. ### Test plan ``` python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNFloatingPointOperator --device --host --model SM8650 --build_folder build-android --executorch_root . --artifact all_artifact ``` without optimization image with optimization image cc @cccclai @cbilgin --- backends/qualcomm/tests/test_qnn_delegate.py | 9 ++- backends/qualcomm/tests/utils.py | 20 ++++++ examples/qualcomm/utils.py | 70 +++++++++++--------- 3 files changed, 67 insertions(+), 32 deletions(-) diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 53bfd9b4193..97cf77a31af 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -8256,8 +8256,15 @@ def test_cli(self): self.build_folder, "--input_list", f"{tmp_dir}/input_list", + "--model", + self.model, + "--host", + self.host, + "--target", + self.target, + "--device", + self.device, ] - self.add_default_cmds(cmds) subprocess.run(cmds, stdout=subprocess.DEVNULL) self.assertTrue(os.path.isfile(f"{tmp_dir}/e_out/Result_0/output_0.pt")) diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index 7f57bbe8519..ad4843c1c3e 100644 --- a/backends/qualcomm/tests/utils.py +++ b/backends/qualcomm/tests/utils.py @@ -186,6 +186,25 @@ class TestQNN(unittest.TestCase): inference_speed_output_path = "outputs/inference_speed.txt" static_llm_eval_method = "" + @classmethod + def setUpClass(cls): + if not cls.enable_x86_64 and not cls.compile_only: + # init device once + adb = SimpleADB( + qnn_sdk=os.getenv("QNN_SDK_ROOT"), + build_path=cls.build_folder, + pte_path=[], + workspace="/data/local/tmp/qnn_executorch_test", + device_id=cls.device, + host_id=cls.host, + soc_model=cls.model, + error_only=cls.error_only, + target=cls.target, + ) + adb.push( + init_env=True, + ) + def _assert_outputs_equal(self, model_output, ref_output): self.assertTrue(len(ref_output) == len(model_output)) for i in range(len(ref_output)): @@ -490,6 +509,7 @@ def validate_intermediate_tensor(): adb.push( inputs=[processed_inputs], files=op_package_paths, + init_env=False, ) adb.extra_cmds += extra_cmds if save_inference_speed: diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 2e738e77c86..50091362256 
100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -98,7 +98,10 @@ def __init__( self.workspace = workspace self.device_id = device_id self.host_id = host_id - self.working_dir = Path(self.pte_path[0]).parent.absolute() + if len(self.pte_path) > 0: + self.working_dir = Path(self.pte_path[0]).parent.absolute() + else: + self.working_dir = Path.cwd() self.input_list_filename = "input_list.txt" self.etdump_path = f"{self.workspace}/etdump.etdp" self.dump_intermediate_outputs = dump_intermediate_outputs @@ -132,39 +135,42 @@ def _adb(self, cmd, output_callback: Optional[Callable[[str], None]] = None): ) def push(self, inputs=None, input_list=None, files=None, init_env=True): - artifacts = [] + artifacts = [ + *self.pte_path, + ] if init_env: self._adb(["shell", f"rm -rf {self.workspace}"]) self._adb(["shell", f"mkdir -p {self.workspace}"]) - # necessary artifacts - artifacts = { - QnnExecuTorchBackendType.kHtpBackend: [ - f"{self.qnn_sdk}/lib/{self.target}/libQnnHtp.so", - ( - f"{self.qnn_sdk}/lib/hexagon-v{self.htp_arch}/" - f"unsigned/libQnnHtpV{self.htp_arch}Skel.so" - ), - ( - f"{self.qnn_sdk}/lib/{self.target}/" - f"libQnnHtpV{self.htp_arch}Stub.so" - ), - f"{self.qnn_sdk}/lib/{self.target}/libQnnHtpPrepare.so", - ], - QnnExecuTorchBackendType.kGpuBackend: [ - f"{self.qnn_sdk}/lib/{self.target}/libQnnGpu.so", - ], - }[self.backend] - - artifacts.extend( - [ - *self.pte_path, - f"{self.qnn_sdk}/lib/{self.target}/libQnnSystem.so", - f"{self.build_path}/{self.runner}", - f"{self.build_path}/backends/qualcomm/libqnn_executorch_backend.so", - f"{self.qnn_sdk}/lib/{self.target}/libQnnModelDlc.so", - ] - ) + # necessary artifacts + artifacts.extend( + { + QnnExecuTorchBackendType.kHtpBackend: [ + f"{self.qnn_sdk}/lib/{self.target}/libQnnHtp.so", + ( + f"{self.qnn_sdk}/lib/hexagon-v{self.htp_arch}/" + f"unsigned/libQnnHtpV{self.htp_arch}Skel.so" + ), + ( + f"{self.qnn_sdk}/lib/{self.target}/" + f"libQnnHtpV{self.htp_arch}Stub.so" + ), + f"{self.qnn_sdk}/lib/{self.target}/libQnnHtpPrepare.so", + ], + QnnExecuTorchBackendType.kGpuBackend: [ + f"{self.qnn_sdk}/lib/{self.target}/libQnnGpu.so", + ], + }[self.backend] + ) + + artifacts.extend( + [ + f"{self.qnn_sdk}/lib/{self.target}/libQnnSystem.so", + f"{self.build_path}/{self.runner}", + f"{self.build_path}/backends/qualcomm/libqnn_executorch_backend.so", + f"{self.qnn_sdk}/lib/{self.target}/libQnnModelDlc.so", + ] + ) with tempfile.TemporaryDirectory() as tmp_dir: input_list_file, input_files = generate_inputs( tmp_dir, self.input_list_filename, inputs @@ -205,6 +211,7 @@ def execute( method_index=0, output_callback: Optional[Callable[[str], None]] = None, ): + self._adb(["shell", f"rm -rf {self.output_folder}"]) self._adb(["shell", f"mkdir -p {self.output_folder}"]) # run the delegation if custom_runner_cmd is None: @@ -476,7 +483,8 @@ def build_executorch_binary( else: quantizer = custom_quantizer or make_quantizer(quant_dtype=quant_dtype) # ptq calibration - annotated_model = ptq_calibrate(captured_model, quantizer, dataset) + with torch.no_grad(): + annotated_model = ptq_calibrate(captured_model, quantizer, dataset) quantized_model = convert_pt2e(annotated_model) edge_prog_mgr = to_edge_transform_and_lower_to_qnn( From a5c5f70ee1ee1dfe44a020f14af359c6edf3ed25 Mon Sep 17 00:00:00 2001 From: "jiseong.oh" Date: Tue, 10 Feb 2026 10:39:04 +0000 Subject: [PATCH 09/16] fix to allocate samsung device issue - Clean up operation is required to create an available device - CI is complete, but device is not aware of the situation 
so it should be done by themselves Signed-off-by: jiseong.oh --- .ci/scripts/cleanup-samsung-linux-deps.sh | 23 ++++++ .ci/scripts/setup-samsung-linux-deps.sh | 95 +++++++++++------------ .github/workflows/pull.yml | 11 +-- 3 files changed, 74 insertions(+), 55 deletions(-) create mode 100644 .ci/scripts/cleanup-samsung-linux-deps.sh diff --git a/.ci/scripts/cleanup-samsung-linux-deps.sh b/.ci/scripts/cleanup-samsung-linux-deps.sh new file mode 100644 index 00000000000..b889c1de67b --- /dev/null +++ b/.ci/scripts/cleanup-samsung-linux-deps.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright (c) Samsung Electronics Co. LTD +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set +ex + +if [[ "${DEVICE_ACQUIRED:-0}" != "1" ]]; then + exit 0 +fi + +if ! command -v devicefarm-cli >/dev/null 2>&1; then + echo "[WARN] Skip device disconnect (devicefarm-cli not installed)." >&2 + exit 0 +fi + +echo "[INFO] Disconnecting device (-d)..." +devicefarm-cli -d || echo "::warning::Device disconnect failed (ignored)" + +set -ex diff --git a/.ci/scripts/setup-samsung-linux-deps.sh b/.ci/scripts/setup-samsung-linux-deps.sh index 489eaf855c6..9aa9c4380a5 100644 --- a/.ci/scripts/setup-samsung-linux-deps.sh +++ b/.ci/scripts/setup-samsung-linux-deps.sh @@ -14,41 +14,40 @@ if [[ -z "${API_KEY}" ]]; then exit 1 fi -OS_NAME="Ubuntu 22.04" -LITECORE_BASE="https://soc-developer.semiconductor.samsung.com/api/v1/resource/ai-litecore/download" -DEVICEFARM_BASE="https://soc-developer.semiconductor.samsung.com/api/v1/resource/remotelab/download" - -parse_url() { - local json="$1" - if command -v jq >/dev/null 2>&1; then - jq -r '.data // empty' <<<"$json" - else - sed -n 's/.*"data":[[:space:]]*"\([^"]*\)".*/\1/p' <<<"$json" - fi -} +export DEVICE_CONNECT_ENABLED=1 + +while [[ $# -gt 0 ]]; do + case "$1" in + --skip-device-connect) + export DEVICE_CONNECT_ENABLED=0 + shift + ;; + *) + # Unknown option + shift + ;; + esac +done + +LITECORE_VERSION="v1.0" +LITECORE_FILE_NAME="ai-litecore-ubuntu2204-${LITECORE_VERSION}.tar.gz" +DEVICEFARM_CLI_VERSION="beta-v1.1.0" +DEVICEFARM_FILE_NAME="devicefarmcli-${DEVICEFARM_CLI_VERSION}.zip" + +LITECORE_URL="https://soc-developer.semiconductor.samsung.com/api/v1/resource/download-file/${LITECORE_FILE_NAME}" +DEVICEFARM_URL="https://soc-developer.semiconductor.samsung.com/api/v1/resource/download-file/${DEVICEFARM_FILE_NAME}" download_and_extract() { - local base_url="$1" - local version="$2" - local out_dir="$3" - local out_file="$4" + local download_url="$1" + local out_dir="$2" + local out_file="$3" - local resp - resp=$(curl -fsSL -G \ + echo "Downloading from ${download_url}..." + curl -fsSL --retry 3 \ -H "apikey: ${API_KEY}" \ - --data-urlencode "version=${version}" \ - --data-urlencode "os=${OS_NAME}" \ - "${base_url}") - - local download_url - download_url=$(parse_url "$resp") - if [[ -z "${download_url}" ]]; then - echo "ERROR: It failed to download from ${base_url} ." - echo "Response: $resp" >&2 - exit 1 - fi + -o "${out_file}" \ + "${download_url}" - curl -fsSL -L --retry 3 -o "${out_file}" "${download_url}" echo "Download completed: ${out_file}" mkdir -p "${out_dir}" @@ -60,7 +59,7 @@ download_and_extract() { zip) echo "Extracting ZIP..." 
- unzip -q -d "${out_dir}" "${out_file}" + unzip -qo -d "${out_dir}" "${out_file}" ;; *) @@ -71,13 +70,12 @@ download_and_extract() { } download_ai_lite_core() { - local litecore_version="${1:-1.0}" - local litecore_out="/tmp/exynos-ai-litecore-v${litecore_version}.tar.gz" + local litecore_version="${1:-${LITECORE_VERSION}}" + local litecore_out="/tmp/${LITECORE_FILE_NAME}" local litecore_dir="/tmp/exynos_ai_lite_core" download_and_extract \ - "${LITECORE_BASE}" \ - "${litecore_version}" \ + "${LITECORE_URL}" \ "${litecore_dir}" \ "${litecore_out}" @@ -86,13 +84,12 @@ download_ai_lite_core() { } install_devicefarm_cli() { - local cli_version="${1:-beta-1.0.9}" - local cli_out="/tmp/devicefarm-cli-v${cli_version}.zip" + local cli_version="${1:-${DEVICEFARM_CLI_VERSION}}" + local cli_out="/tmp/${DEVICEFARM_FILE_NAME}" local cli_dir="/tmp/devicefarm_cli" download_and_extract \ - "${DEVICEFARM_BASE}" \ - "${cli_version}" \ + "${DEVICEFARM_URL}" \ "${cli_dir}" \ "${cli_out}" @@ -100,8 +97,8 @@ install_devicefarm_cli() { chmod +x "${cli_dir}/devicefarm-cli" } -Enqueue_device_request() { - export DEVICE_RESERVED=0 +acquire_device() { + export DEVICE_ACQUIRED=0 if ! command -v devicefarm-cli >/dev/null 2>&1; then echo "[WARN] devicefarm-cli is not installed." >&2 return 1 @@ -138,7 +135,7 @@ Enqueue_device_request() { echo "$out" # Execute test command devicefarm-cli -E "ls /" || true - export DEVICE_RESERVED=1 + export DEVICE_ACQUIRED=1 echo "[INFO] Device successfully assigned and connected." return 0 ;; @@ -173,10 +170,12 @@ install_enn_backend() { export PYTHONPATH="${PYTHONPATH:-}:${EXECUTORCH_ROOT}/.." } -litecore_ver="1.0" -devicefarm_ver="beta-1.0.9" - -download_ai_lite_core ${litecore_ver} -install_devicefarm_cli "${devicefarm_ver}" +download_ai_lite_core ${LITECORE_VERSION} install_enn_backend -Enqueue_device_request + +if [[ "${DEVICE_CONNECT_ENABLED}" == "1" ]]; then + install_devicefarm_cli "${DEVICEFARM_CLI_VERSION}" + acquire_device +else + export DEVICE_ACQUIRED=0 +fi diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 5a006e2e751..eb09a1c8aa2 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1011,13 +1011,9 @@ jobs: # Setup Samsung SDK (AI Lite Core) and install enn backend export SAMSUNG_AI_LITECORE_KEY=$SECRET_SAMSUNG_AI_LITECORE_KEY - source .ci/scripts/setup-samsung-linux-deps.sh - # Check if device was reserved - if [[ "${DEVICE_RESERVED:-0}" != "1" ]]; then - echo "::warning::Skipping tests - no Samsung device available" - exit 0 - fi + trap 'bash .ci/scripts/cleanup-samsung-linux-deps.sh' EXIT TERM INT + source .ci/scripts/setup-samsung-linux-deps.sh --skip-device-connect # Test quant models model_scripts="deeplab_v3 edsr inception_v3 inception_v4 mobilenet_v2 mobilenet_v3 resnet18 resnet50 vit wav2letter" @@ -1052,10 +1048,11 @@ jobs: # Setup Samsung SDK (AI Lite Core) and install enn backend export SAMSUNG_AI_LITECORE_KEY=$SECRET_SAMSUNG_AI_LITECORE_KEY + trap 'bash .ci/scripts/cleanup-samsung-linux-deps.sh' EXIT TERM INT source .ci/scripts/setup-samsung-linux-deps.sh # Check if device was reserved - if [[ "${DEVICE_RESERVED:-0}" != "1" ]]; then + if [[ "${DEVICE_ACQUIRED:-0}" != "1" ]]; then echo "::warning::Skipping tests - no Samsung device available" exit 0 fi From 5cd350459fd6ec06dcbde618f2bc3fcc9a6a0d86 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Tue, 10 Feb 2026 10:36:14 -0800 Subject: [PATCH 10/16] Add death test matchers to verify assertion messages (#16543) ### Summary Added message matchers to death tests in 
2 test files to verify tests fail with the expected error messages, not just that they fail. evalue_test.cpp (18 matchers): - Type checks: "EValue is not an int", "EValue is not a" - Null pointer checks: "Pointer is null", "pointer cannot be null" - List pointer checks: "string/int/bool/double/tensor list pointer is null" - BoxedEvalueList checks: "wrapped_vals/unwrapped_vals cannot be null" tensor_util_test.cpp (29 matchers): - Shape/dtype mismatches: "Tensors do not match" - Dimension validation: "Ending/Starting dimension.*should be in the range" - Empty matchers for stride checks (Windows regex limitations) Note: Matchers use only cross-platform compatible regex features (no brackets, unions, or grouping which fail on Windows). ### Test plan ``` ./test/run_oss_cpp_tests.sh ``` --- .../exec_aten/util/test/tensor_util_test.cpp | 61 ++++++++++++------- runtime/core/test/evalue_test.cpp | 45 ++++++++------ 2 files changed, 65 insertions(+), 41 deletions(-) diff --git a/runtime/core/exec_aten/util/test/tensor_util_test.cpp b/runtime/core/exec_aten/util/test/tensor_util_test.cpp index 842f2341235..4c0d9404234 100644 --- a/runtime/core/exec_aten/util/test/tensor_util_test.cpp +++ b/runtime/core/exec_aten/util/test/tensor_util_test.cpp @@ -65,15 +65,15 @@ TEST_F(TensorUtilTest, SameShapesDifferentDtypes) { ET_CHECK_SAME_SHAPE3(a, b, c); // Not the same dtypes. Check both positions. - ET_EXPECT_DEATH(ET_CHECK_SAME_DTYPE2(a, b), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_DTYPE2(b, a), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE2(a, b), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE2(b, a), ""); + ET_EXPECT_DEATH(ET_CHECK_SAME_DTYPE2(a, b), "Tensors do not match"); + ET_EXPECT_DEATH(ET_CHECK_SAME_DTYPE2(b, a), "Tensors do not match"); + ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE2(a, b), "Tensors do not match"); + ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE2(b, a), "Tensors do not match"); // Test with a mismatching tensor in all positions, where the other two agree. - ET_EXPECT_DEATH(ET_CHECK_SAME_DTYPE3(a, b, b), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_DTYPE3(b, a, b), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_DTYPE3(b, b, a), ""); + ET_EXPECT_DEATH(ET_CHECK_SAME_DTYPE3(a, b, b), "Tensors do not match"); + ET_EXPECT_DEATH(ET_CHECK_SAME_DTYPE3(b, a, b), "Tensors do not match"); + ET_EXPECT_DEATH(ET_CHECK_SAME_DTYPE3(b, b, a), "Tensors do not match"); ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE3(a, b, b), ""); ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE3(b, a, b), ""); ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE3(b, b, a), ""); @@ -88,13 +88,13 @@ TEST_F(TensorUtilTest, DifferentShapesSameDtypes) { Tensor b2 = tf_int_.ones({2, 2}); // The different tensors are not the same shape. Check both positions. - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE2(a, b), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE2(b, a), ""); + ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE2(a, b), "Tensors do not match"); + ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE2(b, a), "Tensors do not match"); // Test with the different tensor in all positions. - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE3(a, b, b2), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE3(b, a, b2), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE3(b, b2, a), ""); + ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE3(a, b, b2), "Tensors do not match"); + ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE3(b, a, b2), "Tensors do not match"); + ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE3(b, b2, a), "Tensors do not match"); // They are the same dtypes. 
ET_CHECK_SAME_DTYPE2(a, b); @@ -104,11 +104,14 @@ TEST_F(TensorUtilTest, DifferentShapesSameDtypes) { ET_CHECK_SAME_DTYPE3(b, b2, a); // But not the same shape-and-dtype. - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE2(a, b), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE2(b, a), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE3(a, b, b2), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE3(b, a, b2), ""); - ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE3(b, b2, a), ""); + ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE2(a, b), "Tensors do not match"); + ET_EXPECT_DEATH(ET_CHECK_SAME_SHAPE_AND_DTYPE2(b, a), "Tensors do not match"); + ET_EXPECT_DEATH( + ET_CHECK_SAME_SHAPE_AND_DTYPE3(a, b, b2), "Tensors do not match"); + ET_EXPECT_DEATH( + ET_CHECK_SAME_SHAPE_AND_DTYPE3(b, a, b2), "Tensors do not match"); + ET_EXPECT_DEATH( + ET_CHECK_SAME_SHAPE_AND_DTYPE3(b, b2, a), "Tensors do not match"); } TEST_F(TensorUtilTest, ZeroDimensionalTensor) { @@ -163,9 +166,15 @@ TEST_F(TensorUtilTest, GetLeadingDimsInputOutOfBoundDies) { Tensor t = tf_int_.ones({2, 3, 4}); // dim needs to be in the range [0, t.dim()] - ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, -2), ""); - ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, -1), ""); - ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, 4), ""); + ET_EXPECT_DEATH( + executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, -2), + "Ending dimension.*should be in the range"); + ET_EXPECT_DEATH( + executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, -1), + "Ending dimension.*should be in the range"); + ET_EXPECT_DEATH( + executorch::ET_RUNTIME_NAMESPACE::getLeadingDims(t, 4), + "Ending dimension.*should be in the range"); } TEST_F(TensorUtilTest, GetTrailingDimsSmokeTest) { @@ -187,9 +196,15 @@ TEST_F(TensorUtilTest, GetTrailingDimsInputOutOfBoundDies) { Tensor t = tf_int_.ones({2, 3, 4}); // dim needs to be in the range [-1, t.dim() - 1) - ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, -2), ""); - ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 3), ""); - ET_EXPECT_DEATH(executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 4), ""); + ET_EXPECT_DEATH( + executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, -2), + "Starting dimension.*should be in the range"); + ET_EXPECT_DEATH( + executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 3), + "Starting dimension.*should be in the range"); + ET_EXPECT_DEATH( + executorch::ET_RUNTIME_NAMESPACE::getTrailingDims(t, 4), + "Starting dimension.*should be in the range"); } TEST_F(TensorUtilTest, ContiguousCheckSupported) { diff --git a/runtime/core/test/evalue_test.cpp b/runtime/core/test/evalue_test.cpp index 9e91ad70a0b..edf6a1b12c1 100644 --- a/runtime/core/test/evalue_test.cpp +++ b/runtime/core/test/evalue_test.cpp @@ -89,7 +89,7 @@ TEST_F(EValueTest, TypeMismatchFatals) { auto e = EValue(true); e.toInt(); }, - ""); + "EValue is not an int"); } TEST_F(EValueTest, NoneByDefault) { @@ -279,38 +279,45 @@ TEST_F(EValueTest, ConstructFromTensorWrapper) { TEST_F(EValueTest, ConstructFromNullPtrAborts) { std::unique_ptr null_ptr; - ET_EXPECT_DEATH({ EValue evalue(null_ptr); }, ""); + ET_EXPECT_DEATH({ EValue evalue(null_ptr); }, "Pointer is null"); } TEST_F(EValueTest, StringConstructorNullCheck) { executorch::aten::ArrayRef* null_string_ptr = nullptr; - ET_EXPECT_DEATH({ EValue evalue(null_string_ptr); }, ""); + ET_EXPECT_DEATH( + { EValue evalue(null_string_ptr); }, "pointer cannot be null"); } TEST_F(EValueTest, 
BoolListConstructorNullCheck) { executorch::aten::ArrayRef* null_bool_list_ptr = nullptr; - ET_EXPECT_DEATH({ EValue evalue(null_bool_list_ptr); }, ""); + ET_EXPECT_DEATH( + { EValue evalue(null_bool_list_ptr); }, "pointer cannot be null"); } TEST_F(EValueTest, DoubleListConstructorNullCheck) { executorch::aten::ArrayRef* null_double_list_ptr = nullptr; - ET_EXPECT_DEATH({ EValue evalue(null_double_list_ptr); }, ""); + ET_EXPECT_DEATH( + { EValue evalue(null_double_list_ptr); }, "pointer cannot be null"); } TEST_F(EValueTest, IntListConstructorNullCheck) { BoxedEvalueList* null_int_list_ptr = nullptr; - ET_EXPECT_DEATH({ EValue evalue(null_int_list_ptr); }, ""); + ET_EXPECT_DEATH( + { EValue evalue(null_int_list_ptr); }, "pointer cannot be null"); } TEST_F(EValueTest, TensorListConstructorNullCheck) { BoxedEvalueList* null_tensor_list_ptr = nullptr; - ET_EXPECT_DEATH({ EValue evalue(null_tensor_list_ptr); }, ""); + ET_EXPECT_DEATH( + { EValue evalue(null_tensor_list_ptr); }, "pointer cannot be null"); } TEST_F(EValueTest, OptionalTensorListConstructorNullCheck) { BoxedEvalueList>* null_optional_tensor_list_ptr = nullptr; - ET_EXPECT_DEATH({ EValue evalue(null_optional_tensor_list_ptr); }, ""); + ET_EXPECT_DEATH( + { EValue evalue(null_optional_tensor_list_ptr); }, + "pointer cannot be null"); } TEST_F(EValueTest, BoxedEvalueListConstructorNullChecks) { @@ -321,16 +328,18 @@ TEST_F(EValueTest, BoxedEvalueListConstructorNullChecks) { // Test null wrapped_vals ET_EXPECT_DEATH( - { BoxedEvalueList list(nullptr, storage.data(), 3); }, ""); + { BoxedEvalueList list(nullptr, storage.data(), 3); }, + "wrapped_vals cannot be null"); // Test null unwrapped_vals ET_EXPECT_DEATH( - { BoxedEvalueList list(values_p.data(), nullptr, 3); }, ""); + { BoxedEvalueList list(values_p.data(), nullptr, 3); }, + "unwrapped_vals cannot be null"); // Test negative size ET_EXPECT_DEATH( { BoxedEvalueList list(values_p.data(), storage.data(), -1); }, - ""); + "size cannot be negative"); } TEST_F(EValueTest, toListOptionalTensorTypeCheck) { @@ -340,7 +349,7 @@ TEST_F(EValueTest, toListOptionalTensorTypeCheck) { EXPECT_FALSE(e.isListOptionalTensor()); // Should fail type check - ET_EXPECT_DEATH({ e.toListOptionalTensor(); }, ""); + ET_EXPECT_DEATH({ e.toListOptionalTensor(); }, "EValue is not a"); } TEST_F(EValueTest, toStringNullPointerCheck) { @@ -351,7 +360,7 @@ TEST_F(EValueTest, toStringNullPointerCheck) { // Should pass isString() check but fail null pointer check EXPECT_TRUE(e.isString()); - ET_EXPECT_DEATH({ e.toString(); }, ""); + ET_EXPECT_DEATH({ e.toString(); }, "string pointer is null"); } TEST_F(EValueTest, toIntListNullPointerCheck) { @@ -362,7 +371,7 @@ TEST_F(EValueTest, toIntListNullPointerCheck) { // Should pass isIntList() check but fail null pointer check EXPECT_TRUE(e.isIntList()); - ET_EXPECT_DEATH({ e.toIntList(); }, ""); + ET_EXPECT_DEATH({ e.toIntList(); }, "int list pointer is null"); } TEST_F(EValueTest, toBoolListNullPointerCheck) { @@ -373,7 +382,7 @@ TEST_F(EValueTest, toBoolListNullPointerCheck) { // Should pass isBoolList() check but fail null pointer check EXPECT_TRUE(e.isBoolList()); - ET_EXPECT_DEATH({ e.toBoolList(); }, ""); + ET_EXPECT_DEATH({ e.toBoolList(); }, "bool list pointer is null"); } TEST_F(EValueTest, toDoubleListNullPointerCheck) { @@ -384,7 +393,7 @@ TEST_F(EValueTest, toDoubleListNullPointerCheck) { // Should pass isDoubleList() check but fail null pointer check EXPECT_TRUE(e.isDoubleList()); - ET_EXPECT_DEATH({ e.toDoubleList(); }, ""); + ET_EXPECT_DEATH({ 
e.toDoubleList(); }, "double list pointer is null"); } TEST_F(EValueTest, toTensorListNullPointerCheck) { @@ -395,7 +404,7 @@ TEST_F(EValueTest, toTensorListNullPointerCheck) { // Should pass isTensorList() check but fail null pointer check EXPECT_TRUE(e.isTensorList()); - ET_EXPECT_DEATH({ e.toTensorList(); }, ""); + ET_EXPECT_DEATH({ e.toTensorList(); }, "tensor list pointer is null"); } TEST_F(EValueTest, toListOptionalTensorNullPointerCheck) { @@ -406,5 +415,5 @@ TEST_F(EValueTest, toListOptionalTensorNullPointerCheck) { // Should pass isListOptionalTensor() check but fail null pointer check EXPECT_TRUE(e.isListOptionalTensor()); - ET_EXPECT_DEATH({ e.toListOptionalTensor(); }, ""); + ET_EXPECT_DEATH({ e.toListOptionalTensor(); }, "pointer is null"); } From b7e063ac923a51220b276a8e07619673135151c6 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Tue, 10 Feb 2026 10:44:36 -0800 Subject: [PATCH 11/16] Validate dim_order is a permutation in dim_order_to_stride (#17314) ### Summary The validate_dim_order function only checked that values were in bounds, allowing invalid inputs like {0, 0, 0} to pass. This caused uninitialized memory access in dim_order_to_stride_nocheck. Fix by using a bitmask to detect duplicates. Also adds test fixture with runtime_init() for error logging and removes duplicate include. ### Test plan ``` ./test/run_oss_cpp_tests.sh ``` --------- Co-authored-by: Claude --- runtime/core/exec_aten/util/dim_order_util.h | 16 +++- .../util/test/dim_order_util_test.cpp | 73 +++++++++++++++++-- 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/runtime/core/exec_aten/util/dim_order_util.h b/runtime/core/exec_aten/util/dim_order_util.h index d01b2079ab2..ea568276a8c 100644 --- a/runtime/core/exec_aten/util/dim_order_util.h +++ b/runtime/core/exec_aten/util/dim_order_util.h @@ -8,13 +8,13 @@ #pragma once -#include #include #include #include #include #include +#include #include #include @@ -24,10 +24,22 @@ namespace runtime { namespace { template bool validate_dim_order(const DimOrderType* dim_order, const size_t dims) { + static_assert( + kTensorDimensionLimit <= 16, + "Bitmask-based validation requires kTensorDimensionLimit <= 16"); + if (dims > kTensorDimensionLimit) { + return false; + } + uint16_t seen = 0; for (const auto i : c10::irange(dims)) { if (dim_order[i] >= static_cast(dims)) { return false; } + const uint16_t mask = 1u << dim_order[i]; + if (seen & mask) { + return false; + } + seen |= mask; } return true; } @@ -150,7 +162,7 @@ ET_NODISCARD inline Error dim_order_to_stride( ET_CHECK_OR_RETURN_ERROR( validate_dim_order(dim_order, dims), InvalidArgument, - "Invalid dim order. 
One of the value is larger than the number of dims %zu", + "Invalid dim order: values must be a permutation of [0, %zu)", dims); dim_order_to_stride_nocheck(sizes, dim_order, dims, strides); diff --git a/runtime/core/exec_aten/util/test/dim_order_util_test.cpp b/runtime/core/exec_aten/util/test/dim_order_util_test.cpp index fe3c1f6d82a..610a4611cda 100644 --- a/runtime/core/exec_aten/util/test/dim_order_util_test.cpp +++ b/runtime/core/exec_aten/util/test/dim_order_util_test.cpp @@ -12,6 +12,7 @@ #include #include +#include #include @@ -21,6 +22,15 @@ using executorch::runtime::is_channels_last_dim_order; using executorch::runtime::is_contiguous_dim_order; using executorch::runtime::stride_to_dim_order; +class DimOrderUtilTest : public ::testing::Test { + protected: + void SetUp() override { + // As some of these tests cause ET_LOG to be called, the PAL must be + // initialized first by calling runtime_init(); + executorch::runtime::runtime_init(); + } +}; + namespace { void check_strides_eq( executorch::aten::ArrayRef strides_a, @@ -39,7 +49,7 @@ void check_dim_order_eq( } } // namespace -TEST(DimOrderUtilTest, DimOrderToStride) { +TEST_F(DimOrderUtilTest, DimOrderToStride) { executorch::aten::SizesType sizes_1[1] = {5}; executorch::aten::SizesType dim_order_1[1] = {0}; executorch::aten::SizesType strides_1[1] = {0}; @@ -204,7 +214,7 @@ TEST(DimOrderUtilTest, DimOrderToStride) { check_strides_eq({strides_3_zero, 3}, {expected_strides_3_zero, 3}); } -TEST(DimOrderUtilTest, StrideToDimOrder) { +TEST_F(DimOrderUtilTest, StrideToDimOrder) { executorch::aten::SizesType strides[3] = {5, 1, 15}; executorch::aten::DimOrderType dim_order[3] = {0, 0, 0}; @@ -216,7 +226,7 @@ TEST(DimOrderUtilTest, StrideToDimOrder) { check_dim_order_eq(dim_order, expected_dim_order); } -TEST(DimOrderUtilTest, StrideToDimOrderSameStrides) { +TEST_F(DimOrderUtilTest, StrideToDimOrderSameStrides) { executorch::aten::SizesType strides[4] = {4, 3, 1, 1}; executorch::aten::DimOrderType dim_order[4] = {0, 0, 0, 0}; @@ -227,7 +237,7 @@ TEST(DimOrderUtilTest, StrideToDimOrderSameStrides) { check_dim_order_eq(dim_order, expected_dim_order); } -TEST(DimOrderUtilTest, IsDefaultDimOrderTest) { +TEST_F(DimOrderUtilTest, IsDefaultDimOrderTest) { for (const auto i : c10::irange(1, 7)) { std::vector dim_order(i); std::iota(dim_order.begin(), dim_order.end(), 0); @@ -240,7 +250,7 @@ TEST(DimOrderUtilTest, IsDefaultDimOrderTest) { } } -TEST(DimOrderUtilTest, IsDefaultDimOrderFailCasesTest) { +TEST_F(DimOrderUtilTest, IsDefaultDimOrderFailCasesTest) { // Dims is default order but have two elements swapped for (const auto i : c10::irange(3, 8)) { std::vector dim_order(i); @@ -261,7 +271,7 @@ TEST(DimOrderUtilTest, IsDefaultDimOrderFailCasesTest) { } } -TEST(DimOrderUtilTest, IsChannelsLastDimOrderTest) { +TEST_F(DimOrderUtilTest, IsChannelsLastDimOrderTest) { executorch::aten::DimOrderType dim_order_4d[4] = {0, 2, 3, 1}; executorch::aten::DimOrderType dim_order_5d[5] = {0, 2, 3, 4, 1}; @@ -273,7 +283,7 @@ TEST(DimOrderUtilTest, IsChannelsLastDimOrderTest) { EXPECT_FALSE(is_contiguous_dim_order(dim_order_5d, 5)); } -TEST(DimOrderUtilTest, IsChannelsLastDimOrderFailCasesTest) { +TEST_F(DimOrderUtilTest, IsChannelsLastDimOrderFailCasesTest) { // Non 4D and 5D dim order returns false executorch::aten::DimOrderType dim_order_3d[4] = {1, 2, 0}; executorch::aten::DimOrderType dim_order_6d[6] = {0, 2, 3, 4, 5, 1}; @@ -287,3 +297,52 @@ TEST(DimOrderUtilTest, IsChannelsLastDimOrderFailCasesTest) { 
EXPECT_FALSE(is_channels_last_dim_order(dim_order_4d, 4)); EXPECT_FALSE(is_channels_last_dim_order(dim_order_5d, 5)); } + +TEST_F(DimOrderUtilTest, DimOrderWithAllDuplicatesReturnsError) { + executorch::aten::SizesType sizes[3] = {2, 3, 4}; + executorch::aten::SizesType dim_order[3] = {0, 0, 0}; + executorch::aten::SizesType strides[3] = {0, 0, 0}; + + auto error = dim_order_to_stride(sizes, dim_order, 3, strides); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(DimOrderUtilTest, DimOrderWithPartialDuplicateReturnsError) { + executorch::aten::SizesType sizes[3] = {2, 3, 4}; + executorch::aten::SizesType dim_order[3] = {0, 1, 1}; + executorch::aten::SizesType strides[3] = {0, 0, 0}; + + auto error = dim_order_to_stride(sizes, dim_order, 3, strides); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(DimOrderUtilTest, DimOrderWithMissingValueReturnsError) { + executorch::aten::SizesType sizes[3] = {2, 3, 4}; + executorch::aten::SizesType dim_order[3] = {1, 2, 2}; + executorch::aten::SizesType strides[3] = {0, 0, 0}; + + auto error = dim_order_to_stride(sizes, dim_order, 3, strides); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(DimOrderUtilTest, DimOrderWithOutOfBoundsValueReturnsError) { + executorch::aten::SizesType sizes[3] = {2, 3, 4}; + executorch::aten::SizesType dim_order[3] = {0, 1, 5}; + executorch::aten::SizesType strides[3] = {0, 0, 0}; + + auto error = dim_order_to_stride(sizes, dim_order, 3, strides); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(DimOrderUtilTest, TooManyDimsReturnsError) { + constexpr size_t kTooManyDims = + executorch::runtime::kTensorDimensionLimit + 1; + std::vector sizes(kTooManyDims, 1); + std::vector dim_order(kTooManyDims); + std::iota(dim_order.begin(), dim_order.end(), 0); + std::vector strides(kTooManyDims, 0); + + auto error = dim_order_to_stride( + sizes.data(), dim_order.data(), kTooManyDims, strides.data()); + EXPECT_EQ(error, Error::InvalidArgument); +} From 23de89370ab9e2974249a63010b9250026aedb08 Mon Sep 17 00:00:00 2001 From: ssjia Date: Tue, 10 Feb 2026 07:24:59 -0800 Subject: [PATCH 12/16] [ET-VK] Fix missing memory barrier for first-use writes on aliased tensors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull Request resolved: https://github.com/pytorch/executorch/pull/17309 Tensors sharing physical memory via SharedObject each track their own `last_access_` independently. When a tensor's first access is a write, `prev_stage` is `NO_STAGE`, causing `transition()` to use `TOP_OF_PIPE_BIT` as `srcStageMask` with no `srcAccessMask` — effectively a no-op barrier. If the same physical memory was previously written through a different aliased tensor handle, this creates a WAW hazard where the new write may execute before or concurrently with the prior write, producing non-deterministic results. This was observed as non-deterministic q8ta_conv2d output in ResNet50: running the model twice with the same input produced slightly different quantized int8 values. Adding a debug print shader after each conv2d dispatch masked the issue because the print node's read-after-write barrier serialized GPU work. The fix: when `prev_stage` is `NO_STAGE` and the current access is a write, use `COMPUTE_SHADER_BIT` with `SHADER_WRITE_BIT` instead of `TOP_OF_PIPE_BIT` with no access flags. This ensures all prior compute shader work completes and its writes are made visible before the new write begins. Authored with Claude. 
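
(Reviewer note, not part of the exported diff: a self-contained sketch of the hazard and the conservative barrier described above. The function and variable names are illustrative; the real logic lives in vTensorStorage::transition in the diff below.)

```cpp
// Illustrative sketch only (assumed names; not ExecuTorch code).
// Two tensor handles alias one VkDeviceMemory via a SharedObject, and each
// tracks its own last access. When the first access through handle B is a
// write, B's state alone yields srcStage = TOP_OF_PIPE with no srcAccessMask,
// i.e. no ordering against the write already submitted through handle A.
#include <vulkan/vulkan.h>

void barrier_before_first_write(VkCommandBuffer cmd) {
  VkMemoryBarrier barrier{};
  barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
  // Conservative fix: assume prior compute-shader writes to the aliased
  // memory may exist and make them visible before the new write begins.
  barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
  barrier.dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
  vkCmdPipelineBarrier(
      cmd,
      VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // src: all prior compute work
      VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dst: the upcoming write
      0,
      1, &barrier,
      0, nullptr,
      0, nullptr);
}
```
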
ghstack-source-id: 339884030 @exported-using-ghexport Differential Revision: [D92715369](https://our.internmc.facebook.com/intern/diff/D92715369/) --- .../vulkan/runtime/api/containers/Tensor.cpp | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 2aa533f7aa8..47cefa1031a 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -775,9 +775,22 @@ void vTensorStorage::transition( // RAR: no need for synchronization if (prev_written || cur_written || layout_changed) { VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage); + VkAccessFlags src_access = vkapi::vk_access(prev_stage, prev_access); + if (0u == src_stage) { - src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + if (cur_written) { + // First access through this tensor handle, and it's a write. The + // underlying memory may have been previously written through a + // different aliased tensor handle (via SharedObject). Wait for all + // prior compute work and make those writes available to prevent WAW + // hazards on aliased memory. + src_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + src_access = VK_ACCESS_SHADER_WRITE_BIT; + } else { + src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + } } + VkPipelineStageFlags dst_stage = vkapi::vk_stage(cur_stage); if (0u == dst_stage) { dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; @@ -786,20 +799,15 @@ void vTensorStorage::transition( pipeline_barrier.stage.src |= src_stage; pipeline_barrier.stage.dst |= dst_stage; + VkAccessFlags dst_access = vkapi::vk_access(cur_stage, cur_access); + if (image_) { pipeline_barrier.images.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - cur_layout, - new_layout, - image_); + src_access, dst_access, cur_layout, new_layout, image_); image_.set_layout(new_layout); } else if (buffer_) { - pipeline_barrier.buffers.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - buffer_); + pipeline_barrier.buffers.emplace_back(src_access, dst_access, buffer_); } } From aed50d134a1e822c1745af364653b64864e80f36 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 11 Feb 2026 00:56:37 -0800 Subject: [PATCH 13/16] consolidate cuda stream --- backends/aoti/aoti_delegate_handle.h | 2 - backends/cuda/runtime/TARGETS | 3 + backends/cuda/runtime/cuda_backend.cpp | 96 ++++++++------------ backends/cuda/runtime/cuda_delegate_handle.h | 66 ++++++++++++++ 4 files changed, 107 insertions(+), 60 deletions(-) create mode 100644 backends/cuda/runtime/cuda_delegate_handle.h diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h index 862bcb3bf02..2bc6abf9bd1 100644 --- a/backends/aoti/aoti_delegate_handle.h +++ b/backends/aoti/aoti_delegate_handle.h @@ -84,8 +84,6 @@ struct AOTIDelegateHandle { void* so_handle; std::string so_path; AOTInductorModelContainerHandle container_handle; - void* cuda_stream; // Per-handle CUDA stream. If nullptr, use backend's shared - // stream instead (for skip-copy optimization). 
std::string method_name; // Function pointers specific to this handle's shared library diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index 173fb95a399..5d1bdff4b0f 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -95,6 +95,9 @@ runtime.cxx_library( srcs = [ "cuda_backend.cpp", ], + headers = [ + "cuda_delegate_handle.h", + ], # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) link_whole = True, supports_python_dlopen = True, diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 297994ccbe1..ce4566d28d4 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -37,6 +37,7 @@ // Include our shim layer headers #include +#include #include #include #include @@ -151,19 +152,16 @@ class ET_EXPERIMENTAL CudaBackend final if (shared_cuda_stream_ != nullptr) { return; // Already created } - cudaError_t err = cudaStreamCreate(&shared_cuda_stream_); - if (err != cudaSuccess) { - ET_LOG( - Error, - "Failed to create shared CUDA stream: %s", - cudaGetErrorString(err)); + shared_cuda_stream_ = cuda::create_shared_cuda_stream(); + if (shared_cuda_stream_ == nullptr) { + ET_LOG(Error, "Failed to create shared CUDA stream"); return; } - ET_LOG(Info, "Created shared CUDA stream: %p", shared_cuda_stream_); + ET_LOG(Info, "Created shared CUDA stream: %p", *shared_cuda_stream_); } // Get the shared CUDA stream. Returns nullptr if not in shared mode. - cudaStream_t get_shared_cuda_stream() const { + cuda::SharedCudaStream get_shared_cuda_stream() const { std::lock_guard guard(cuda_stream_mutex_); return shared_cuda_stream_; } @@ -212,18 +210,6 @@ class ET_EXPERIMENTAL CudaBackend final } public: - // Destructor: clean up the shared CUDA stream if it was created. - ~CudaBackend() { - if (shared_cuda_stream_ != nullptr) { - cudaError_t err = cudaStreamDestroy(shared_cuda_stream_); - if (err != cudaSuccess) { - ET_LOG( - Error, - "Failed to destroy shared CUDA stream: %s", - cudaGetErrorString(err)); - } - } - } bool is_available() const override { return 1; @@ -335,7 +321,7 @@ class ET_EXPERIMENTAL CudaBackend final processed->Free(); // Create handle and load function pointers into it - AOTIDelegateHandle* handle = new AOTIDelegateHandle(); + cuda::CudaDelegateHandle* handle = new cuda::CudaDelegateHandle(); handle->so_handle = lib_handle; handle->so_path = so_path.string(); handle->method_name = method_name; @@ -371,20 +357,24 @@ class ET_EXPERIMENTAL CudaBackend final // A shared stream ensures proper ordering across multiple methods // (e.g., encoder, decoder, sampler) when using skip-copy optimization. if (is_using_shared_cuda_stream()) { - // Shared stream mode: set handle's stream to nullptr. - // The stream will be retrieved from backend in execute(). - handle->cuda_stream = nullptr; + // Shared stream mode: all handles share the same stream. + handle->cuda_stream = get_shared_cuda_stream(); ET_LOG( - Info, "Using shared CUDA stream for method %s", method_name.c_str()); + Info, + "Using shared CUDA stream %p for method %s", + handle->get_cuda_stream(), + method_name.c_str()); } else { // Per-handle stream mode: each handle owns its own stream. 
- cudaStream_t cuda_stream; - ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream)); - handle->cuda_stream = static_cast(cuda_stream); + handle->cuda_stream = cuda::create_shared_cuda_stream(); + if (handle->cuda_stream == nullptr) { + delete handle; + return Error::Internal; + } ET_LOG( Info, "Created new CUDA stream %p for method %s", - handle->cuda_stream, + handle->get_cuda_stream(), method_name.c_str()); } @@ -396,7 +386,7 @@ class ET_EXPERIMENTAL CudaBackend final BackendExecutionContext& context, DelegateHandle* handle_, Span args) const override { - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + cuda::CudaDelegateHandle* handle = (cuda::CudaDelegateHandle*)handle_; size_t n_inputs; handle->get_num_inputs(handle->container_handle, &n_inputs); @@ -477,11 +467,8 @@ class ET_EXPERIMENTAL CudaBackend final // it's shared with the Metal backend. Instead, we reinterpret_cast // SlimTensor* to Tensor* // - // Get the CUDA stream: use handle's stream if set, otherwise get from - // backend's shared stream. - cudaStream_t cuda_stream = handle->cuda_stream != nullptr - ? static_cast(handle->cuda_stream) - : get_shared_cuda_stream(); + // Get the CUDA stream from the handle. + cudaStream_t cuda_stream = handle->get_cuda_stream(); AOTIRuntimeError error = handle->run( handle->container_handle, reinterpret_cast(gpu_inputs.data()), @@ -502,12 +489,12 @@ class ET_EXPERIMENTAL CudaBackend final if (copy_outputs) { // Synchronize CUDA stream before D2H copy. This is required because // cudaMemcpy is not stream-ordered and needs the kernel to complete. - cudaError_t sync_err = cudaStreamSynchronize(cuda_stream); - ET_CHECK_OR_RETURN_ERROR( - sync_err == cudaSuccess, - Internal, - "cudaStreamSynchronize failed: %s", - cudaGetErrorString(sync_err)); + // cudaError_t sync_err = cudaStreamSynchronize(cuda_stream); + // ET_CHECK_OR_RETURN_ERROR( + // sync_err == cudaSuccess, + // Internal, + // "cudaStreamSynchronize failed: %s", + // cudaGetErrorString(sync_err)); // Deep copy GPU SlimTensor results back to CPU ETensors for (size_t i = 0; i < n_outputs; i++) { @@ -564,7 +551,7 @@ class ET_EXPERIMENTAL CudaBackend final if (handle_ == nullptr) { return; } - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + cuda::CudaDelegateHandle* handle = (cuda::CudaDelegateHandle*)handle_; // Clean up cached output tensors for this handle { @@ -576,18 +563,10 @@ class ET_EXPERIMENTAL CudaBackend final } } - // Destroy the CUDA stream only if this handle owns it (non-null). - // When cuda_stream is nullptr, the handle uses the backend's shared - // stream which is managed by the backend singleton via shared_ptr. - if (handle->cuda_stream != nullptr) { - cudaStream_t cuda_stream = static_cast(handle->cuda_stream); - cudaError_t stream_err = cudaStreamDestroy(cuda_stream); - ET_CHECK_OR_LOG_ERROR( - stream_err == cudaSuccess, - "Failed to destroy CUDA stream: %s", - cudaGetErrorString(stream_err)); - handle->cuda_stream = nullptr; - } + // The CUDA stream is managed by shared_ptr in the handle. + // It will be automatically destroyed when the last handle using it + // is destroyed. Just reset our reference. + handle->cuda_stream.reset(); // NOTE: AOTInductorModelContainerDelete does not work correctly with // multiple .so files. Deleting one container frees shared resources, @@ -627,16 +606,17 @@ class ET_EXPERIMENTAL CudaBackend final // Shared CUDA stream for all methods. 
When set (non-null), all methods use // the same stream to ensure proper ordering (critical for skip-copy // optimization). Created when use_shared_cuda_stream option is set to true. - // Cleaned up in destructor. + // Managed via shared_ptr so it's automatically cleaned up when last handle + // is destroyed. mutable std::mutex cuda_stream_mutex_; - cudaStream_t shared_cuda_stream_ = nullptr; + cuda::SharedCudaStream shared_cuda_stream_ = nullptr; // Cached output tensors for skip-copy optimization. // When skip-copy is enabled, output SlimTensors are cached here to keep // the underlying GPU memory alive while the caller processes the results. - // Maps each AOTIDelegateHandle* to its vector of cached output tensors. + // Maps each CudaDelegateHandle* to its vector of cached output tensors. mutable std::mutex cached_outputs_mutex_; - mutable std::unordered_map> + mutable std::unordered_map> cached_outputs_; }; diff --git a/backends/cuda/runtime/cuda_delegate_handle.h b/backends/cuda/runtime/cuda_delegate_handle.h new file mode 100644 index 00000000000..986718ffdf5 --- /dev/null +++ b/backends/cuda/runtime/cuda_delegate_handle.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace executorch { +namespace backends { +namespace cuda { + +// Shared CUDA stream wrapper with proper RAII cleanup. +// This ensures the stream is destroyed when all handles using it are destroyed. +struct CudaStreamDeleter { + void operator()(cudaStream_t* stream) const { + if (stream != nullptr && *stream != nullptr) { + cudaStreamDestroy(*stream); + } + delete stream; + } +}; + +using SharedCudaStream = std::shared_ptr; + +// Creates a new shared CUDA stream. +// Returns nullptr on failure. +inline SharedCudaStream create_shared_cuda_stream() { + cudaStream_t stream; + cudaError_t err = cudaStreamCreate(&stream); + if (err != cudaSuccess) { + return nullptr; + } + return SharedCudaStream(new cudaStream_t(stream), CudaStreamDeleter()); +} + +// CUDA-specific delegate handle that extends AOTIDelegateHandle. +// This consolidates CUDA stream management into a single location. +struct CudaDelegateHandle : public aoti::AOTIDelegateHandle { + // Shared CUDA stream for this handle. + // When multiple handles share the same stream (e.g., for skip-copy + // optimization), they will all hold a reference to the same shared_ptr. + // The stream is automatically destroyed when the last handle is destroyed. + SharedCudaStream cuda_stream; + + // Get the raw CUDA stream pointer for use in CUDA API calls. + // Returns nullptr if no stream is set. + cudaStream_t get_cuda_stream() const { + return cuda_stream ? *cuda_stream : nullptr; + } + + // Check if this handle has a valid CUDA stream. 
+ bool has_cuda_stream() const { + return cuda_stream != nullptr && *cuda_stream != nullptr; + } +}; + +} // namespace cuda +} // namespace backends +} // namespace executorch From bddcb88279a103a7042d31334d8385cf9e718e70 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 11 Feb 2026 01:08:37 -0800 Subject: [PATCH 14/16] reformat --- backends/cuda/runtime/cuda_backend.cpp | 67 +++++++++++++------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index ce4566d28d4..3b6f67d9d82 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -210,7 +210,6 @@ class ET_EXPERIMENTAL CudaBackend final } public: - bool is_available() const override { return 1; } @@ -411,21 +410,21 @@ class ET_EXPERIMENTAL CudaBackend final // Process input tensors: convert ETensor (CPU) to SlimTensor (GPU) for (size_t i = 0; i < n_inputs; i++) { auto* cpu_tensor = &(args[i]->toTensor()); - - // Check if input data is already on GPU (skip-copy optimization for - // inputs) This can happen when the caller has pre-staged data on GPU - cudaPointerAttributes attributes{}; const void* data_ptr = cpu_tensor->const_data_ptr(); - if (data_ptr != nullptr) { - cudaError_t err = cudaPointerGetAttributes(&attributes, data_ptr); - if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) { - // Data is already on GPU - wrap it directly without copy - auto sizes = cpu_tensor->sizes(); - auto strides = cpu_tensor->strides(); - std::vector sizes_vec(sizes.begin(), sizes.end()); - std::vector strides_vec(strides.begin(), strides.end()); - - gpu_inputs[i] = new SlimTensor(slim::from_blob( + + // Check if input data is already on GPU by looking up cached outputs. + // This avoids calling cudaPointerGetAttributes which is a sync point. + // If the data pointer matches a cached output tensor, we know it's on + // GPU. + SlimTensor* cached_tensor = find_cached_tensor_by_data_ptr(data_ptr); + if (cached_tensor != nullptr) { + // Data is already on GPU from a previous method's output. + // Wrap it directly without copy using from_blob. + auto sizes = cpu_tensor->sizes(); + auto strides = cpu_tensor->strides(); + std::vector sizes_vec(sizes.begin(), sizes.end()); + std::vector strides_vec(strides.begin(), strides.end()); + gpu_inputs[i] = new SlimTensor(slim::from_blob( const_cast(data_ptr), slim::makeArrayRef(sizes_vec), slim::makeArrayRef(strides_vec), @@ -487,15 +486,6 @@ class ET_EXPERIMENTAL CudaBackend final const bool copy_outputs = !should_skip_copy_for_method(handle->method_name); if (copy_outputs) { - // Synchronize CUDA stream before D2H copy. This is required because - // cudaMemcpy is not stream-ordered and needs the kernel to complete. - // cudaError_t sync_err = cudaStreamSynchronize(cuda_stream); - // ET_CHECK_OR_RETURN_ERROR( - // sync_err == cudaSuccess, - // Internal, - // "cudaStreamSynchronize failed: %s", - // cudaGetErrorString(sync_err)); - // Deep copy GPU SlimTensor results back to CPU ETensors for (size_t i = 0; i < n_outputs; i++) { auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor()); @@ -509,13 +499,7 @@ class ET_EXPERIMENTAL CudaBackend final } else { // Skip-copy optimization: point ETensor directly to GPU data. // The caller is responsible for handling GPU data directly. - // - // No cudaStreamSynchronize needed here because: - // 1. All operations (kernel, allocations, frees) are on the same stream - // 2. 
cudaFreeAsync is stream-ordered, so CUDA guarantees the kernel - // completes before any memory is freed - // 3. The next execution's operations will also be ordered on this stream - // + // Lifetime management: We cache the newly created GPU tensors and delete // the previous round's tensors, since they are no longer needed. { @@ -616,8 +600,25 @@ class ET_EXPERIMENTAL CudaBackend final // the underlying GPU memory alive while the caller processes the results. // Maps each CudaDelegateHandle* to its vector of cached output tensors. mutable std::mutex cached_outputs_mutex_; - mutable std::unordered_map> - cached_outputs_; + mutable std:: + unordered_map> + cached_outputs_; + + // Finds a cached SlimTensor by data pointer. + // Returns the cached SlimTensor if found, nullptr otherwise. + // This is used to detect if input data is already on GPU from a previous + // method's output, avoiding the need for cudaPointerGetAttributes. + SlimTensor* find_cached_tensor_by_data_ptr(const void* data_ptr) const { + std::lock_guard guard(cached_outputs_mutex_); + for (const auto& [handle, tensors] : cached_outputs_) { + for (SlimTensor* tensor : tensors) { + if (tensor != nullptr && tensor->data_ptr() == data_ptr) { + return tensor; + } + } + } + return nullptr; + } }; } // namespace executorch::backends::cuda From 6916192060d9236c84fe1cb18833b22365889543 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 11 Feb 2026 01:47:36 -0800 Subject: [PATCH 15/16] rename for better clarification --- backends/cuda/runtime/cuda_backend.cpp | 12 ++++++------ backends/cuda/runtime/cuda_delegate_handle.h | 18 +++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 3b6f67d9d82..fc21326b7f5 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -147,12 +147,12 @@ class ET_EXPERIMENTAL CudaBackend final // Create the shared CUDA stream. Called when use_shared_cuda_stream option // is set to true. The presence of shared_cuda_stream_ indicates shared mode. - void create_shared_cuda_stream() { + void create_cuda_stream() { std::lock_guard guard(cuda_stream_mutex_); if (shared_cuda_stream_ != nullptr) { return; // Already created } - shared_cuda_stream_ = cuda::create_shared_cuda_stream(); + shared_cuda_stream_ = cuda::create_cuda_stream(); if (shared_cuda_stream_ == nullptr) { ET_LOG(Error, "Failed to create shared CUDA stream"); return; @@ -161,7 +161,7 @@ class ET_EXPERIMENTAL CudaBackend final } // Get the shared CUDA stream. Returns nullptr if not in shared mode. - cuda::SharedCudaStream get_shared_cuda_stream() const { + std::shared_ptr get_shared_cuda_stream() const { std::lock_guard guard(cuda_stream_mutex_); return shared_cuda_stream_; } @@ -233,7 +233,7 @@ class ET_EXPERIMENTAL CudaBackend final } else if (std::strcmp(option.key, kUseSharedCudaStream) == 0) { if (auto* val = std::get_if(&option.value)) { if (*val) { - create_shared_cuda_stream(); + create_cuda_stream(); } } else { ET_LOG(Error, "Option %s must be a boolean.", kUseSharedCudaStream); @@ -365,7 +365,7 @@ class ET_EXPERIMENTAL CudaBackend final method_name.c_str()); } else { // Per-handle stream mode: each handle owns its own stream. 
- handle->cuda_stream = cuda::create_shared_cuda_stream(); + handle->cuda_stream = cuda::create_cuda_stream(); if (handle->cuda_stream == nullptr) { delete handle; return Error::Internal; @@ -593,7 +593,7 @@ class ET_EXPERIMENTAL CudaBackend final // Managed via shared_ptr so it's automatically cleaned up when last handle // is destroyed. mutable std::mutex cuda_stream_mutex_; - cuda::SharedCudaStream shared_cuda_stream_ = nullptr; + std::shared_ptr shared_cuda_stream_ = nullptr; // Cached output tensors for skip-copy optimization. // When skip-copy is enabled, output SlimTensors are cached here to keep diff --git a/backends/cuda/runtime/cuda_delegate_handle.h b/backends/cuda/runtime/cuda_delegate_handle.h index 986718ffdf5..2ded24808d1 100644 --- a/backends/cuda/runtime/cuda_delegate_handle.h +++ b/backends/cuda/runtime/cuda_delegate_handle.h @@ -27,27 +27,27 @@ struct CudaStreamDeleter { } }; -using SharedCudaStream = std::shared_ptr; - // Creates a new shared CUDA stream. // Returns nullptr on failure. -inline SharedCudaStream create_shared_cuda_stream() { +inline std::shared_ptr create_cuda_stream() { cudaStream_t stream; cudaError_t err = cudaStreamCreate(&stream); if (err != cudaSuccess) { return nullptr; } - return SharedCudaStream(new cudaStream_t(stream), CudaStreamDeleter()); + return std::shared_ptr( + new cudaStream_t(stream), CudaStreamDeleter()); } // CUDA-specific delegate handle that extends AOTIDelegateHandle. // This consolidates CUDA stream management into a single location. struct CudaDelegateHandle : public aoti::AOTIDelegateHandle { - // Shared CUDA stream for this handle. - // When multiple handles share the same stream (e.g., for skip-copy - // optimization), they will all hold a reference to the same shared_ptr. - // The stream is automatically destroyed when the last handle is destroyed. - SharedCudaStream cuda_stream; + // CUDA stream for this handle, support both shared mode and single mode. + // In shared mode, all cuda delegate handles share the same stream (e.g., for + // skip-copy optimization), they will all hold a reference to the same + // shared_ptr. The stream is automatically destroyed when the last handle is + // destroyed. In single mode, every cuda delegate handle has its own stream. + std::shared_ptr cuda_stream; // Get the raw CUDA stream pointer for use in CUDA API calls. // Returns nullptr if no stream is set. From 7d06ab5384a5ea4fcdf6b6f77d7cbf8b2635228c Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 11 Feb 2026 11:04:27 -0800 Subject: [PATCH 16/16] rebase to latest main --- backends/cuda/runtime/cuda_backend.cpp | 27 ++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index fc21326b7f5..e5d8c803efe 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -419,25 +419,28 @@ class ET_EXPERIMENTAL CudaBackend final SlimTensor* cached_tensor = find_cached_tensor_by_data_ptr(data_ptr); if (cached_tensor != nullptr) { // Data is already on GPU from a previous method's output. - // Wrap it directly without copy using from_blob. + // Use it directly without copy using from_blob and input etensor + // metadata. We do not direclty used cached_tensor here as gpu_input[i] + // because although the underlying data is the same, the shape and + // strides may be different between the cached tensor and the current + // input tensor. 
         auto sizes = cpu_tensor->sizes();
         auto strides = cpu_tensor->strides();
         std::vector sizes_vec(sizes.begin(), sizes.end());
         std::vector strides_vec(strides.begin(), strides.end());
         gpu_inputs[i] = new SlimTensor(slim::from_blob(
-            const_cast(data_ptr),
-            slim::makeArrayRef(sizes_vec),
-            slim::makeArrayRef(strides_vec),
-            static_cast(cpu_tensor->scalar_type()),
-            DEFAULT_CUDA_DEVICE,
-            0 // storage_offset
-            ));
-
-        continue;
-      }
+            const_cast(data_ptr),
+            slim::makeArrayRef(sizes_vec),
+            slim::makeArrayRef(strides_vec),
+            static_cast(cpu_tensor->scalar_type()),
+            DEFAULT_CUDA_DEVICE,
+            0 // storage_offset
+            ));
+
+        continue;
       }
-      // Data is on CPU - use from_etensor to copy to GPU
+      // Data is not cached -- it must be on CPU; use from_etensor to copy to GPU
       gpu_inputs[i] = new SlimTensor(
           from_etensor(*cpu_tensor, CPU_DEVICE, DEFAULT_CUDA_DEVICE));
     }
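
Note on the stream-lifetime model introduced in the CUDA patches above: the
backend and every delegate handle hold a std::shared_ptr to the cudaStream_t,
so the stream is destroyed exactly once, when the last owner releases it. A
standalone sketch of that ownership scheme (plain CUDA runtime plus the
standard library; the names below are local to the sketch, not ExecuTorch
APIs):

    #include <cuda_runtime.h>
    #include <memory>

    using SharedStream = std::shared_ptr<cudaStream_t>;

    // Mirrors create_cuda_stream()/CudaStreamDeleter above: wrap the raw
    // stream in a shared_ptr whose deleter destroys it.
    SharedStream make_stream() {
      cudaStream_t s;
      if (cudaStreamCreate(&s) != cudaSuccess) {
        return nullptr;
      }
      return SharedStream(new cudaStream_t(s), [](cudaStream_t* p) {
        if (p != nullptr && *p != nullptr) {
          cudaStreamDestroy(*p);
        }
        delete p;
      });
    }

    int main() {
      SharedStream shared = make_stream(); // backend's shared stream
      SharedStream encoder = shared;       // each handle keeps a reference
      SharedStream decoder = shared;
      shared.reset(); // backend drops its reference...
      // ...but the stream stays alive until encoder/decoder are destroyed.
      return (encoder && *encoder == *decoder) ? 0 : 1;
    }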