From 89f02d9d7059dacfe6ccd565ea81654c49828b80 Mon Sep 17 00:00:00 2001 From: Songhao Jia Date: Thu, 12 Feb 2026 12:35:35 -0800 Subject: [PATCH] add cuda backend to backend test infra (#17403) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/17403 Differential Revision: D93019490 --- .ci/scripts/test_backend.sh | 9 +++ .github/workflows/test-backend-cuda.yml | 30 ++++++++++ backends/cuda/test/tester.py | 71 +++++++++++++++++++++++ backends/test/harness/stages/serialize.py | 28 ++++++++- backends/test/suite/flow.py | 9 +++ backends/test/suite/flows/cuda.py | 26 +++++++++ 6 files changed, 171 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/test-backend-cuda.yml create mode 100644 backends/cuda/test/tester.py create mode 100644 backends/test/suite/flows/cuda.py diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh index 1a8e3219be0..37dd8a70859 100755 --- a/.ci/scripts/test_backend.sh +++ b/.ci/scripts/test_backend.sh @@ -56,6 +56,15 @@ if [[ "$FLOW" == *vulkan* ]]; then EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON" fi +if [[ "$FLOW" == *cuda* ]]; then + # Fix libstdc++ GLIBCXX version for CUDA backend. + # The embedded .so files in the CUDA blob require GLIBCXX_3.4.30 + # which the default conda libstdc++ doesn't have. + echo "Installing newer libstdc++ for CUDA backend..." + conda install -y -c conda-forge 'libstdcxx-ng>=12' + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH:-}" +fi + if [[ "$FLOW" == *arm* ]]; then # Setup ARM deps. diff --git a/.github/workflows/test-backend-cuda.yml b/.github/workflows/test-backend-cuda.yml new file mode 100644 index 00000000000..ac5c9b97c43 --- /dev/null +++ b/.github/workflows/test-backend-cuda.yml @@ -0,0 +1,30 @@ +name: Test CUDA Backend + +on: + schedule: + - cron: 0 2 * * * + push: + branches: + - release/* + tags: + - ciflow/nightly/* + pull_request: + paths: + - .github/workflows/test-backend-cuda.yml + - .github/workflows/_test_backend.yml + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + test-cuda: + uses: ./.github/workflows/_test_backend.yml + with: + backend: cuda + flows: '["cuda"]' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 120 + run-linux: true + runner-linux: linux.g5.4xlarge.nvidia.gpu diff --git a/backends/cuda/test/tester.py b/backends/cuda/test/tester.py new file mode 100644 index 00000000000..e4ac2b366d4 --- /dev/null +++ b/backends/cuda/test/tester.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Any, List, Optional, Tuple + +import executorch +import executorch.backends.test.harness.stages as BaseStages +import torch +from executorch.backends.cuda.cuda_backend import CudaBackend +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner +from executorch.backends.test.harness import Tester as TesterBase +from executorch.backends.test.harness.stages import StageType +from executorch.exir import EdgeCompileConfig +from executorch.exir.backend.partitioner import Partitioner + + +def _create_default_partitioner() -> CudaPartitioner: + """Create a CudaPartitioner with default compile specs.""" + compile_specs = [CudaBackend.generate_method_name_compile_spec("forward")] + return CudaPartitioner(compile_specs) + + +class ToEdgeTransformAndLower(BaseStages.ToEdgeTransformAndLower): + """CUDA-specific ToEdgeTransformAndLower stage.""" + + def __init__( + self, + partitioners: Optional[List[Partitioner]] = None, + edge_compile_config: Optional[EdgeCompileConfig] = None, + ): + if partitioners is None: + partitioners = [_create_default_partitioner()] + + super().__init__( + default_partitioner_cls=_create_default_partitioner, + partitioners=partitioners, + edge_compile_config=edge_compile_config + or EdgeCompileConfig(_check_ir_validity=False), + ) + + +class CudaTester(TesterBase): + """ + Tester subclass for CUDA backend. + + This tester defines the recipe for lowering models to the CUDA backend + using AOTInductor compilation. + """ + + def __init__( + self, + module: torch.nn.Module, + example_inputs: Tuple[torch.Tensor], + dynamic_shapes: Optional[Tuple[Any]] = None, + ): + stage_classes = ( + executorch.backends.test.harness.Tester.default_stage_classes() + | { + StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower, + } + ) + + super().__init__( + module=module, + stage_classes=stage_classes, + example_inputs=example_inputs, + dynamic_shapes=dynamic_shapes, + ) diff --git a/backends/test/harness/stages/serialize.py b/backends/test/harness/stages/serialize.py index a5be1631d98..e30da5c34c8 100644 --- a/backends/test/harness/stages/serialize.py +++ b/backends/test/harness/stages/serialize.py @@ -1,7 +1,7 @@ import copy import logging -from typing import Optional +from typing import Dict, Optional from executorch.backends.test.harness.stages.stage import Stage, StageType from executorch.exir import ExecutorchProgramManager @@ -23,12 +23,15 @@ class Serialize(Stage): def __init__(self): self.buffer = None + self.data_files: Dict[str, bytes] = {} def stage_type(self) -> StageType: return StageType.SERIALIZE def run(self, artifact: ExecutorchProgramManager, inputs=None) -> None: self.buffer = artifact.buffer + # Capture external data files (e.g., .ptd files for CUDA backend) + self.data_files = artifact.data_files @property def artifact(self) -> bytes: @@ -40,8 +43,29 @@ def graph_module(self) -> None: def run_artifact(self, inputs): inputs_flattened, _ = tree_flatten(inputs) + + # Combine all external data files into a single buffer for data_map_buffer + # Most backends have at most one external data file, but we concatenate + # in case there are multiple (though this may not be fully supported) + data_map_buffer = None + if self.data_files: + # If there's exactly one data file, use it directly + # Otherwise, log a warning - multiple external files may need special handling + if len(self.data_files) == 1: + data_map_buffer = list(self.data_files.values())[0] + else: + # For multiple files, we use the first one and warn + # This is a limitation - 
proper handling would need runtime support + logger.warning( + f"Multiple external data files found ({list(self.data_files.keys())}). " + f"Using the first one. This may not work correctly for all backends." + ) + data_map_buffer = list(self.data_files.values())[0] + executorch_module = _load_for_executorch_from_buffer( - self.buffer, program_verification=Verification.Minimal + self.buffer, + data_map_buffer=data_map_buffer, + program_verification=Verification.Minimal, ) executorch_output = copy.deepcopy( executorch_module.run_method("forward", tuple(inputs_flattened)) diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py index f3c9ee75083..2209a0ce11f 100644 --- a/backends/test/suite/flow.py +++ b/backends/test/suite/flow.py @@ -147,4 +147,13 @@ def all_flows() -> dict[str, TestFlow]: except Exception as e: logger.info(f"Skipping ARM flow registration: {e}") + try: + from executorch.backends.test.suite.flows.cuda import CUDA_TEST_FLOW + + flows += [ + CUDA_TEST_FLOW, + ] + except Exception as e: + logger.info(f"Skipping CUDA flow registration: {e}") + return {f.name: f for f in flows if f is not None} diff --git a/backends/test/suite/flows/cuda.py b/backends/test/suite/flows/cuda.py new file mode 100644 index 00000000000..0c9c1d8bd63 --- /dev/null +++ b/backends/test/suite/flows/cuda.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.backends.cuda.test.tester import CudaTester +from executorch.backends.test.suite.flow import TestFlow + + +def _create_cuda_flow(name: str = "cuda") -> TestFlow: + """Create a test flow for the CUDA backend. + + The CUDA backend saves data externally (.so and weights blob in .ptd file). + The test harness serialize stage has been updated to support loading external + data via the data_map_buffer parameter of _load_for_executorch_from_buffer. + """ + return TestFlow( + name, + backend="cuda", + tester_factory=CudaTester, + quantize=False, + ) + + +CUDA_TEST_FLOW = _create_cuda_flow("cuda")
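
---
Usage sketch (not part of the patch): a minimal example of how the new CudaTester is expected to be driven. It assumes a CUDA-capable machine and that the shared harness Tester exposes the same chained stage methods the other backend testers use (export, to_edge_transform_and_lower, to_executorch, serialize, run_method_and_compare_outputs); the SmallAdd module and tensor shapes below are illustrative only.

    # Illustrative only; assumes a CUDA-capable runner and the standard
    # harness stage pipeline. SmallAdd is a hypothetical test module.
    import torch

    from executorch.backends.cuda.test.tester import CudaTester


    class SmallAdd(torch.nn.Module):
        def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
            return torch.nn.functional.relu(x + y)


    example_inputs = (torch.randn(2, 8), torch.randn(2, 8))

    (
        CudaTester(SmallAdd(), example_inputs)
        .export()                          # torch.export the eager module
        .to_edge_transform_and_lower()     # CUDA partitioner + AOTInductor lowering (stage added above)
        .to_executorch()
        .serialize()                       # captures .pte buffer and external data_files (.ptd)
        .run_method_and_compare_outputs()  # loads via data_map_buffer, compares against eager output
    )

The serialize/run path above is what the serialize.py changes in this patch enable: the external weights blob produced by the CUDA backend is captured from artifact.data_files and passed to _load_for_executorch_from_buffer as data_map_buffer.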