pytorch · SS-JIA · Mar 19, 2026 · Mar 19, 2026 · Mar 20, 2026 · Mar 27, 2026
diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
@@ -14,7 +14,7 @@ Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir] [m
-Export a HuggingFace model to CUDA/Metal/XNNPACK format with optional quantization.
+Export a HuggingFace model to CUDA/Metal/Vulkan/XNNPACK format with optional quantization.
 
 Arguments:
-  device       cuda, metal, or xnnpack (required)
+  device       cuda, metal, vulkan, or xnnpack (required)
 
   hf_model     HuggingFace model ID (required)
                Supported models:
@@ -49,6 +49,7 @@ Examples:
   export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
   export_model_artifact.sh cuda-windows "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./output"
   export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
+  export_model_artifact.sh vulkan "nvidia/parakeet-tdt" "non-quantized" "./output"
   export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
   export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
   export_model_artifact.sh xnnpack "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-8da4w" "./output"
@@ -103,9 +104,11 @@ case "$DEVICE" in
     ;;
   xnnpack)
     ;;
+  vulkan)
+    ;;
   *)
     echo "Error: Unsupported device '$DEVICE'"
-    echo "Supported devices: cuda, cuda-windows, metal, xnnpack"
+    echo "Supported devices: cuda, cuda-windows, metal, vulkan, xnnpack"
     exit 1
     ;;
 esac
@@ -226,8 +229,8 @@ case "$QUANT_NAME" in
     EXTRA_ARGS="--qlinear fpa4w --qlinear_encoder fpa4w"
     ;;
   quantized-8da4w)
-    if [ "$DEVICE" != "xnnpack" ]; then
-      echo "Error: quantized-8da4w is only supported with xnnpack device"
+    if [ "$DEVICE" != "xnnpack" ] && [ "$DEVICE" != "vulkan" ]; then
+      echo "Error: quantized-8da4w is only supported with xnnpack or vulkan device"
       exit 1
     fi
     EXTRA_ARGS="--qlinear 8da4w --qlinear_group_size 32 --qlinear_encoder 8da4w --qlinear_encoder_group_size 32"
@@ -250,9 +253,11 @@ pip list
 if [ "$MODEL_NAME" = "parakeet" ]; then
   pip install -r examples/models/parakeet/install_requirements.txt
 
-  # Set dtype based on backend (XNNPACK uses fp32, CUDA/Metal use bf16)
+  # Set dtype based on backend (XNNPACK uses fp32, Vulkan uses fp16, CUDA/Metal use bf16)
   if [ "$DEVICE" = "xnnpack" ]; then
     DTYPE_ARG=""
+  elif [ "$DEVICE" = "vulkan" ]; then
+    DTYPE_ARG="--vulkan_force_fp16"
   else
     DTYPE_ARG="--dtype bf16"
   fi

diff --git a/.ci/scripts/setup-vulkan-linux-deps.sh b/.ci/scripts/setup-vulkan-linux-deps.sh
@@ -41,9 +41,21 @@ install_vulkan_sdk() {
   tar -C "${_vulkan_sdk_dir}" -xJf "${_tmp_archive}"
 
   export PATH="${PATH}:${_vulkan_sdk_dir}/${VULKAN_SDK_VERSION}/x86_64/bin/"
+  # Keep the old value only when non-empty: a leading ':' is an empty entry, which ld.so treats as the CWD.
+  export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${_vulkan_sdk_dir}/${VULKAN_SDK_VERSION}/x86_64/lib/"
 }
 
 VULKAN_SDK_VERSION="1.4.321.1"
 
-install_swiftshader
+# Parse arguments: --gpu skips SwiftShader (use NVIDIA driver's Vulkan ICD instead)
+USE_GPU=false
+for arg in "$@"; do
+  case $arg in
+    --gpu) USE_GPU=true ;;
+  esac
+done
+
+if [ "$USE_GPU" = false ]; then
+  install_swiftshader
+fi
 install_vulkan_sdk "${VULKAN_SDK_VERSION}"
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
@@ -14,7 +14,7 @@ Usage: test_model_e2e.sh <device> <hf_model> <quant_name> [model_dir] [mode]
-Build and run end-to-end tests for CUDA/Metal/XNNPACK models.
+Build and run end-to-end tests for CUDA/Metal/Vulkan/XNNPACK models.
 
 Arguments:
-  device      cuda, metal, or xnnpack (required)
+  device      cuda, metal, vulkan, or xnnpack (required)
 
   hf_model    HuggingFace model ID (required)
               Supported models:
@@ -47,6 +47,7 @@ Examples:
   test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
   test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
   test_model_e2e.sh cuda "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./model_output"
+  test_model_e2e.sh vulkan "nvidia/parakeet-tdt" "non-quantized" "./model_output"
   test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
   test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
   test_model_e2e.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
@@ -274,8 +275,8 @@ echo "::endgroup::"
 
 echo "::group::Build $MODEL_NAME Runner"
 
-if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ] && [ "$DEVICE" != "xnnpack" ]; then
-  echo "Error: Unsupported device '$DEVICE'. Must be 'cuda', 'metal', or 'xnnpack'."
+if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ] && [ "$DEVICE" != "vulkan" ] && [ "$DEVICE" != "xnnpack" ]; then
+  echo "Error: Unsupported device '$DEVICE'. Must be 'cuda', 'metal', 'vulkan', or 'xnnpack'."
   exit 1
 fi
 

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -1339,6 +1339,46 @@ jobs:
         python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*pt2e*"
         python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*torchao*"
 
+  test-vulkan-genai:
+    name: test-vulkan-genai
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      docker-image: ci-image:executorch-ubuntu-22.04-clang12
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Setup Vulkan SDK (no SwiftShader — use NVIDIA driver's Vulkan ICD)
+        source .ci/scripts/setup-vulkan-linux-deps.sh --gpu
+
+        # Install ExecuTorch Python package (for export step).
+        ./install_executorch.sh
+
+        # Export parakeet with Vulkan backend
+        bash .ci/scripts/export_model_artifact.sh vulkan "nvidia/parakeet-tdt" "quantized-8da4w" "${RUNNER_ARTIFACT_DIR}"
+
+        # Clean cmake-out/ — install_executorch.sh creates a CMake cache with
+        # most extensions OFF, and set_overridable_option in the Makefile
+        # presets cannot override cached variables. Without this, the
+        # llm-debug-vulkan preset fails to enable EXECUTORCH_BUILD_EXTENSION_LLM
+        # and the tokenizers header is never installed.
+        rm -rf cmake-out
+
+        # Build runner and test e2e (make parakeet-vulkan handles the full C++ build)
+        bash .ci/scripts/test_model_e2e.sh vulkan "nvidia/parakeet-tdt" "quantized-8da4w" "${RUNNER_ARTIFACT_DIR}"
+
   test-coreml-bc-macos:
     name: test-coreml-bc-macos (${{ matrix.runner }})
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main

@@ -40,11 +40,21 @@ if(TARGET optimized_native_cpu_ops_lib)
   endif()
 endif()
 
-# CPU-only builds need quantized and custom ops
-if(NOT EXECUTORCH_BUILD_CUDA)
-  list(APPEND link_libraries quantized_ops_lib custom_ops)
-  executorch_target_link_options_shared_lib(quantized_ops_lib)
-  executorch_target_link_options_shared_lib(custom_ops)
+# Quantized/custom ops: optional; shared-lib link options only if not IMPORTED
+if(TARGET quantized_ops_lib)
+  list(APPEND link_libraries quantized_ops_lib)
+  get_target_property(_is_imported quantized_ops_lib IMPORTED)
+  if(NOT _is_imported)
+    executorch_target_link_options_shared_lib(quantized_ops_lib)
+  endif()
+endif()
+
+if(TARGET custom_ops)
+  list(APPEND link_libraries custom_ops)
+  get_target_property(_is_imported custom_ops IMPORTED)
+  if(NOT _is_imported)
+    executorch_target_link_options_shared_lib(custom_ops)
+  endif()
 endif()
 
 # XNNPACK
@@ -74,9 +84,12 @@ list(
   extension_data_loader
   extension_tensor
   extension_flat_tensor
-  tokenizers::tokenizers
 )
 
+if(TARGET tokenizers::tokenizers)
+  list(APPEND link_libraries tokenizers::tokenizers)
+endif()
+
 # Link CUDA backend
 if(EXECUTORCH_BUILD_CUDA)
   find_package(CUDAToolkit REQUIRED)