Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions .ci/scripts/export_model_artifact.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir] [m
Export a HuggingFace model to CUDA/Metal/XNNPACK format with optional quantization.

Arguments:
device cuda, metal, or xnnpack (required)
device cuda, metal, vulkan, or xnnpack (required)

hf_model HuggingFace model ID (required)
Supported models:
Expand Down Expand Up @@ -49,6 +49,7 @@ Examples:
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
export_model_artifact.sh cuda-windows "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./output"
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
export_model_artifact.sh vulkan "nvidia/parakeet-tdt" "non-quantized" "./output"
export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
export_model_artifact.sh xnnpack "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-8da4w" "./output"
Expand Down Expand Up @@ -103,9 +104,11 @@ case "$DEVICE" in
;;
xnnpack)
;;
vulkan)
;;
*)
echo "Error: Unsupported device '$DEVICE'"
echo "Supported devices: cuda, cuda-windows, metal, xnnpack"
echo "Supported devices: cuda, cuda-windows, metal, vulkan, xnnpack"
exit 1
;;
esac
Expand Down Expand Up @@ -226,8 +229,8 @@ case "$QUANT_NAME" in
EXTRA_ARGS="--qlinear fpa4w --qlinear_encoder fpa4w"
;;
quantized-8da4w)
if [ "$DEVICE" != "xnnpack" ]; then
echo "Error: quantized-8da4w is only supported with xnnpack device"
if [ "$DEVICE" != "xnnpack" ] && [ "$DEVICE" != "vulkan" ]; then
echo "Error: quantized-8da4w is only supported with xnnpack or vulkan device"
exit 1
fi
EXTRA_ARGS="--qlinear 8da4w --qlinear_group_size 32 --qlinear_encoder 8da4w --qlinear_encoder_group_size 32"
Expand All @@ -250,9 +253,11 @@ pip list
if [ "$MODEL_NAME" = "parakeet" ]; then
pip install -r examples/models/parakeet/install_requirements.txt

# Set dtype based on backend (XNNPACK uses fp32, CUDA/Metal use bf16)
# Set dtype based on backend (XNNPACK uses fp32, Vulkan uses fp16, CUDA/Metal use bf16)
if [ "$DEVICE" = "xnnpack" ]; then
DTYPE_ARG=""
elif [ "$DEVICE" = "vulkan" ]; then
DTYPE_ARG="--vulkan_force_fp16"
else
DTYPE_ARG="--dtype bf16"
fi
Expand Down
13 changes: 12 additions & 1 deletion .ci/scripts/setup-vulkan-linux-deps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,20 @@ install_vulkan_sdk() {
tar -C "${_vulkan_sdk_dir}" -xJf "${_tmp_archive}"

export PATH="${PATH}:${_vulkan_sdk_dir}/${VULKAN_SDK_VERSION}/x86_64/bin/"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:${_vulkan_sdk_dir}/${VULKAN_SDK_VERSION}/x86_64/lib/"
}

VULKAN_SDK_VERSION="1.4.321.1"

install_swiftshader
# Parse arguments: --gpu skips SwiftShader (use NVIDIA driver's Vulkan ICD instead)
USE_GPU=false
for arg in "$@"; do
case $arg in
--gpu) USE_GPU=true ;;
esac
done

if [ "$USE_GPU" = false ]; then
install_swiftshader
fi
install_vulkan_sdk "${VULKAN_SDK_VERSION}"
7 changes: 4 additions & 3 deletions .ci/scripts/test_model_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Usage: test_model_e2e.sh <device> <hf_model> <quant_name> [model_dir] [mode]
Build and run end-to-end tests for CUDA/Metal/XNNPACK models.

Arguments:
device cuda, metal, or xnnpack (required)
device cuda, metal, vulkan, or xnnpack (required)

hf_model HuggingFace model ID (required)
Supported models:
Expand Down Expand Up @@ -47,6 +47,7 @@ Examples:
test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
test_model_e2e.sh cuda "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./model_output"
test_model_e2e.sh vulkan "nvidia/parakeet-tdt" "non-quantized" "./model_output"
test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
test_model_e2e.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
Expand Down Expand Up @@ -274,8 +275,8 @@ echo "::endgroup::"

echo "::group::Build $MODEL_NAME Runner"

if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ] && [ "$DEVICE" != "xnnpack" ]; then
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda', 'metal', or 'xnnpack'."
if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ] && [ "$DEVICE" != "vulkan" ] && [ "$DEVICE" != "xnnpack" ]; then
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda', 'metal', 'vulkan', or 'xnnpack'."
exit 1
fi

Expand Down
40 changes: 40 additions & 0 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1339,6 +1339,46 @@ jobs:
python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*pt2e*"
python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*torchao*"

test-vulkan-genai:
name: test-vulkan-genai
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
runner: linux.g5.4xlarge.nvidia.gpu
docker-image: ci-image:executorch-ubuntu-22.04-clang12
gpu-arch-type: cuda
gpu-arch-version: "12.6"
submodules: 'recursive'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 90
script: |
set -eux

# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"

# Setup Vulkan SDK (no SwiftShader — use NVIDIA driver's Vulkan ICD)
source .ci/scripts/setup-vulkan-linux-deps.sh --gpu

# Install ExecuTorch Python package (for export step).
./install_executorch.sh

# Export parakeet with Vulkan backend
bash .ci/scripts/export_model_artifact.sh vulkan "nvidia/parakeet-tdt" "quantized-8da4w" "${RUNNER_ARTIFACT_DIR}"

# Clean cmake-out/ — install_executorch.sh creates a CMake cache with
# most extensions OFF, and set_overridable_option in the Makefile
# presets cannot override cached variables. Without this, the
# llm-debug-vulkan preset fails to enable EXECUTORCH_BUILD_EXTENSION_LLM
# and the tokenizers header is never installed.
rm -rf cmake-out

# Build runner and test e2e (make parakeet-vulkan handles the full C++ build)
bash .ci/scripts/test_model_e2e.sh vulkan "nvidia/parakeet-tdt" "quantized-8da4w" "${RUNNER_ARTIFACT_DIR}"

test-coreml-bc-macos:
name: test-coreml-bc-macos (${{ matrix.runner }})
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
Expand Down
25 changes: 19 additions & 6 deletions examples/models/parakeet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,21 @@ if(TARGET optimized_native_cpu_ops_lib)
endif()
endif()

# CPU-only builds need quantized and custom ops
if(NOT EXECUTORCH_BUILD_CUDA)
list(APPEND link_libraries quantized_ops_lib custom_ops)
executorch_target_link_options_shared_lib(quantized_ops_lib)
executorch_target_link_options_shared_lib(custom_ops)
# Quantized and custom ops
if(TARGET quantized_ops_lib)
list(APPEND link_libraries quantized_ops_lib)
get_target_property(_is_imported quantized_ops_lib IMPORTED)
if(NOT _is_imported)
executorch_target_link_options_shared_lib(quantized_ops_lib)
endif()
endif()

if(TARGET custom_ops)
list(APPEND link_libraries custom_ops)
get_target_property(_is_imported custom_ops IMPORTED)
if(NOT _is_imported)
executorch_target_link_options_shared_lib(custom_ops)
endif()
endif()

# XNNPACK
Expand Down Expand Up @@ -74,9 +84,12 @@ list(
extension_data_loader
extension_tensor
extension_flat_tensor
tokenizers::tokenizers
)

if(TARGET tokenizers::tokenizers)
list(APPEND link_libraries tokenizers::tokenizers)
endif()

# Link CUDA backend
if(EXECUTORCH_BUILD_CUDA)
find_package(CUDAToolkit REQUIRED)
Expand Down
Loading