diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 0a4c6a6f757..123680e5275 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -81,7 +81,7 @@ case "${IMAGE_NAME}" in LINTRUNNER="" GCC_VERSION=11 CUDA_WINDOWS_CROSS_COMPILE=yes - CUDA_VERSION=12.8 + CUDA_VERSION=13.0 SKIP_PYTORCH=yes ;; executorch-ubuntu-24.04-gcc14) @@ -97,6 +97,10 @@ esac TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt) BUILD_DOCS=1 +if [[ "${GCC_VERSION:-}" == "11" && -z "${SKIP_PYTORCH:-}" ]]; then + PYTORCH_BUILD_MAX_JOBS=6 +fi + # Copy requirements-lintrunner.txt from root to here cp ../../requirements-lintrunner.txt ./ @@ -109,6 +113,7 @@ docker build \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \ --build-arg "TORCH_VERSION=${TORCH_VERSION}" \ + --build-arg "PYTORCH_BUILD_MAX_JOBS=${PYTORCH_BUILD_MAX_JOBS:-}" \ --build-arg "BUCK2_VERSION=${BUCK2_VERSION}" \ --build-arg "LINTRUNNER=${LINTRUNNER:-}" \ --build-arg "BUILD_DOCS=${BUILD_DOCS}" \ diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index f6e39a63b92..242371cbebe 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -release/2.11 \ No newline at end of file +release/2.12 diff --git a/.ci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh index 7b7d39994ca..82be8697320 100755 --- a/.ci/docker/common/install_cache.sh +++ b/.ci/docker/common/install_cache.sh @@ -76,6 +76,9 @@ init_sccache() { # This is the remote cache bucket export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2 export SCCACHE_S3_KEY_PREFIX=executorch + export SCCACHE_REGION=us-east-1 + export AWS_REGION=us-east-1 + export AWS_DEFAULT_REGION=us-east-1 export SCCACHE_IDLE_TIMEOUT=0 export SCCACHE_ERROR_LOG=/tmp/sccache_error.log export RUST_LOG=sccache::server=error diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh index 4300dd62d7a..9794e146407 100644 --- a/.ci/docker/common/install_cuda.sh +++ b/.ci/docker/common/install_cuda.sh @@ -10,10 +10,10 @@ set -ex -# CUDA version must be specified (e.g., 12.8) +# CUDA version must be specified (e.g., 13.0) CUDA_VERSION="${CUDA_VERSION:?CUDA_VERSION must be set}" -# Convert version format (e.g., 12.8 -> 12-8 for package names) +# Convert version format (e.g., 13.0 -> 13-0 for package names) CUDA_VERSION_DASH=$(echo "${CUDA_VERSION}" | tr '.' '-') # Add NVIDIA package repository diff --git a/.ci/docker/common/install_cuda_windows_cross_compile.sh b/.ci/docker/common/install_cuda_windows_cross_compile.sh index e3529751221..b5d92c8215d 100644 --- a/.ci/docker/common/install_cuda_windows_cross_compile.sh +++ b/.ci/docker/common/install_cuda_windows_cross_compile.sh @@ -17,6 +17,7 @@ declare -A CUDA_DRIVER_MAP=( ["12.6"]="12.6.3:561.17" ["12.8"]="12.8.1:572.61" ["12.9"]="12.9.1:576.57" + ["13.0"]="13.0.2:" ) install_mingw() { @@ -76,19 +77,26 @@ install_windows_cuda() { CUDA_VERSION=$(echo "${CUDA_INFO}" | cut -d: -f1) CUDA_DRIVER_VERSION=$(echo "${CUDA_INFO}" | cut -d: -f2) - echo "Using CUDA ${CUDA_VERSION} with driver ${CUDA_DRIVER_VERSION}" + if [ -n "${CUDA_DRIVER_VERSION}" ]; then + echo "Using CUDA ${CUDA_VERSION} with driver ${CUDA_DRIVER_VERSION}" + CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe" + else + echo "Using CUDA ${CUDA_VERSION}" + CUDA_INSTALLER="cuda_${CUDA_VERSION}_windows.exe" + fi echo "Installing Windows CUDA toolkit ${CUDA_VERSION}..." mkdir -p "${INSTALL_DIR}" cd "${INSTALL_DIR}" - CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe" CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_INSTALLER}" # Check if already downloaded and extracted if [ -d "${INSTALL_DIR}/extracted/cuda_cudart" ]; then echo "Windows CUDA toolkit already installed, skipping download..." + chmod -R a+rX "${INSTALL_DIR}" + chmod -R a+rwX "${INSTALL_DIR}/extracted/cuda_cudart/cudart/lib" return 0 fi @@ -98,8 +106,11 @@ install_windows_cuda() { echo "Extracting CUDA toolkit..." 7z x "${CUDA_INSTALLER}" -o"extracted" -y - # Fix permissions so ci-user can access the files + # Fix permissions so ci-user can access the files. PyTorch Inductor also + # needs to write a MinGW import library beside cudart.lib during Windows + # cross-compilation. chmod -R a+rX "${INSTALL_DIR}" + chmod -R a+rwX "${INSTALL_DIR}/extracted/cuda_cudart/cudart/lib" # Clean up installer to save space rm -f "${CUDA_INSTALLER}" diff --git a/.ci/docker/common/install_pytorch.sh b/.ci/docker/common/install_pytorch.sh index 548a24f885d..3c80d093ab2 100755 --- a/.ci/docker/common/install_pytorch.sh +++ b/.ci/docker/common/install_pytorch.sh @@ -27,6 +27,12 @@ install_pytorch_and_domains() { chown -R ci-user . export _GLIBCXX_USE_CXX11_ABI=1 + if [[ "$(uname -m)" == "aarch64" ]]; then + export BUILD_IGNORE_SVE_UNAVAILABLE=1 + fi + if [[ -n "${PYTORCH_BUILD_MAX_JOBS:-}" ]]; then + export MAX_JOBS="${PYTORCH_BUILD_MAX_JOBS}" + fi # Then build and install PyTorch conda_run python setup.py bdist_wheel pip_install "$(echo dist/*.whl)" @@ -34,7 +40,7 @@ install_pytorch_and_domains() { # Grab the pinned audio and vision commits from PyTorch TORCHAUDIO_VERSION=release/2.11 export TORCHAUDIO_VERSION - TORCHVISION_VERSION=release/0.26 + TORCHVISION_VERSION=release/0.27 export TORCHVISION_VERSION install_domains diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 0e2d7e48eb9..9a5b2536df0 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -62,9 +62,12 @@ RUN bash ./install_cache.sh && rm install_cache.sh utils.sh ENV SCCACHE_BUCKET ossci-compiler-cache-circleci-v2 ENV SCCACHE_S3_KEY_PREFIX executorch ENV SCCACHE_REGION us-east-1 +ENV AWS_REGION us-east-1 +ENV AWS_DEFAULT_REGION us-east-1 ARG TORCH_VERSION ARG SKIP_PYTORCH +ARG PYTORCH_BUILD_MAX_JOBS COPY ./common/install_pytorch.sh install_pytorch.sh COPY ./common/utils.sh utils.sh RUN if [ -z "${SKIP_PYTORCH}" ]; then bash ./install_pytorch.sh; fi && rm install_pytorch.sh utils.sh diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh index 9adea394993..db447bb907f 100755 --- a/.ci/scripts/export_model_artifact.sh +++ b/.ci/scripts/export_model_artifact.sh @@ -518,7 +518,7 @@ fi DEVICE_ARG="" if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then - DEVICE_ARG="--device cuda" + DEVICE_ARG="--device cuda:0" elif [ "$DEVICE" = "metal" ]; then DEVICE_ARG="--device mps" fi diff --git a/.ci/scripts/test-cuda-build.sh b/.ci/scripts/test-cuda-build.sh index 08673533927..e717718be66 100755 --- a/.ci/scripts/test-cuda-build.sh +++ b/.ci/scripts/test-cuda-build.sh @@ -7,7 +7,7 @@ set -exu -CUDA_VERSION=${1:-"12.6"} +CUDA_VERSION=${1:-"13.0"} echo "=== Testing ExecuTorch CUDA ${CUDA_VERSION} Build ===" diff --git a/.ci/scripts/test_model_e2e_windows.ps1 b/.ci/scripts/test_model_e2e_windows.ps1 index 26bd2ef3c93..3490b6e071a 100644 --- a/.ci/scripts/test_model_e2e_windows.ps1 +++ b/.ci/scripts/test_model_e2e_windows.ps1 @@ -159,18 +159,31 @@ try { } Write-Host "CUDA version check passed: $actualCudaVersion" } + $cmakeCudaArgs = @() + if (-not [string]::IsNullOrWhiteSpace($env:CUDA_HOME)) { + $cudaNvcc = Join-Path -Path $env:CUDA_HOME -ChildPath "bin\nvcc.exe" + if (-not (Test-Path -Path $cudaNvcc -PathType Leaf)) { + throw "CUDA compiler not found at '$cudaNvcc'" + } + $env:CUDACXX = $cudaNvcc + $cmakeCudaArgs = @( + "-T", "cuda=$env:CUDA_HOME", + "-DCMAKE_CUDA_COMPILER=$cudaNvcc", + "-DCUDAToolkit_ROOT=$env:CUDA_HOME" + ) + } Write-Host "::endgroup::" Write-Host "::group::Build ExecuTorch (CUDA)" $numCores = [Math]::Max([Environment]::ProcessorCount - 1, 1) - cmake --preset llm-release-cuda + cmake --preset llm-release-cuda @cmakeCudaArgs cmake --build cmake-out --target install --config Release -j $numCores Write-Host "::endgroup::" Write-Host "::group::Build $runnerTarget" Push-Location (Join-Path -Path $executorchRoot -ChildPath "examples\models\$runnerPath") try { - cmake --preset $runnerPreset + cmake --preset $runnerPreset @cmakeCudaArgs cmake --build (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath") --target $runnerTarget --config Release -j $numCores } finally { diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index 486745f4bf6..b312d0ede83 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -107,7 +107,7 @@ install_pytorch_and_domains() { local torch_release=$(cat version.txt) # Download key must match the upload key below (basename of dist/*.whl, # which always carries setup.py's resolved +gitHASH). Branch-ref pins - # like `release/2.11` would otherwise produce `+gitrelease` here and + # like `release/2.12` would otherwise produce `+gitrelease` here and # never hit the cache. local torch_short_hash=$(git rev-parse --short=7 HEAD) local torch_wheel_path="cached_artifacts/pytorch/executorch/pytorch_wheels/${system_name}/${python_version}" @@ -132,6 +132,9 @@ install_pytorch_and_domains() { # (e.g. executorch's requirements-ci.txt). pip install -r requirements-build.txt git submodule update --init --recursive + if [[ "$(uname -m)" == "aarch64" ]]; then + export BUILD_IGNORE_SVE_UNAVAILABLE=1 + fi USE_DISTRIBUTED=1 python setup.py bdist_wheel pip install "$(echo dist/*.whl)" @@ -175,7 +178,7 @@ install_pytorch_and_domains() { # Grab the pinned audio and vision commits from PyTorch TORCHAUDIO_VERSION=release/2.11 export TORCHAUDIO_VERSION - TORCHVISION_VERSION=release/0.26 + TORCHVISION_VERSION=release/0.27 export TORCHVISION_VERSION install_domains diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml index b2302cf91af..ada2fb9e696 100644 --- a/.github/workflows/cuda-perf.yml +++ b/.github/workflows/cuda-perf.yml @@ -110,7 +110,7 @@ jobs: secrets-env: EXECUTORCH_HF_TOKEN runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} gpu-arch-type: cuda - gpu-arch-version: "12.6" + gpu-arch-version: "13.0" use-custom-docker-registry: false submodules: recursive upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }} @@ -124,10 +124,11 @@ jobs: echo "::endgroup::" echo "::group::Setup Huggingface" - pip install -U "huggingface_hub[cli]<1.0" accelerate - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1" + HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')" + hf auth login --token "$HF_AUTH_TOKEN" OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} echo "::endgroup::" echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}" @@ -158,7 +159,7 @@ jobs: timeout: 90 runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} gpu-arch-type: cuda - gpu-arch-version: "12.6" + gpu-arch-version: "13.0" use-custom-docker-registry: false submodules: recursive download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }} diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml index 265b7e3069d..aae27121bd0 100644 --- a/.github/workflows/cuda-windows.yml +++ b/.github/workflows/cuda-windows.yml @@ -64,7 +64,7 @@ jobs: secrets-env: EXECUTORCH_HF_TOKEN runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: 12.8 + gpu-arch-version: "13.0" docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows submodules: recursive upload-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }} @@ -98,10 +98,11 @@ jobs: # Setup Huggingface only for models that need it (not dinov2) if [ "${{ matrix.model_name }}" != "dinov2-small-imagenet1k-1-layer" ]; then echo "::group::Setup Huggingface" - pip install -U "huggingface_hub[cli]<1.0" accelerate - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1" + HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')" + hf auth login --token "$HF_AUTH_TOKEN" OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} echo "::endgroup::" fi @@ -146,7 +147,7 @@ jobs: timeout: 240 runner: windows.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: 12.8 + gpu-arch-version: "13.0" download-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }} ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | @@ -158,7 +159,7 @@ jobs: \$ErrorActionPreference = 'Stop' \$PSNativeCommandUseErrorActionPreference = \$true - \$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8' + \$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0' \$env:CUDA_PATH = \$env:CUDA_HOME \$env:PATH = \"\$env:CUDA_HOME\bin;\$env:PATH\" nvcc --version @@ -169,5 +170,5 @@ jobs: throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.' } - .ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8' + .ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '13.0' }" diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index e1eaba6b7c1..f19b937994f 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -1,6 +1,6 @@ # Test ExecuTorch CUDA Build Compatibility # This workflow tests whether ExecuTorch can be successfully built with CUDA support -# across different CUDA versions (12.6, 12.8, 12.9, 13.0) using the command: +# across different CUDA versions (12.6, 13.0) using the command: # ./install_executorch.sh # # Note: ExecuTorch automatically detects the system CUDA version using nvcc and @@ -31,7 +31,7 @@ jobs: strategy: fail-fast: false matrix: - cuda-version: ["12.6", "12.8", "12.9", "13.0"] + cuda-version: ["12.6", "13.0"] name: test-executorch-cuda-build-${{ matrix.cuda-version }} uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main @@ -66,7 +66,7 @@ jobs: echo "CUDA build results: ${{ needs.test-cuda-builds.result }}" exit 1 else - echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9, 13.0) completed successfully!" + echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 13.0) completed successfully!" fi test-models-cuda: @@ -79,7 +79,7 @@ jobs: timeout: 90 runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: 12.6 + gpu-arch-version: "13.0" use-custom-docker-registry: false submodules: recursive ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -114,7 +114,7 @@ jobs: timeout: 90 runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: 12.6 + gpu-arch-version: "13.0" use-custom-docker-registry: false submodules: recursive ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -271,7 +271,7 @@ jobs: secrets-env: EXECUTORCH_HF_TOKEN runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} gpu-arch-type: cuda - gpu-arch-version: 12.6 + gpu-arch-version: "13.0" use-custom-docker-registry: false submodules: recursive upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }} @@ -288,10 +288,11 @@ jobs: # Setup Huggingface only for models that need it (not parakeet or dinov2) if [ "${{ matrix.model.name }}" != "parakeet-tdt" ] && [ "${{ matrix.model.name }}" != "dinov2-small-imagenet1k-1-layer" ]; then echo "::group::Setup Huggingface" - pip install -U "huggingface_hub[cli]<1.0" accelerate - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1" + HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')" + hf auth login --token "$HF_AUTH_TOKEN" OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} echo "::endgroup::" fi @@ -406,7 +407,7 @@ jobs: timeout: 90 runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} gpu-arch-type: cuda - gpu-arch-version: 12.6 + gpu-arch-version: "13.0" use-custom-docker-registry: false submodules: recursive download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }} @@ -442,7 +443,7 @@ jobs: download-artifact: ${{ matrix.artifact }} runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: 12.6 + gpu-arch-version: "13.0" use-custom-docker-registry: false submodules: recursive ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -469,13 +470,15 @@ jobs: echo "::endgroup::" echo "::group::Setup Huggingface" - pip install -U "huggingface_hub[cli]<1.0" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" + HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')" + hf auth login --token "$HF_AUTH_TOKEN" echo "::endgroup::" echo "::group::Install optimum-executorch" OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + pip install "optimum~=2.0.0" "transformers==5.0.0rc1" + pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} echo "::endgroup::" echo "::group::Test CUDA Model: ${{ matrix.model }} ${{ matrix.quantize }}" diff --git a/.github/workflows/metal.yml b/.github/workflows/metal.yml index de6507e035a..7230f6660e6 100644 --- a/.github/workflows/metal.yml +++ b/.github/workflows/metal.yml @@ -226,7 +226,11 @@ jobs: # the PCH dir, which PyTorch derives from tempfile.gettempdir() independently. export TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX") export TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX") - ${CONDA_RUN} bash .ci/scripts/export_model_artifact.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" + VR_MODE="" + if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then + VR_MODE="vr-offline" + fi + ${CONDA_RUN} bash .ci/scripts/export_model_artifact.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "${VR_MODE}" test-model-metal-e2e: name: test-model-metal-e2e @@ -290,4 +294,8 @@ jobs: fi echo "::endgroup::" - ${CONDA_RUN} bash .ci/scripts/test_model_e2e.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" + VR_MODE="" + if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then + VR_MODE="vr-offline" + fi + ${CONDA_RUN} bash .ci/scripts/test_model_e2e.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "${VR_MODE}" diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml index d429db16053..c4be146f862 100644 --- a/.github/workflows/mlx.yml +++ b/.github/workflows/mlx.yml @@ -120,10 +120,10 @@ jobs: --prompt-len 4 \ --max-new-tokens 5 2>&1) echo "$OUTPUT" - if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 167, 81, 167, 81\]"; then + if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 94, 253, 88, 227\]"; then echo "Success: Qwen 3.5 MoE MLX export + inference completed with expected output" else - echo "Failed: unexpected output (expected [167, 167, 81, 167, 81])" + echo "Failed: unexpected output (expected [167, 94, 253, 88, 227])" exit 1 fi echo "::endgroup::" diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp index 55dad54526e..5d8aa1018a1 100644 --- a/backends/aoti/common_shims.cpp +++ b/backends/aoti/common_shims.cpp @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -218,6 +219,25 @@ AOTI_SHIM_EXPORT void aoti_torch_warn( ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg); } +AOTI_SHIM_EXPORT void aoti_torch_check( + bool cond, + const char* func, + const char* file, + uint32_t line, + const char* msg) { + if (cond) { + return; + } + ET_LOG( + Fatal, + "[%s:%u] %s: %s", + file != nullptr ? file : "", + line, + func != nullptr ? func : "", + msg != nullptr ? msg : "AOTI check failed"); + ::executorch::runtime::runtime_abort(); +} + AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size) { (void)tensor; diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h index 6f7313e9b60..8f5abd4ca6f 100644 --- a/backends/aoti/common_shims.h +++ b/backends/aoti/common_shims.h @@ -87,6 +87,13 @@ AOTI_SHIM_EXPORT void aoti_torch_warn( uint32_t line, const char* msg); +AOTI_SHIM_EXPORT void aoti_torch_check( + bool cond, + const char* func, + const char* file, + uint32_t line, + const char* msg); + AOTI_SHIM_EXPORT AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); diff --git a/backends/aoti/common_shims_slim.cpp b/backends/aoti/common_shims_slim.cpp index 739b3ee68c0..8ea44538c55 100644 --- a/backends/aoti/common_shims_slim.cpp +++ b/backends/aoti/common_shims_slim.cpp @@ -7,6 +7,8 @@ */ #include +#include +#include namespace executorch { namespace backends { @@ -171,6 +173,25 @@ AOTITorchError aoti_torch_grad_mode_set_enabled(bool enabled) { return Error::Ok; } +void aoti_torch_check( + bool cond, + const char* func, + const char* file, + uint32_t line, + const char* msg) { + if (cond) { + return; + } + ET_LOG( + Fatal, + "[%s:%u] %s: %s", + file != nullptr ? file : "", + line, + func != nullptr ? func : "", + msg != nullptr ? msg : "AOTI check failed"); + ::executorch::runtime::runtime_abort(); +} + } // extern "C" } // namespace aoti } // namespace backends diff --git a/backends/aoti/common_shims_slim.h b/backends/aoti/common_shims_slim.h index 75ede847d5a..fa88e3b341c 100644 --- a/backends/aoti/common_shims_slim.h +++ b/backends/aoti/common_shims_slim.h @@ -93,6 +93,14 @@ AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cuda(); AOTI_SHIM_EXPORT bool aoti_torch_grad_mode_is_enabled(); AOTI_SHIM_EXPORT AOTITorchError aoti_torch_grad_mode_set_enabled(bool enabled); +// Error reporting helper emitted by newer AOTInductor wrappers. +AOTI_SHIM_EXPORT void aoti_torch_check( + bool cond, + const char* func, + const char* file, + uint32_t line, + const char* msg); + } // extern "C" } // namespace aoti } // namespace backends diff --git a/backends/arm/_passes/arm_pass.py b/backends/arm/_passes/arm_pass.py index 1a1a179f456..add0f3aeb20 100644 --- a/backends/arm/_passes/arm_pass.py +++ b/backends/arm/_passes/arm_pass.py @@ -9,12 +9,14 @@ from abc import abstractmethod from typing import Any, List, Optional, Set, Type +import torch from executorch.backends.arm.constants import DISALLOW_TFA_META_KEY from executorch.backends.arm.tosa.mapping import TosaSpecialDtype from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue from torch.fx import GraphModule from torch.fx.passes.infra.pass_base import PassResult +from torch.utils import _pytree as pytree class ArmPass(ExportPass): @@ -79,6 +81,13 @@ def get_name(pass_) -> str: ) def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False): + if ( + op == exir_ops.edge.aten.bmm.default + and isinstance(meta, NodeMetadata) + and len(meta.data.get("input_qparams", {})) > 0 + ): + return self._call_quantized_bmm_without_fake_kernel(op, args, kwargs, meta) + if not updated: return super().call_operator(op, args, kwargs, meta) @@ -91,6 +100,35 @@ def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False) new_meta["stack_trace"] = f"{old_stack_trace}\n{traceback.format_stack()[-2]}" return super().call_operator(op, args, kwargs, NodeMetadata(new_meta)) + def _call_quantized_bmm_without_fake_kernel( + self, + op, + args: tuple[ProxyValue, ...], + kwargs: dict[str, Any], + meta: NodeMetadata, + ) -> ProxyValue: + old_val = meta.data["val"] + output_qparams = meta.data.get("output_qparams", {}) + dtype = ( + next(iter(output_qparams.values())).dtype + if len(output_qparams) > 0 + else old_val.dtype + ) + res_data = torch.empty_like(old_val, dtype=dtype) + + args_proxy, kwargs_proxy = pytree.tree_map_only( + ProxyValue, lambda x: x.proxy, (args, kwargs) + ) + res_proxy = self.tracer.create_proxy( + "call_function", + op, + args_proxy, + kwargs_proxy, + ) + res_proxy.node.meta.update(meta.data) + self.tracer.set_metadata(res_proxy.node, res_data) + return ProxyValue(res_data, res_proxy) + def call_submodule( self, graph_module: GraphModule, inputs: tuple[Any, ...] ) -> PassResult: diff --git a/backends/cuda/runtime/aoti_cuda_shims.lib b/backends/cuda/runtime/aoti_cuda_shims.lib index 5b66c0d82eb..63c8d79bd7a 100644 Binary files a/backends/cuda/runtime/aoti_cuda_shims.lib and b/backends/cuda/runtime/aoti_cuda_shims.lib differ diff --git a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py index b3034ff17ed..706d8ed3e14 100644 --- a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py +++ b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py @@ -169,14 +169,19 @@ def test_per_channel_convolution(self, _, use_qat: bool): atol=1.0, ) - nodes = list(exported_program.graph.nodes) - + conv_nodes = [ + node + for node in exported_program.graph.nodes + if node.target == exir_ops.edge.aten.convolution.default + ] + assert len(conv_nodes) == 1 + + conv_node = conv_nodes[0] assert ( - nodes[8].target + conv_node.args[1].target == exir_ops.edge.quantized_decomposed.dequantize_per_channel.default ) assert ( - nodes[9].target + conv_node.args[2].target == exir_ops.edge.quantized_decomposed.dequantize_per_channel.default ) - assert nodes[10].target == exir_ops.edge.aten.convolution.default diff --git a/examples/models/llama3_2_vision/text_decoder/model.py b/examples/models/llama3_2_vision/text_decoder/model.py index 8f3a620affc..9f15f777045 100644 --- a/examples/models/llama3_2_vision/text_decoder/model.py +++ b/examples/models/llama3_2_vision/text_decoder/model.py @@ -181,19 +181,19 @@ def get_example_kwarg_inputs(self): return None def get_dynamic_shapes(self): - batch_size = 1 + static = torch.export.Dim.STATIC dim_seq_len = torch.export.Dim("token_dim", min=1, max=self.max_seq_len) # Hardcoding # of tiles to be 2. image tokens per tile is 1601. if self.use_kv_cache: dynamic_shapes = { - "tokens": {0: batch_size, 1: dim_seq_len}, - "encoder_input": None, - "encoder_mask": {0: 1, 1: dim_seq_len, 2: None}, - "mask": {0: batch_size, 1: dim_seq_len, 2: None}, - "input_pos": {0: batch_size, 1: dim_seq_len}, + "tokens": {0: static, 1: dim_seq_len}, + "encoder_input": {0: static, 1: static, 2: static}, + "encoder_mask": {0: static, 1: dim_seq_len, 2: static}, + "mask": {0: static, 1: dim_seq_len, 2: static}, + "input_pos": {0: static, 1: dim_seq_len}, } else: dynamic_shapes = { - "tokens": {0: batch_size, 1: dim_seq_len}, + "tokens": {0: static, 1: dim_seq_len}, } return dynamic_shapes diff --git a/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py b/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py index 4af637212a8..0ef7b298139 100644 --- a/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py +++ b/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py @@ -69,7 +69,6 @@ def test_llama3_2_text_decoder_aoti(self) -> None: encoder, model.get_example_inputs(), kwargs=model.get_example_kwarg_inputs(), - dynamic_shapes=model.get_dynamic_shapes(), strict=True, ) with tempfile.TemporaryDirectory() as tmpdir: diff --git a/examples/models/parakeet/export_parakeet_tdt.py b/examples/models/parakeet/export_parakeet_tdt.py index 6a18cd58218..9114a9db3f9 100644 --- a/examples/models/parakeet/export_parakeet_tdt.py +++ b/examples/models/parakeet/export_parakeet_tdt.py @@ -334,7 +334,7 @@ def export_all( programs = {} # Determine device based on backend (preprocessor always stays on CPU) - device = torch.device("cuda" if backend == "cuda" else "cpu") + device = torch.device("cuda:0" if backend == "cuda" else "cpu") # Get audio parameters from model config sample_rate = model.preprocessor._cfg.sample_rate @@ -360,8 +360,8 @@ def export_all( preprocessor_wrapper, (sample_audio, sample_length), dynamic_shapes={ - # min=1600 samples = 0.1 sec @ 16kHz, max aligned with encoder limit - "audio": {0: Dim("audio_len", min=1600, max=max_audio_samples)}, + # min=10 frames = 0.1 sec @ 16kHz, max aligned with encoder limit. + "audio": {0: Dim.AUTO(min=1600, max=max_audio_samples)}, "length": {}, }, strict=False, @@ -370,7 +370,7 @@ def export_all( # Move model to CUDA after preprocessor export (preprocessor must stay on CPU) if backend == "cuda": - model.cuda() + model.to(device) feat_in = getattr(model.encoder, "_feat_in", 128) # Use max_mel_frames as example to ensure Dim.AUTO infers the full range. diff --git a/examples/models/qwen3_5_moe/test_turboquant.py b/examples/models/qwen3_5_moe/test_turboquant.py index 53474dc2515..2353dc0b8f2 100644 --- a/examples/models/qwen3_5_moe/test_turboquant.py +++ b/examples/models/qwen3_5_moe/test_turboquant.py @@ -143,7 +143,7 @@ def test_eager_decode_quality(self): logits_base.reshape(1, -1).float(), logits_tq.reshape(1, -1).float(), ).item() - self.assertGreater(cos, 0.99, f"Prefill cosine {cos:.4f}") + self.assertGreater(cos, 0.98, f"Prefill cosine {cos:.4f}") def test_export_matches_eager(self): """Exported TQ model produces same greedy tokens as eager.""" diff --git a/examples/models/sortformer/export_sortformer.py b/examples/models/sortformer/export_sortformer.py index e8f25780607..36f77f4dfdf 100644 --- a/examples/models/sortformer/export_sortformer.py +++ b/examples/models/sortformer/export_sortformer.py @@ -222,7 +222,8 @@ def export_all(model, backend: Optional[str] = None): preprocessor_wrapper, (sample_audio, sample_length), dynamic_shapes={ - "audio": {0: Dim("audio_len", min=1600, max=max_audio_samples)}, + # min=10 frames = 0.1 sec @ 16kHz, max is one 120s runner chunk. + "audio": {0: Dim.AUTO(min=1600, max=max_audio_samples)}, "length": {}, }, strict=False, diff --git a/examples/models/voxtral_realtime/export_voxtral_rt.py b/examples/models/voxtral_realtime/export_voxtral_rt.py index 3dfa53af16a..b9d29b7f22a 100644 --- a/examples/models/voxtral_realtime/export_voxtral_rt.py +++ b/examples/models/voxtral_realtime/export_voxtral_rt.py @@ -188,7 +188,7 @@ def export_all( programs = {} param_dtype = next(model.parameters()).dtype - device = "cuda" if backend == "cuda" else "cpu" + device = "cuda:0" if backend == "cuda" else "cpu" # 1. Audio encoder print("\nExporting audio_encoder...") @@ -275,7 +275,7 @@ def export_streaming( programs = {} param_dtype = next(model.parameters()).dtype - device = "cuda" if backend == "cuda" else "cpu" + device = "cuda:0" if backend == "cuda" else "cpu" # 1. Streaming audio encoder print("\nExporting encode_audio_chunk...") @@ -618,7 +618,7 @@ def main(): # Move to CUDA for CUDA backend export (AOTInductor needs CUDA tensors) if backend_for_export == "cuda": print("Moving model to CUDA...") - model.cuda() + model.to(torch.device("cuda:0")) # Untie output/embedding weights before quantization so each layer gets # its own quantization config (embedding: 8w, output linear: 8da4w). diff --git a/examples/models/voxtral_realtime/model.py b/examples/models/voxtral_realtime/model.py index e591445cc56..3ff110c161e 100644 --- a/examples/models/voxtral_realtime/model.py +++ b/examples/models/voxtral_realtime/model.py @@ -1129,7 +1129,8 @@ def create_causal_mask( return torch.where( valid, torch.zeros(1, dtype=dtype, device=start_pos.device), - torch.tensor(float("-inf"), dtype=dtype, device=start_pos.device), + # MPS SDPA can propagate NaNs from -inf additive masks in AOTI. + torch.tensor(-1e9, dtype=dtype, device=start_pos.device), ) diff --git a/examples/models/voxtral_realtime/tests/test_ring_kv_cache.py b/examples/models/voxtral_realtime/tests/test_ring_kv_cache.py new file mode 100644 index 00000000000..274f0e177ba --- /dev/null +++ b/examples/models/voxtral_realtime/tests/test_ring_kv_cache.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from types import ModuleType +from unittest.mock import patch + +import torch + +with patch.dict( + "sys.modules", + {"executorch.extension.llm.custom_ops.custom_ops": ModuleType("custom_ops")}, +): + from executorch.examples.models.voxtral_realtime.model import StandardRingKVCache + + +class StandardRingKVCacheTest(unittest.TestCase): + def test_additive_mask_uses_finite_negative_values(self): + cache = StandardRingKVCache(window_size=4, n_heads=1, head_dim=2) + + mask = cache.create_causal_mask( + torch.tensor(0), seq_len=1, dtype=torch.bfloat16 + ) + + self.assertEqual(mask.dtype, torch.bfloat16) + self.assertTrue(torch.isfinite(mask).all()) + self.assertEqual(mask[0, 0].item(), 0) + self.assertLess(mask[0, 1].float().item(), -1e8) + + def test_bool_mask_keeps_bool_dtype(self): + cache = StandardRingKVCache(window_size=4, n_heads=1, head_dim=2) + + mask = cache.create_causal_mask(torch.tensor(3), seq_len=2, bool_mask=True) + + self.assertEqual(mask.dtype, torch.bool) + + +if __name__ == "__main__": + unittest.main() diff --git a/extension/llm/modules/attention.py b/extension/llm/modules/attention.py index f9446ea3aa7..8869553875e 100644 --- a/extension/llm/modules/attention.py +++ b/extension/llm/modules/attention.py @@ -302,11 +302,9 @@ def false_fn(y): k, v = calculate_kv(y) else: # Expecting the k, v returning here to be the same size of self.kv_cache - # In eager, we expect this predicate to specialize. In export, this will - # become a SymBool so it's not specialized. - k, v, cache_pos = torch.cond( - torch.isnan(y).all().item(), true_fn, false_fn, (y,) - ) + # In eager, we expect this predicate to specialize. In export, keep it + # as a tensor predicate so AOTI does not introduce unbacked symbols. + k, v, cache_pos = torch.cond(torch.isnan(y).all(), true_fn, false_fn, (y,)) # Update key-value cache self.kv_cache.k_cache.copy_(k) self.kv_cache.v_cache.copy_(v) diff --git a/install_requirements.py b/install_requirements.py index b30068cbdb8..53204ffd3ee 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -49,7 +49,7 @@ def install_requirements(use_pytorch_nightly): # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note # that we don't need to set any version number there because they have already # been installed on CI before this step, so pip won't reinstall them - ("torch==2.11.0" if use_pytorch_nightly else "torch"), + ("torch==2.12.0" if use_pytorch_nightly else "torch"), ] # Install the requirements for core ExecuTorch package. @@ -112,7 +112,7 @@ def install_optional_example_requirements(use_pytorch_nightly): print("Installing torch domain libraries") DOMAIN_LIBRARIES = [ - ("torchvision==0.26.0" if use_pytorch_nightly else "torchvision"), + ("torchvision==0.27.0" if use_pytorch_nightly else "torchvision"), ("torchaudio==2.11.0" if use_pytorch_nightly else "torchaudio"), ] # Then install domain libraries diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h index 63aa0d20d8e..cef99df3f56 100644 --- a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h +++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h @@ -325,41 +325,88 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; #define C10_HIP_HOST_DEVICE #endif -#if defined(USE_ROCM) // C10_WARP_SIZE is only allowed for device code. -// Host code _must_ use at::cuda::warp_size() +// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size(). +// Host or device statically-sized arrays _must_ use either +// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed. +// // HIP header used to define warpSize as a constexpr that was either 32 or 64 // depending on the target device, and then always set it to 64 for host code. -// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we -// set it to something unreasonable to trigger obvious host code errors. - +// For a time, that allowed C10_WARP_SIZE to be defined like so: +// +// #ifdef USE_ROCM +// #define C10_WARP_SIZE warpSize +// #else +// #define C10_WARP_SIZE 32 +// #endif +// +// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior. +// We can now only use warpSize for C10_WARP_SIZE in device code and this is +// enforced by using __device__ in its definition. In host code where +// C10_WARP_SIZE was previously used as a compile-time constant, this will now +// cause a compile-time error. +// +// If an array was previously expected to be sized at compile-time using +// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or +// C10_WARP_SIZE_LOWER_BOUND depending on the situation. +// +// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users +// must now use at::cuda::warp_size() for the dynamic runtime query. +// +// Unfortunately, C10_WARP_SIZE has been public and available for both host and +// device since approximately 2019, so forcing it to be device-only would break +// existing code in the wild. +#if defined(USE_ROCM) namespace at::cuda { TORCH_CUDA_CPP_API int warp_size(); } -#ifdef __HIPCC__ -static inline int __host__ C10_WARP_SIZE_INTERNAL() { +#if defined(__HIPCC__) +static __host__ inline int C10_WARP_SIZE_INTERNAL() { return at::cuda::warp_size(); } - -static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() { +// NOTE: __device__ C10_WARP_SIZE_INTERNAL +// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__, +// we can use constexpr. This matches prior behavior. We preserve this for +// backward compatibility instead of forcing old code to use dynamic warpSize +// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv +// could expose where C10_WARP_SIZE was used incorrectly where the dynamic +// warpSize is not allowed. +#if defined(__SPIRV__) +static __device__ inline int C10_WARP_SIZE_INTERNAL() { + return warpSize; +} +#else // __SPIRV__ +static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() { #if defined(__GFX9__) return 64; #else // __GFX9__ return 32; #endif // __GFX9__ } -#else // __HIPCC__ +#endif // __SPIRV__ +#if defined(__SPIRV__) +#define C10_WARP_SIZE_LOWER_BOUND 32 +#define C10_WARP_SIZE_UPPER_BOUND 64 +#elif defined(__GFX9__) +#define C10_WARP_SIZE_LOWER_BOUND 64 +#define C10_WARP_SIZE_UPPER_BOUND 64 +#else +#define C10_WARP_SIZE_LOWER_BOUND 32 +#define C10_WARP_SIZE_UPPER_BOUND 32 +#endif +#else // !__HIPCC__ static inline int C10_WARP_SIZE_INTERNAL() { return at::cuda::warp_size(); } +#define C10_WARP_SIZE_LOWER_BOUND 32 +#define C10_WARP_SIZE_UPPER_BOUND 64 #endif // __HIPCC__ - #define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL()) -#define C10_WARP_SIZE_STATIC 64 - -#else // defined(USE_ROCM) +#else // !USE_ROCM #define C10_WARP_SIZE 32 -#endif +#define C10_WARP_SIZE_LOWER_BOUND 32 +#define C10_WARP_SIZE_UPPER_BOUND 32 +#endif // USE_ROCM #if defined(_MSC_VER) && _MSC_VER <= 1900 #define __func__ __FUNCTION__ @@ -629,7 +676,7 @@ __host__ __device__ // This macro is used to find older C++ compilers // that don't support move optimization for return values. -#if (defined(__GNUC__) && __GNUC__ < 13) || \ +#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \ (defined(__clang_major__) && __clang_major__ < 13) #define C10_RETURN_MOVE_IF_OLD_COMPILER 1 #else diff --git a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h index 64479ba36f1..9aa08c265bd 100644 --- a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h +++ b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h @@ -12,7 +12,7 @@ #include #include -#if defined(__CUDACC__) && !defined(USE_ROCM) +#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702)) #include #endif @@ -46,7 +46,7 @@ struct alignas(2) BFloat16 { /* implicit */ inline C10_HOST_DEVICE BFloat16(float value); inline C10_HOST_DEVICE operator float() const; -#if defined(__CUDACC__) && !defined(USE_ROCM) +#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702)) inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value); explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const; #endif @@ -124,8 +124,9 @@ C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") /// Constructors inline C10_HOST_DEVICE BFloat16::BFloat16(float value) : -#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \ - __CUDA_ARCH__ >= 800 +#if defined(__CUDACC__) && \ + (!defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || \ + defined(USE_ROCM) && (TORCH_HIP_VERSION >= 702)) x(__bfloat16_as_ushort(__float2bfloat16(value))) #elif defined(__SYCL_DEVICE_ONLY__) && \ defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) @@ -139,7 +140,7 @@ inline C10_HOST_DEVICE BFloat16::BFloat16(float value) /// Implicit conversions inline C10_HOST_DEVICE BFloat16::operator float() const { -#if defined(__CUDACC__) && !defined(USE_ROCM) +#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702)) return __bfloat162float(*reinterpret_cast(&x)); #elif defined(__SYCL_DEVICE_ONLY__) && \ defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) @@ -149,7 +150,7 @@ inline C10_HOST_DEVICE BFloat16::operator float() const { #endif } -#if defined(__CUDACC__) && !defined(USE_ROCM) +#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702)) inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) { x = *reinterpret_cast(&value); } diff --git a/torch_pin.py b/torch_pin.py index 3575d9a376d..0c5cd50fe6d 100644 --- a/torch_pin.py +++ b/torch_pin.py @@ -1,2 +1,2 @@ -TORCH_VERSION = "2.11.0" +TORCH_VERSION = "2.12.0" # NIGHTLY_VERSION = "dev20260318" Temporarily pinning to stable release candidate. Revert https://github.com/pytorch/executorch/pull/18287