pytorch · JacobSzwejbka · May 21, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
@@ -81,7 +81,7 @@ case "${IMAGE_NAME}" in
     LINTRUNNER=""
     GCC_VERSION=11
     CUDA_WINDOWS_CROSS_COMPILE=yes
-    CUDA_VERSION=12.8
+    CUDA_VERSION=13.0
     SKIP_PYTORCH=yes
     ;;
   executorch-ubuntu-24.04-gcc14)
@@ -97,6 +97,10 @@ esac
 TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt)
 BUILD_DOCS=1
 
+if [[ "${GCC_VERSION:-}" == "11" && -z "${SKIP_PYTORCH:-}" ]]; then
+  PYTORCH_BUILD_MAX_JOBS="${PYTORCH_BUILD_MAX_JOBS:-6}"  # default job cap for GCC 11 PyTorch builds; a caller-provided value takes precedence
+fi
+
 # Copy requirements-lintrunner.txt from root to here
 cp ../../requirements-lintrunner.txt ./
 
@@ -109,6 +113,7 @@ docker build \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
   --build-arg "TORCH_VERSION=${TORCH_VERSION}" \
+  --build-arg "PYTORCH_BUILD_MAX_JOBS=${PYTORCH_BUILD_MAX_JOBS:-}" \
   --build-arg "BUCK2_VERSION=${BUCK2_VERSION}" \
   --build-arg "LINTRUNNER=${LINTRUNNER:-}" \
   --build-arg "BUILD_DOCS=${BUILD_DOCS}" \

diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-release/2.11
+release/2.12
diff --git a/.ci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh
@@ -76,6 +76,9 @@ init_sccache() {
   # This is the remote cache bucket
   export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
   export SCCACHE_S3_KEY_PREFIX=executorch
+  export SCCACHE_REGION=us-east-1
+  export AWS_REGION=us-east-1
+  export AWS_DEFAULT_REGION=us-east-1
   export SCCACHE_IDLE_TIMEOUT=0
   export SCCACHE_ERROR_LOG=/tmp/sccache_error.log
   export RUST_LOG=sccache::server=error

diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh
@@ -10,10 +10,10 @@
 
 set -ex
 
-# CUDA version must be specified (e.g., 12.8)
+# CUDA version must be specified (e.g., 13.0)
 CUDA_VERSION="${CUDA_VERSION:?CUDA_VERSION must be set}"
 
-# Convert version format (e.g., 12.8 -> 12-8 for package names)
+# Convert version format (e.g., 13.0 -> 13-0 for package names)
 CUDA_VERSION_DASH=$(echo "${CUDA_VERSION}" | tr '.' '-')
 
 # Add NVIDIA package repository

diff --git a/.ci/docker/common/install_cuda_windows_cross_compile.sh b/.ci/docker/common/install_cuda_windows_cross_compile.sh
@@ -17,6 +17,7 @@ declare -A CUDA_DRIVER_MAP=(
     ["12.6"]="12.6.3:561.17"
     ["12.8"]="12.8.1:572.61"
     ["12.9"]="12.9.1:576.57"
+    ["13.0"]="13.0.2:"
 )
 
 install_mingw() {
@@ -76,19 +77,26 @@ install_windows_cuda() {
     CUDA_VERSION=$(echo "${CUDA_INFO}" | cut -d: -f1)
     CUDA_DRIVER_VERSION=$(echo "${CUDA_INFO}" | cut -d: -f2)
 
-    echo "Using CUDA ${CUDA_VERSION} with driver ${CUDA_DRIVER_VERSION}"
+    if [ -n "${CUDA_DRIVER_VERSION}" ]; then
+        echo "Using CUDA ${CUDA_VERSION} with driver ${CUDA_DRIVER_VERSION}"
+        CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
+    else
+        echo "Using CUDA ${CUDA_VERSION}"
+        CUDA_INSTALLER="cuda_${CUDA_VERSION}_windows.exe"
+    fi
 
     echo "Installing Windows CUDA toolkit ${CUDA_VERSION}..."
 
     mkdir -p "${INSTALL_DIR}"
     cd "${INSTALL_DIR}"
 
-    CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
     CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_INSTALLER}"
 
     # Check if already downloaded and extracted
     if [ -d "${INSTALL_DIR}/extracted/cuda_cudart" ]; then
         echo "Windows CUDA toolkit already installed, skipping download..."
+        chmod -R a+rX "${INSTALL_DIR}"
+        chmod -R a+rwX "${INSTALL_DIR}/extracted/cuda_cudart/cudart/lib"
         return 0
     fi
 
@@ -98,8 +106,11 @@ install_windows_cuda() {
     echo "Extracting CUDA toolkit..."
     7z x "${CUDA_INSTALLER}" -o"extracted" -y
 
-    # Fix permissions so ci-user can access the files
+    # Fix permissions so ci-user can access the files. PyTorch Inductor also
+    # needs to write a MinGW import library beside cudart.lib during Windows
+    # cross-compilation.
     chmod -R a+rX "${INSTALL_DIR}"
+    chmod -R a+rwX "${INSTALL_DIR}/extracted/cuda_cudart/cudart/lib"
 
     # Clean up installer to save space
     rm -f "${CUDA_INSTALLER}"

diff --git a/.ci/docker/common/install_pytorch.sh b/.ci/docker/common/install_pytorch.sh
@@ -27,14 +27,20 @@ install_pytorch_and_domains() {
   chown -R ci-user .
 
   export _GLIBCXX_USE_CXX11_ABI=1
+  if [[ "$(uname -m)" == "aarch64" ]]; then
+    export BUILD_IGNORE_SVE_UNAVAILABLE=1
+  fi
+  if [[ -n "${PYTORCH_BUILD_MAX_JOBS:-}" ]]; then
+    export MAX_JOBS="${PYTORCH_BUILD_MAX_JOBS}"
+  fi
   # Then build and install PyTorch
   conda_run python setup.py bdist_wheel
   pip_install "$(echo dist/*.whl)"
 
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=release/2.11
   export TORCHAUDIO_VERSION
-  TORCHVISION_VERSION=release/0.26
+  TORCHVISION_VERSION=release/0.27
   export TORCHVISION_VERSION
 
   install_domains

diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
@@ -62,9 +62,12 @@ RUN bash ./install_cache.sh && rm install_cache.sh utils.sh
 ENV SCCACHE_BUCKET ossci-compiler-cache-circleci-v2
 ENV SCCACHE_S3_KEY_PREFIX executorch
 ENV SCCACHE_REGION us-east-1
+ENV AWS_REGION us-east-1
+ENV AWS_DEFAULT_REGION us-east-1
 
 ARG TORCH_VERSION
 ARG SKIP_PYTORCH
+ARG PYTORCH_BUILD_MAX_JOBS
 COPY ./common/install_pytorch.sh install_pytorch.sh
 COPY ./common/utils.sh utils.sh
 RUN if [ -z "${SKIP_PYTORCH}" ]; then bash ./install_pytorch.sh; fi && rm install_pytorch.sh utils.sh

diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
@@ -518,7 +518,7 @@ fi
 
 DEVICE_ARG=""
 if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
-  DEVICE_ARG="--device cuda"
+  DEVICE_ARG="--device cuda:0"
 elif [ "$DEVICE" = "metal" ]; then
   DEVICE_ARG="--device mps"
 fi

diff --git a/.ci/scripts/test-cuda-build.sh b/.ci/scripts/test-cuda-build.sh
@@ -7,7 +7,7 @@
 
 set -exu
 
-CUDA_VERSION=${1:-"12.6"}
+CUDA_VERSION=${1:-"13.0"}
 
 echo "=== Testing ExecuTorch CUDA ${CUDA_VERSION} Build ==="
 

diff --git a/.ci/scripts/test_model_e2e_windows.ps1 b/.ci/scripts/test_model_e2e_windows.ps1
@@ -159,18 +159,31 @@ try {
         }
         Write-Host "CUDA version check passed: $actualCudaVersion"
     }
+    $cmakeCudaArgs = @()
+    if (-not [string]::IsNullOrWhiteSpace($env:CUDA_HOME)) {
+        $cudaNvcc = Join-Path -Path $env:CUDA_HOME -ChildPath "bin\nvcc.exe"
+        if (-not (Test-Path -Path $cudaNvcc -PathType Leaf)) {
+            throw "CUDA compiler not found at '$cudaNvcc'"
+        }
+        $env:CUDACXX = $cudaNvcc
+        $cmakeCudaArgs = @(
+            "-T", "cuda=$env:CUDA_HOME",
+            "-DCMAKE_CUDA_COMPILER=$cudaNvcc",
+            "-DCUDAToolkit_ROOT=$env:CUDA_HOME"
+        )
+    }
     Write-Host "::endgroup::"
 
     Write-Host "::group::Build ExecuTorch (CUDA)"
     $numCores = [Math]::Max([Environment]::ProcessorCount - 1, 1)
-    cmake --preset llm-release-cuda
+    cmake --preset llm-release-cuda @cmakeCudaArgs
     cmake --build cmake-out --target install --config Release -j $numCores
     Write-Host "::endgroup::"
 
     Write-Host "::group::Build $runnerTarget"
     Push-Location (Join-Path -Path $executorchRoot -ChildPath "examples\models\$runnerPath")
     try {
-        cmake --preset $runnerPreset
+        cmake --preset $runnerPreset @cmakeCudaArgs
         cmake --build (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath") --target $runnerTarget --config Release -j $numCores
     }
     finally {

diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
@@ -107,7 +107,7 @@ install_pytorch_and_domains() {
   local torch_release=$(cat version.txt)
   # Download key must match the upload key below (basename of dist/*.whl,
   # which always carries setup.py's resolved +gitHASH). Branch-ref pins
-  # like `release/2.11` would otherwise produce `+gitrelease` here and
+  # like `release/2.12` would otherwise produce `+gitrelease` here and
   # never hit the cache.
   local torch_short_hash=$(git rev-parse --short=7 HEAD)
   local torch_wheel_path="cached_artifacts/pytorch/executorch/pytorch_wheels/${system_name}/${python_version}"
@@ -132,6 +132,9 @@ install_pytorch_and_domains() {
     # (e.g. executorch's requirements-ci.txt).
     pip install -r requirements-build.txt
     git submodule update --init --recursive
+    if [[ "$(uname -m)" == "aarch64" ]]; then
+      export BUILD_IGNORE_SVE_UNAVAILABLE=1
+    fi
     USE_DISTRIBUTED=1 python setup.py bdist_wheel
     pip install "$(echo dist/*.whl)"
 
@@ -175,7 +178,7 @@ install_pytorch_and_domains() {
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=release/2.11
   export TORCHAUDIO_VERSION
-  TORCHVISION_VERSION=release/0.26
+  TORCHVISION_VERSION=release/0.27
   export TORCHVISION_VERSION
 
   install_domains

diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml
@@ -110,7 +110,7 @@ jobs:
       secrets-env: EXECUTORCH_HF_TOKEN
       runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
+      gpu-arch-version: "13.0"
       use-custom-docker-registry: false
       submodules: recursive
       upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
@@ -124,10 +124,11 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Setup Huggingface"
-        pip install -U "huggingface_hub[cli]<1.0" accelerate
-        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1"
+        HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')"
+        hf auth login --token "$HF_AUTH_TOKEN"
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
-        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
         echo "::endgroup::"
 
         echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}"
@@ -158,7 +159,7 @@ jobs:
       timeout: 90
       runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
+      gpu-arch-version: "13.0"
       use-custom-docker-registry: false
       submodules: recursive
       download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}

diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
@@ -64,7 +64,7 @@ jobs:
       secrets-env: EXECUTORCH_HF_TOKEN
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
-      gpu-arch-version: 12.8
+      gpu-arch-version: "13.0"
       docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
       submodules: recursive
       upload-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
@@ -98,10 +98,11 @@ jobs:
         # Setup Huggingface only for models that need it (not dinov2)
         if [ "${{ matrix.model_name }}" != "dinov2-small-imagenet1k-1-layer" ]; then
           echo "::group::Setup Huggingface"
-          pip install -U "huggingface_hub[cli]<1.0" accelerate
-          huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+          pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1"
+          HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')"
+          hf auth login --token "$HF_AUTH_TOKEN"
           OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
-          pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+          pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
           echo "::endgroup::"
         fi
 
@@ -146,7 +147,7 @@ jobs:
       timeout: 240
       runner: windows.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
-      gpu-arch-version: 12.8
+      gpu-arch-version: "13.0"
       download-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
@@ -158,7 +159,7 @@ jobs:
           \$ErrorActionPreference = 'Stop'
           \$PSNativeCommandUseErrorActionPreference = \$true
 
-          \$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8'
+          \$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0'
           \$env:CUDA_PATH = \$env:CUDA_HOME
           \$env:PATH = \"\$env:CUDA_HOME\bin;\$env:PATH\"
           nvcc --version
@@ -169,5 +170,5 @@ jobs:
             throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
           }
 
-          .ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
+          .ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '13.0'
         }"