diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 0a4c6a6f757..123680e5275 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -81,7 +81,7 @@ case "${IMAGE_NAME}" in
     LINTRUNNER=""
     GCC_VERSION=11
     CUDA_WINDOWS_CROSS_COMPILE=yes
-    CUDA_VERSION=12.8
+    CUDA_VERSION=13.0
     SKIP_PYTORCH=yes
     ;;
   executorch-ubuntu-24.04-gcc14)
@@ -97,6 +97,10 @@ esac
 TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt)
 BUILD_DOCS=1
 
+if [[ "${GCC_VERSION:-}" == "11" && -z "${SKIP_PYTORCH:-}" ]]; then
+  PYTORCH_BUILD_MAX_JOBS=6
+fi
+
 # Copy requirements-lintrunner.txt from root to here
 cp ../../requirements-lintrunner.txt ./
 
@@ -109,6 +113,7 @@ docker build \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
   --build-arg "TORCH_VERSION=${TORCH_VERSION}" \
+  --build-arg "PYTORCH_BUILD_MAX_JOBS=${PYTORCH_BUILD_MAX_JOBS:-}" \
   --build-arg "BUCK2_VERSION=${BUCK2_VERSION}" \
   --build-arg "LINTRUNNER=${LINTRUNNER:-}" \
   --build-arg "BUILD_DOCS=${BUILD_DOCS}" \
diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index f6e39a63b92..242371cbebe 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-release/2.11
\ No newline at end of file
+release/2.12
diff --git a/.ci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh
index 7b7d39994ca..82be8697320 100755
--- a/.ci/docker/common/install_cache.sh
+++ b/.ci/docker/common/install_cache.sh
@@ -76,6 +76,9 @@ init_sccache() {
   # This is the remote cache bucket
   export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
   export SCCACHE_S3_KEY_PREFIX=executorch
+  export SCCACHE_REGION=us-east-1
+  export AWS_REGION=us-east-1
+  export AWS_DEFAULT_REGION=us-east-1
   export SCCACHE_IDLE_TIMEOUT=0
   export SCCACHE_ERROR_LOG=/tmp/sccache_error.log
   export RUST_LOG=sccache::server=error
diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh
index 4300dd62d7a..9794e146407 100644
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@@ -10,10 +10,10 @@
 
 set -ex
 
-# CUDA version must be specified (e.g., 12.8)
+# CUDA version must be specified (e.g., 13.0)
 CUDA_VERSION="${CUDA_VERSION:?CUDA_VERSION must be set}"
 
-# Convert version format (e.g., 12.8 -> 12-8 for package names)
+# Convert version format (e.g., 13.0 -> 13-0 for package names)
 CUDA_VERSION_DASH=$(echo "${CUDA_VERSION}" | tr '.' '-')
 
 # Add NVIDIA package repository
diff --git a/.ci/docker/common/install_cuda_windows_cross_compile.sh b/.ci/docker/common/install_cuda_windows_cross_compile.sh
index e3529751221..b5d92c8215d 100644
--- a/.ci/docker/common/install_cuda_windows_cross_compile.sh
+++ b/.ci/docker/common/install_cuda_windows_cross_compile.sh
@@ -17,6 +17,7 @@ declare -A CUDA_DRIVER_MAP=(
     ["12.6"]="12.6.3:561.17"
     ["12.8"]="12.8.1:572.61"
     ["12.9"]="12.9.1:576.57"
+    ["13.0"]="13.0.2:"
 )
 
 install_mingw() {
@@ -76,19 +77,26 @@ install_windows_cuda() {
     CUDA_VERSION=$(echo "${CUDA_INFO}" | cut -d: -f1)
     CUDA_DRIVER_VERSION=$(echo "${CUDA_INFO}" | cut -d: -f2)
 
-    echo "Using CUDA ${CUDA_VERSION} with driver ${CUDA_DRIVER_VERSION}"
+    if [ -n "${CUDA_DRIVER_VERSION}" ]; then
+        echo "Using CUDA ${CUDA_VERSION} with driver ${CUDA_DRIVER_VERSION}"
+        CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
+    else
+        echo "Using CUDA ${CUDA_VERSION}"
+        CUDA_INSTALLER="cuda_${CUDA_VERSION}_windows.exe"
+    fi
 
     echo "Installing Windows CUDA toolkit ${CUDA_VERSION}..."
 
     mkdir -p "${INSTALL_DIR}"
     cd "${INSTALL_DIR}"
 
-    CUDA_INSTALLER="cuda_${CUDA_VERSION}_${CUDA_DRIVER_VERSION}_windows.exe"
     CUDA_URL="https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${CUDA_INSTALLER}"
 
     # Check if already downloaded and extracted
     if [ -d "${INSTALL_DIR}/extracted/cuda_cudart" ]; then
         echo "Windows CUDA toolkit already installed, skipping download..."
+        chmod -R a+rX "${INSTALL_DIR}"
+        chmod -R a+rwX "${INSTALL_DIR}/extracted/cuda_cudart/cudart/lib"
         return 0
     fi
 
@@ -98,8 +106,11 @@ install_windows_cuda() {
     echo "Extracting CUDA toolkit..."
     7z x "${CUDA_INSTALLER}" -o"extracted" -y
 
-    # Fix permissions so ci-user can access the files
+    # Fix permissions so ci-user can access the files. PyTorch Inductor also
+    # needs to write a MinGW import library beside cudart.lib during Windows
+    # cross-compilation.
     chmod -R a+rX "${INSTALL_DIR}"
+    chmod -R a+rwX "${INSTALL_DIR}/extracted/cuda_cudart/cudart/lib"
 
     # Clean up installer to save space
     rm -f "${CUDA_INSTALLER}"
diff --git a/.ci/docker/common/install_pytorch.sh b/.ci/docker/common/install_pytorch.sh
index 548a24f885d..3c80d093ab2 100755
--- a/.ci/docker/common/install_pytorch.sh
+++ b/.ci/docker/common/install_pytorch.sh
@@ -27,6 +27,12 @@ install_pytorch_and_domains() {
   chown -R ci-user .
 
   export _GLIBCXX_USE_CXX11_ABI=1
+  if [[ "$(uname -m)" == "aarch64" ]]; then
+    export BUILD_IGNORE_SVE_UNAVAILABLE=1
+  fi
+  if [[ -n "${PYTORCH_BUILD_MAX_JOBS:-}" ]]; then
+    export MAX_JOBS="${PYTORCH_BUILD_MAX_JOBS}"
+  fi
   # Then build and install PyTorch
   conda_run python setup.py bdist_wheel
   pip_install "$(echo dist/*.whl)"
@@ -34,7 +40,7 @@ install_pytorch_and_domains() {
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=release/2.11
   export TORCHAUDIO_VERSION
-  TORCHVISION_VERSION=release/0.26
+  TORCHVISION_VERSION=release/0.27
   export TORCHVISION_VERSION
 
   install_domains
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index 0e2d7e48eb9..9a5b2536df0 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -62,9 +62,12 @@ RUN bash ./install_cache.sh && rm install_cache.sh utils.sh
 ENV SCCACHE_BUCKET ossci-compiler-cache-circleci-v2
 ENV SCCACHE_S3_KEY_PREFIX executorch
 ENV SCCACHE_REGION us-east-1
+ENV AWS_REGION us-east-1
+ENV AWS_DEFAULT_REGION us-east-1
 
 ARG TORCH_VERSION
 ARG SKIP_PYTORCH
+ARG PYTORCH_BUILD_MAX_JOBS
 COPY ./common/install_pytorch.sh install_pytorch.sh
 COPY ./common/utils.sh utils.sh
 RUN if [ -z "${SKIP_PYTORCH}" ]; then bash ./install_pytorch.sh; fi && rm install_pytorch.sh utils.sh
diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
index 9adea394993..db447bb907f 100755
--- a/.ci/scripts/export_model_artifact.sh
+++ b/.ci/scripts/export_model_artifact.sh
@@ -518,7 +518,7 @@ fi
 
 DEVICE_ARG=""
 if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
-  DEVICE_ARG="--device cuda"
+  DEVICE_ARG="--device cuda:0"
 elif [ "$DEVICE" = "metal" ]; then
   DEVICE_ARG="--device mps"
 fi
diff --git a/.ci/scripts/test-cuda-build.sh b/.ci/scripts/test-cuda-build.sh
index 08673533927..e717718be66 100755
--- a/.ci/scripts/test-cuda-build.sh
+++ b/.ci/scripts/test-cuda-build.sh
@@ -7,7 +7,7 @@
 
 set -exu
 
-CUDA_VERSION=${1:-"12.6"}
+CUDA_VERSION=${1:-"13.0"}
 
 echo "=== Testing ExecuTorch CUDA ${CUDA_VERSION} Build ==="
 
diff --git a/.ci/scripts/test_model_e2e_windows.ps1 b/.ci/scripts/test_model_e2e_windows.ps1
index 26bd2ef3c93..3490b6e071a 100644
--- a/.ci/scripts/test_model_e2e_windows.ps1
+++ b/.ci/scripts/test_model_e2e_windows.ps1
@@ -159,18 +159,31 @@ try {
         }
         Write-Host "CUDA version check passed: $actualCudaVersion"
     }
+    $cmakeCudaArgs = @()
+    if (-not [string]::IsNullOrWhiteSpace($env:CUDA_HOME)) {
+        $cudaNvcc = Join-Path -Path $env:CUDA_HOME -ChildPath "bin\nvcc.exe"
+        if (-not (Test-Path -Path $cudaNvcc -PathType Leaf)) {
+            throw "CUDA compiler not found at '$cudaNvcc'"
+        }
+        $env:CUDACXX = $cudaNvcc
+        $cmakeCudaArgs = @(
+            "-T", "cuda=$env:CUDA_HOME",
+            "-DCMAKE_CUDA_COMPILER=$cudaNvcc",
+            "-DCUDAToolkit_ROOT=$env:CUDA_HOME"
+        )
+    }
     Write-Host "::endgroup::"
 
     Write-Host "::group::Build ExecuTorch (CUDA)"
     $numCores = [Math]::Max([Environment]::ProcessorCount - 1, 1)
-    cmake --preset llm-release-cuda
+    cmake --preset llm-release-cuda @cmakeCudaArgs
     cmake --build cmake-out --target install --config Release -j $numCores
     Write-Host "::endgroup::"
 
     Write-Host "::group::Build $runnerTarget"
     Push-Location (Join-Path -Path $executorchRoot -ChildPath "examples\models\$runnerPath")
     try {
-        cmake --preset $runnerPreset
+        cmake --preset $runnerPreset @cmakeCudaArgs
         cmake --build (Join-Path -Path $executorchRoot -ChildPath "cmake-out\examples\models\$runnerPath") --target $runnerTarget --config Release -j $numCores
     }
     finally {
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index 486745f4bf6..b312d0ede83 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -107,7 +107,7 @@ install_pytorch_and_domains() {
   local torch_release=$(cat version.txt)
   # Download key must match the upload key below (basename of dist/*.whl,
   # which always carries setup.py's resolved +gitHASH). Branch-ref pins
-  # like `release/2.11` would otherwise produce `+gitrelease` here and
+  # like `release/2.12` would otherwise produce `+gitrelease` here and
   # never hit the cache.
   local torch_short_hash=$(git rev-parse --short=7 HEAD)
   local torch_wheel_path="cached_artifacts/pytorch/executorch/pytorch_wheels/${system_name}/${python_version}"
@@ -132,6 +132,9 @@ install_pytorch_and_domains() {
     # (e.g. executorch's requirements-ci.txt).
     pip install -r requirements-build.txt
     git submodule update --init --recursive
+    if [[ "$(uname -m)" == "aarch64" ]]; then
+      export BUILD_IGNORE_SVE_UNAVAILABLE=1
+    fi
     USE_DISTRIBUTED=1 python setup.py bdist_wheel
     pip install "$(echo dist/*.whl)"
 
@@ -175,7 +178,7 @@ install_pytorch_and_domains() {
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=release/2.11
   export TORCHAUDIO_VERSION
-  TORCHVISION_VERSION=release/0.26
+  TORCHVISION_VERSION=release/0.27
   export TORCHVISION_VERSION
 
   install_domains
diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml
index b2302cf91af..ada2fb9e696 100644
--- a/.github/workflows/cuda-perf.yml
+++ b/.github/workflows/cuda-perf.yml
@@ -110,7 +110,7 @@ jobs:
       secrets-env: EXECUTORCH_HF_TOKEN
       runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
+      gpu-arch-version: "13.0"
       use-custom-docker-registry: false
       submodules: recursive
       upload-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
@@ -124,10 +124,11 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Setup Huggingface"
-        pip install -U "huggingface_hub[cli]<1.0" accelerate
-        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1"
+        HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')"
+        hf auth login --token "$HF_AUTH_TOKEN"
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
-        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
         echo "::endgroup::"
 
         echo "::group::Exporting model ${{ matrix.model }} with quantization ${{ matrix.quant }}"
@@ -158,7 +159,7 @@ jobs:
       timeout: 90
       runner: ${{ contains(matrix.model, 'Qwen3.5-35B-A3B') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
+      gpu-arch-version: "13.0"
       use-custom-docker-registry: false
       submodules: recursive
       download-artifact: model-${{ matrix.model_safe }}-${{ matrix.quant }}
diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
index 265b7e3069d..aae27121bd0 100644
--- a/.github/workflows/cuda-windows.yml
+++ b/.github/workflows/cuda-windows.yml
@@ -64,7 +64,7 @@ jobs:
       secrets-env: EXECUTORCH_HF_TOKEN
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
-      gpu-arch-version: 12.8
+      gpu-arch-version: "13.0"
       docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
       submodules: recursive
       upload-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
@@ -98,10 +98,11 @@ jobs:
         # Setup Huggingface only for models that need it (not dinov2)
         if [ "${{ matrix.model_name }}" != "dinov2-small-imagenet1k-1-layer" ]; then
           echo "::group::Setup Huggingface"
-          pip install -U "huggingface_hub[cli]<1.0" accelerate
-          huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+          pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1"
+          HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')"
+          hf auth login --token "$HF_AUTH_TOKEN"
           OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
-          pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+          pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
           echo "::endgroup::"
         fi
 
@@ -146,7 +147,7 @@ jobs:
       timeout: 240
       runner: windows.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
-      gpu-arch-version: 12.8
+      gpu-arch-version: "13.0"
       download-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
@@ -158,7 +159,7 @@ jobs:
           \$ErrorActionPreference = 'Stop'
           \$PSNativeCommandUseErrorActionPreference = \$true
 
-          \$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8'
+          \$env:CUDA_HOME = 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0'
           \$env:CUDA_PATH = \$env:CUDA_HOME
           \$env:PATH = \"\$env:CUDA_HOME\bin;\$env:PATH\"
           nvcc --version
@@ -169,5 +170,5 @@ jobs:
             throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
           }
 
-          .ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
+          .ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '13.0'
         }"
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index e1eaba6b7c1..f19b937994f 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -1,6 +1,6 @@
 # Test ExecuTorch CUDA Build Compatibility
 # This workflow tests whether ExecuTorch can be successfully built with CUDA support
-# across different CUDA versions (12.6, 12.8, 12.9, 13.0) using the command:
+# across different CUDA versions (12.6, 13.0) using the command:
 #   ./install_executorch.sh
 #
 # Note: ExecuTorch automatically detects the system CUDA version using nvcc and
@@ -31,7 +31,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        cuda-version: ["12.6", "12.8", "12.9", "13.0"]
+        cuda-version: ["12.6", "13.0"]
 
     name: test-executorch-cuda-build-${{ matrix.cuda-version }}
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@@ -66,7 +66,7 @@ jobs:
             echo "CUDA build results: ${{ needs.test-cuda-builds.result }}"
             exit 1
           else
-            echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9, 13.0) completed successfully!"
+            echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 13.0) completed successfully!"
           fi
 
   test-models-cuda:
@@ -79,7 +79,7 @@ jobs:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
-      gpu-arch-version: 12.6
+      gpu-arch-version: "13.0"
       use-custom-docker-registry: false
       submodules: recursive
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -114,7 +114,7 @@ jobs:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
-      gpu-arch-version: 12.6
+      gpu-arch-version: "13.0"
       use-custom-docker-registry: false
       submodules: recursive
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -271,7 +271,7 @@ jobs:
       secrets-env: EXECUTORCH_HF_TOKEN
       runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
-      gpu-arch-version: 12.6
+      gpu-arch-version: "13.0"
       use-custom-docker-registry: false
       submodules: recursive
       upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }}
@@ -288,10 +288,11 @@ jobs:
         # Setup Huggingface only for models that need it (not parakeet or dinov2)
         if [ "${{ matrix.model.name }}" != "parakeet-tdt" ] && [ "${{ matrix.model.name }}" != "dinov2-small-imagenet1k-1-layer" ]; then
           echo "::group::Setup Huggingface"
-          pip install -U "huggingface_hub[cli]<1.0" accelerate
-          huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+          pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1"
+          HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')"
+          hf auth login --token "$HF_AUTH_TOKEN"
           OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
-          pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+          pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
           echo "::endgroup::"
         fi
 
@@ -406,7 +407,7 @@ jobs:
       timeout: 90
       runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
       gpu-arch-type: cuda
-      gpu-arch-version: 12.6
+      gpu-arch-version: "13.0"
       use-custom-docker-registry: false
       submodules: recursive
       download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }}
@@ -442,7 +443,7 @@ jobs:
       download-artifact: ${{ matrix.artifact }}
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
-      gpu-arch-version: 12.6
+      gpu-arch-version: "13.0"
       use-custom-docker-registry: false
       submodules: recursive
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -469,13 +470,15 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Setup Huggingface"
-        pip install -U "huggingface_hub[cli]<1.0"
-        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        pip install -U "huggingface_hub[cli]>=1.2.1,<2.0"
+        HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')"
+        hf auth login --token "$HF_AUTH_TOKEN"
         echo "::endgroup::"
 
         echo "::group::Install optimum-executorch"
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
-        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip install "optimum~=2.0.0" "transformers==5.0.0rc1"
+        pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
         echo "::endgroup::"
 
         echo "::group::Test CUDA Model: ${{ matrix.model }} ${{ matrix.quantize }}"
diff --git a/.github/workflows/metal.yml b/.github/workflows/metal.yml
index de6507e035a..7230f6660e6 100644
--- a/.github/workflows/metal.yml
+++ b/.github/workflows/metal.yml
@@ -226,7 +226,11 @@ jobs:
         # the PCH dir, which PyTorch derives from tempfile.gettempdir() independently.
         export TMPDIR=$(mktemp -d "${RUNNER_TEMP}/tmpdir_XXXXXX")
         export TORCHINDUCTOR_CACHE_DIR=$(mktemp -d "${RUNNER_TEMP}/inductor_cache_XXXXXX")
-        ${CONDA_RUN} bash .ci/scripts/export_model_artifact.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
+        VR_MODE=""
+        if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
+          VR_MODE="vr-offline"
+        fi
+        ${CONDA_RUN} bash .ci/scripts/export_model_artifact.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "${VR_MODE}"
 
   test-model-metal-e2e:
     name: test-model-metal-e2e
@@ -290,4 +294,8 @@ jobs:
         fi
         echo "::endgroup::"
 
-        ${CONDA_RUN} bash .ci/scripts/test_model_e2e.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
+        VR_MODE=""
+        if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
+          VR_MODE="vr-offline"
+        fi
+        ${CONDA_RUN} bash .ci/scripts/test_model_e2e.sh metal "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "${VR_MODE}"
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
index d429db16053..c4be146f862 100644
--- a/.github/workflows/mlx.yml
+++ b/.github/workflows/mlx.yml
@@ -120,10 +120,10 @@ jobs:
           --prompt-len 4 \
           --max-new-tokens 5 2>&1)
         echo "$OUTPUT"
-        if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 167, 81, 167, 81\]"; then
+        if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 94, 253, 88, 227\]"; then
           echo "Success: Qwen 3.5 MoE MLX export + inference completed with expected output"
         else
-          echo "Failed: unexpected output (expected [167, 167, 81, 167, 81])"
+          echo "Failed: unexpected output (expected [167, 94, 253, 88, 227])"
           exit 1
         fi
         echo "::endgroup::"
diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index 55dad54526e..5d8aa1018a1 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/backends/aoti/common_shims.h>
+#include <executorch/runtime/platform/abort.h>
 #include <executorch/runtime/platform/log.h>
 #include <cstdint>
 
@@ -218,6 +219,25 @@ AOTI_SHIM_EXPORT void aoti_torch_warn(
   ET_LOG(Error, "[%s:%u] %s: %s", file, line, func, msg);
 }
 
+AOTI_SHIM_EXPORT void aoti_torch_check(
+    bool cond,
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* msg) {
+  if (cond) {
+    return;
+  }
+  ET_LOG(
+      Fatal,
+      "[%s:%u] %s: %s",
+      file != nullptr ? file : "<unknown>",
+      line,
+      func != nullptr ? func : "<unknown>",
+      msg != nullptr ? msg : "AOTI check failed");
+  ::executorch::runtime::runtime_abort();
+}
+
 AOTI_SHIM_EXPORT AOTITorchError
 aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size) {
   (void)tensor;
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index 6f7313e9b60..8f5abd4ca6f 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -87,6 +87,13 @@ AOTI_SHIM_EXPORT void aoti_torch_warn(
     uint32_t line,
     const char* msg);
 
+AOTI_SHIM_EXPORT void aoti_torch_check(
+    bool cond,
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* msg);
+
 AOTI_SHIM_EXPORT AOTITorchError
 aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size);
 
diff --git a/backends/aoti/common_shims_slim.cpp b/backends/aoti/common_shims_slim.cpp
index 739b3ee68c0..8ea44538c55 100644
--- a/backends/aoti/common_shims_slim.cpp
+++ b/backends/aoti/common_shims_slim.cpp
@@ -7,6 +7,8 @@
  */
 
 #include <executorch/backends/aoti/common_shims_slim.h>
+#include <executorch/runtime/platform/abort.h>
+#include <executorch/runtime/platform/log.h>
 
 namespace executorch {
 namespace backends {
@@ -171,6 +173,25 @@ AOTITorchError aoti_torch_grad_mode_set_enabled(bool enabled) {
   return Error::Ok;
 }
 
+void aoti_torch_check(
+    bool cond,
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* msg) {
+  if (cond) {
+    return;
+  }
+  ET_LOG(
+      Fatal,
+      "[%s:%u] %s: %s",
+      file != nullptr ? file : "<unknown>",
+      line,
+      func != nullptr ? func : "<unknown>",
+      msg != nullptr ? msg : "AOTI check failed");
+  ::executorch::runtime::runtime_abort();
+}
+
 } // extern "C"
 } // namespace aoti
 } // namespace backends
diff --git a/backends/aoti/common_shims_slim.h b/backends/aoti/common_shims_slim.h
index 75ede847d5a..fa88e3b341c 100644
--- a/backends/aoti/common_shims_slim.h
+++ b/backends/aoti/common_shims_slim.h
@@ -93,6 +93,14 @@ AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cuda();
 AOTI_SHIM_EXPORT bool aoti_torch_grad_mode_is_enabled();
 AOTI_SHIM_EXPORT AOTITorchError aoti_torch_grad_mode_set_enabled(bool enabled);
 
+// Error reporting helper emitted by newer AOTInductor wrappers.
+AOTI_SHIM_EXPORT void aoti_torch_check(
+    bool cond,
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* msg);
+
 } // extern "C"
 } // namespace aoti
 } // namespace backends
diff --git a/backends/arm/_passes/arm_pass.py b/backends/arm/_passes/arm_pass.py
index 1a1a179f456..add0f3aeb20 100644
--- a/backends/arm/_passes/arm_pass.py
+++ b/backends/arm/_passes/arm_pass.py
@@ -9,12 +9,14 @@
 from abc import abstractmethod
 from typing import Any, List, Optional, Set, Type
 
+import torch
 from executorch.backends.arm.constants import DISALLOW_TFA_META_KEY
 from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
 from torch.fx import GraphModule
 from torch.fx.passes.infra.pass_base import PassResult
+from torch.utils import _pytree as pytree
 
 
 class ArmPass(ExportPass):
@@ -79,6 +81,13 @@ def get_name(pass_) -> str:
             )
 
     def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False):
+        if (
+            op == exir_ops.edge.aten.bmm.default
+            and isinstance(meta, NodeMetadata)
+            and len(meta.data.get("input_qparams", {})) > 0
+        ):
+            return self._call_quantized_bmm_without_fake_kernel(op, args, kwargs, meta)
+
         if not updated:
             return super().call_operator(op, args, kwargs, meta)
 
@@ -91,6 +100,35 @@ def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False)
         new_meta["stack_trace"] = f"{old_stack_trace}\n{traceback.format_stack()[-2]}"
         return super().call_operator(op, args, kwargs, NodeMetadata(new_meta))
 
+    def _call_quantized_bmm_without_fake_kernel(
+        self,
+        op,
+        args: tuple[ProxyValue, ...],
+        kwargs: dict[str, Any],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        old_val = meta.data["val"]
+        output_qparams = meta.data.get("output_qparams", {})
+        dtype = (
+            next(iter(output_qparams.values())).dtype
+            if len(output_qparams) > 0
+            else old_val.dtype
+        )
+        res_data = torch.empty_like(old_val, dtype=dtype)
+
+        args_proxy, kwargs_proxy = pytree.tree_map_only(
+            ProxyValue, lambda x: x.proxy, (args, kwargs)
+        )
+        res_proxy = self.tracer.create_proxy(
+            "call_function",
+            op,
+            args_proxy,
+            kwargs_proxy,
+        )
+        res_proxy.node.meta.update(meta.data)
+        self.tracer.set_metadata(res_proxy.node, res_data)
+        return ProxyValue(res_data, res_proxy)
+
     def call_submodule(
         self, graph_module: GraphModule, inputs: tuple[Any, ...]
     ) -> PassResult:
diff --git a/backends/cuda/runtime/aoti_cuda_shims.lib b/backends/cuda/runtime/aoti_cuda_shims.lib
index 5b66c0d82eb..63c8d79bd7a 100644
Binary files a/backends/cuda/runtime/aoti_cuda_shims.lib and b/backends/cuda/runtime/aoti_cuda_shims.lib differ
diff --git a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
index b3034ff17ed..706d8ed3e14 100644
--- a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
+++ b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
@@ -169,14 +169,19 @@ def test_per_channel_convolution(self, _, use_qat: bool):
                 atol=1.0,
             )
 
-            nodes = list(exported_program.graph.nodes)
-
+            conv_nodes = [
+                node
+                for node in exported_program.graph.nodes
+                if node.target == exir_ops.edge.aten.convolution.default
+            ]
+            assert len(conv_nodes) == 1
+
+            conv_node = conv_nodes[0]
             assert (
-                nodes[8].target
+                conv_node.args[1].target
                 == exir_ops.edge.quantized_decomposed.dequantize_per_channel.default
             )
             assert (
-                nodes[9].target
+                conv_node.args[2].target
                 == exir_ops.edge.quantized_decomposed.dequantize_per_channel.default
             )
-            assert nodes[10].target == exir_ops.edge.aten.convolution.default
diff --git a/examples/models/llama3_2_vision/text_decoder/model.py b/examples/models/llama3_2_vision/text_decoder/model.py
index 8f3a620affc..9f15f777045 100644
--- a/examples/models/llama3_2_vision/text_decoder/model.py
+++ b/examples/models/llama3_2_vision/text_decoder/model.py
@@ -181,19 +181,19 @@ def get_example_kwarg_inputs(self):
             return None
 
     def get_dynamic_shapes(self):
-        batch_size = 1
+        static = torch.export.Dim.STATIC
         dim_seq_len = torch.export.Dim("token_dim", min=1, max=self.max_seq_len)
         # Hardcoding # of tiles to be 2. image tokens per tile is 1601.
         if self.use_kv_cache:
             dynamic_shapes = {
-                "tokens": {0: batch_size, 1: dim_seq_len},
-                "encoder_input": None,
-                "encoder_mask": {0: 1, 1: dim_seq_len, 2: None},
-                "mask": {0: batch_size, 1: dim_seq_len, 2: None},
-                "input_pos": {0: batch_size, 1: dim_seq_len},
+                "tokens": {0: static, 1: dim_seq_len},
+                "encoder_input": {0: static, 1: static, 2: static},
+                "encoder_mask": {0: static, 1: dim_seq_len, 2: static},
+                "mask": {0: static, 1: dim_seq_len, 2: static},
+                "input_pos": {0: static, 1: dim_seq_len},
             }
         else:
             dynamic_shapes = {
-                "tokens": {0: batch_size, 1: dim_seq_len},
+                "tokens": {0: static, 1: dim_seq_len},
             }
         return dynamic_shapes
diff --git a/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py b/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py
index 4af637212a8..0ef7b298139 100644
--- a/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py
+++ b/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py
@@ -69,7 +69,6 @@ def test_llama3_2_text_decoder_aoti(self) -> None:
                 encoder,
                 model.get_example_inputs(),
                 kwargs=model.get_example_kwarg_inputs(),
-                dynamic_shapes=model.get_dynamic_shapes(),
                 strict=True,
             )
         with tempfile.TemporaryDirectory() as tmpdir:
diff --git a/examples/models/parakeet/export_parakeet_tdt.py b/examples/models/parakeet/export_parakeet_tdt.py
index 6a18cd58218..9114a9db3f9 100644
--- a/examples/models/parakeet/export_parakeet_tdt.py
+++ b/examples/models/parakeet/export_parakeet_tdt.py
@@ -334,7 +334,7 @@ def export_all(
     programs = {}
 
     # Determine device based on backend (preprocessor always stays on CPU)
-    device = torch.device("cuda" if backend == "cuda" else "cpu")
+    device = torch.device("cuda:0" if backend == "cuda" else "cpu")
 
     # Get audio parameters from model config
     sample_rate = model.preprocessor._cfg.sample_rate
@@ -360,8 +360,8 @@ def export_all(
         preprocessor_wrapper,
         (sample_audio, sample_length),
         dynamic_shapes={
-            # min=1600 samples = 0.1 sec @ 16kHz, max aligned with encoder limit
-            "audio": {0: Dim("audio_len", min=1600, max=max_audio_samples)},
+            # min=1600 samples (10 frames, 0.1 s @ 16kHz); max matches encoder limit.
+            "audio": {0: Dim.AUTO(min=1600, max=max_audio_samples)},
             "length": {},
         },
         strict=False,
@@ -370,7 +370,7 @@ def export_all(
 
     # Move model to CUDA after preprocessor export (preprocessor must stay on CPU)
     if backend == "cuda":
-        model.cuda()
+        model.to(device)
 
     feat_in = getattr(model.encoder, "_feat_in", 128)
     # Use max_mel_frames as example to ensure Dim.AUTO infers the full range.
diff --git a/examples/models/qwen3_5_moe/test_turboquant.py b/examples/models/qwen3_5_moe/test_turboquant.py
index 53474dc2515..2353dc0b8f2 100644
--- a/examples/models/qwen3_5_moe/test_turboquant.py
+++ b/examples/models/qwen3_5_moe/test_turboquant.py
@@ -143,7 +143,7 @@ def test_eager_decode_quality(self):
             logits_base.reshape(1, -1).float(),
             logits_tq.reshape(1, -1).float(),
         ).item()
-        self.assertGreater(cos, 0.99, f"Prefill cosine {cos:.4f}")
+        self.assertGreater(cos, 0.98, f"Prefill cosine {cos:.4f}")
 
     def test_export_matches_eager(self):
         """Exported TQ model produces same greedy tokens as eager."""
diff --git a/examples/models/sortformer/export_sortformer.py b/examples/models/sortformer/export_sortformer.py
index e8f25780607..36f77f4dfdf 100644
--- a/examples/models/sortformer/export_sortformer.py
+++ b/examples/models/sortformer/export_sortformer.py
@@ -222,7 +222,8 @@ def export_all(model, backend: Optional[str] = None):
         preprocessor_wrapper,
         (sample_audio, sample_length),
         dynamic_shapes={
-            "audio": {0: Dim("audio_len", min=1600, max=max_audio_samples)},
+            # min=1600 samples (10 frames, 0.1 s @ 16kHz); max is one 120s runner chunk.
+            "audio": {0: Dim.AUTO(min=1600, max=max_audio_samples)},
             "length": {},
         },
         strict=False,
diff --git a/examples/models/voxtral_realtime/export_voxtral_rt.py b/examples/models/voxtral_realtime/export_voxtral_rt.py
index 3dfa53af16a..b9d29b7f22a 100644
--- a/examples/models/voxtral_realtime/export_voxtral_rt.py
+++ b/examples/models/voxtral_realtime/export_voxtral_rt.py
@@ -188,7 +188,7 @@ def export_all(
 
     programs = {}
     param_dtype = next(model.parameters()).dtype
-    device = "cuda" if backend == "cuda" else "cpu"
+    device = "cuda:0" if backend == "cuda" else "cpu"
 
     # 1. Audio encoder
     print("\nExporting audio_encoder...")
@@ -275,7 +275,7 @@ def export_streaming(
 
     programs = {}
     param_dtype = next(model.parameters()).dtype
-    device = "cuda" if backend == "cuda" else "cpu"
+    device = "cuda:0" if backend == "cuda" else "cpu"
 
     # 1. Streaming audio encoder
     print("\nExporting encode_audio_chunk...")
@@ -618,7 +618,7 @@ def main():
     # Move to CUDA for CUDA backend export (AOTInductor needs CUDA tensors)
     if backend_for_export == "cuda":
         print("Moving model to CUDA...")
-        model.cuda()
+        model.to(torch.device("cuda:0"))
 
     # Untie output/embedding weights before quantization so each layer gets
     # its own quantization config (embedding: 8w, output linear: 8da4w).
diff --git a/examples/models/voxtral_realtime/model.py b/examples/models/voxtral_realtime/model.py
index e591445cc56..3ff110c161e 100644
--- a/examples/models/voxtral_realtime/model.py
+++ b/examples/models/voxtral_realtime/model.py
@@ -1129,7 +1129,8 @@ def create_causal_mask(
         return torch.where(
             valid,
             torch.zeros(1, dtype=dtype, device=start_pos.device),
-            torch.tensor(float("-inf"), dtype=dtype, device=start_pos.device),
+            # MPS SDPA can propagate NaNs from -inf additive masks in AOTI.
+            torch.tensor(-1e9, dtype=dtype, device=start_pos.device),
         )
 
 
diff --git a/examples/models/voxtral_realtime/tests/test_ring_kv_cache.py b/examples/models/voxtral_realtime/tests/test_ring_kv_cache.py
new file mode 100644
index 00000000000..274f0e177ba
--- /dev/null
+++ b/examples/models/voxtral_realtime/tests/test_ring_kv_cache.py
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from types import ModuleType
+from unittest.mock import patch
+
+import torch
+
+with patch.dict(
+    "sys.modules",
+    {"executorch.extension.llm.custom_ops.custom_ops": ModuleType("custom_ops")},
+):
+    from executorch.examples.models.voxtral_realtime.model import StandardRingKVCache
+
+
+class StandardRingKVCacheTest(unittest.TestCase):
+    def test_additive_mask_uses_finite_negative_values(self):
+        cache = StandardRingKVCache(window_size=4, n_heads=1, head_dim=2)
+
+        mask = cache.create_causal_mask(
+            torch.tensor(0), seq_len=1, dtype=torch.bfloat16
+        )
+
+        self.assertEqual(mask.dtype, torch.bfloat16)
+        self.assertTrue(torch.isfinite(mask).all())
+        self.assertEqual(mask[0, 0].item(), 0)
+        self.assertLess(mask[0, 1].float().item(), -1e8)
+
+    def test_bool_mask_keeps_bool_dtype(self):
+        cache = StandardRingKVCache(window_size=4, n_heads=1, head_dim=2)
+
+        mask = cache.create_causal_mask(torch.tensor(3), seq_len=2, bool_mask=True)
+
+        self.assertEqual(mask.dtype, torch.bool)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/extension/llm/modules/attention.py b/extension/llm/modules/attention.py
index f9446ea3aa7..8869553875e 100644
--- a/extension/llm/modules/attention.py
+++ b/extension/llm/modules/attention.py
@@ -302,11 +302,9 @@ def false_fn(y):
             k, v = calculate_kv(y)
         else:
             # Expecting the k, v returning here to be the same size of self.kv_cache
-            # In eager, we expect this predicate to specialize. In export, this will
-            # become a SymBool so it's not specialized.
-            k, v, cache_pos = torch.cond(
-                torch.isnan(y).all().item(), true_fn, false_fn, (y,)
-            )
+            # In eager, we expect this predicate to specialize. In export, keep it
+            # as a tensor predicate so AOTI does not introduce unbacked symbols.
+            k, v, cache_pos = torch.cond(torch.isnan(y).all(), true_fn, false_fn, (y,))
             # Update key-value cache
             self.kv_cache.k_cache.copy_(k)
             self.kv_cache.v_cache.copy_(v)
diff --git a/install_requirements.py b/install_requirements.py
index b30068cbdb8..53204ffd3ee 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -49,7 +49,7 @@ def install_requirements(use_pytorch_nightly):
         # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
         # that we don't need to set any version number there because they have already
         # been installed on CI before this step, so pip won't reinstall them
-        ("torch==2.11.0" if use_pytorch_nightly else "torch"),
+        ("torch==2.12.0" if use_pytorch_nightly else "torch"),
     ]
 
     # Install the requirements for core ExecuTorch package.
@@ -112,7 +112,7 @@ def install_optional_example_requirements(use_pytorch_nightly):
 
     print("Installing torch domain libraries")
     DOMAIN_LIBRARIES = [
-        ("torchvision==0.26.0" if use_pytorch_nightly else "torchvision"),
+        ("torchvision==0.27.0" if use_pytorch_nightly else "torchvision"),
         ("torchaudio==2.11.0" if use_pytorch_nightly else "torchaudio"),
     ]
     # Then install domain libraries
diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
index 63aa0d20d8e..cef99df3f56 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
@@ -325,41 +325,88 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 #define C10_HIP_HOST_DEVICE
 #endif
 
-#if defined(USE_ROCM)
 // C10_WARP_SIZE is only allowed for device code.
-// Host code _must_ use at::cuda::warp_size()
+// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
+// Host or device statically-sized arrays _must_ use either
+// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
+//
 // HIP header used to define warpSize as a constexpr that was either 32 or 64
 // depending on the target device, and then always set it to 64 for host code.
-// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
-// set it to something unreasonable to trigger obvious host code errors.
-
+// For a time, that allowed C10_WARP_SIZE to be defined like so:
+//
+// #ifdef USE_ROCM
+// #define C10_WARP_SIZE warpSize
+// #else
+// #define C10_WARP_SIZE 32
+// #endif
+//
+// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
+// We can now only use warpSize for C10_WARP_SIZE in device code and this is
+// enforced by using __device__ in its definition.  In host code where
+// C10_WARP_SIZE was previously used as a compile-time constant, this will now
+// cause a compile-time error.
+//
+// If an array was previously expected to be sized at compile-time using
+// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
+// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
+//
+// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
+// must now use at::cuda::warp_size() for the dynamic runtime query.
+//
+// Unfortunately, C10_WARP_SIZE has been public and available for both host and
+// device since approximately 2019, so forcing it to be device-only would break
+// existing code in the wild.
+#if defined(USE_ROCM)
 namespace at::cuda {
 TORCH_CUDA_CPP_API int warp_size();
 }
-#ifdef __HIPCC__
-static inline int __host__ C10_WARP_SIZE_INTERNAL() {
+#if defined(__HIPCC__)
+static __host__ inline int C10_WARP_SIZE_INTERNAL() {
   return at::cuda::warp_size();
 }
-
-static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
+// NOTE: __device__ C10_WARP_SIZE_INTERNAL
+// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
+// we can use constexpr. This matches prior behavior. We preserve this for
+// backward compatibility instead of forcing old code to use dynamic warpSize
+// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
+// could expose where C10_WARP_SIZE was used incorrectly where the dynamic
+// warpSize is not allowed.
+#if defined(__SPIRV__)
+static __device__ inline int C10_WARP_SIZE_INTERNAL() {
+  return warpSize;
+}
+#else // __SPIRV__
+static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {
 #if defined(__GFX9__)
   return 64;
 #else // __GFX9__
   return 32;
 #endif // __GFX9__
 }
-#else // __HIPCC__
+#endif // __SPIRV__
+#if defined(__SPIRV__)
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 64
+#elif defined(__GFX9__)
+#define C10_WARP_SIZE_LOWER_BOUND 64
+#define C10_WARP_SIZE_UPPER_BOUND 64
+#else
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 32
+#endif
+#else // !__HIPCC__
 static inline int C10_WARP_SIZE_INTERNAL() {
   return at::cuda::warp_size();
 }
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 64
 #endif // __HIPCC__
-
 #define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
-#define C10_WARP_SIZE_STATIC 64
-
-#else // defined(USE_ROCM)
+#else // !USE_ROCM
 #define C10_WARP_SIZE 32
-#endif
+#define C10_WARP_SIZE_LOWER_BOUND 32
+#define C10_WARP_SIZE_UPPER_BOUND 32
+#endif // USE_ROCM
 
 #if defined(_MSC_VER) && _MSC_VER <= 1900
 #define __func__ __FUNCTION__
@@ -629,7 +676,7 @@ __host__ __device__
 // This macro is used to find older C++ compilers
 // that don't support move optimization for return values.
 
-#if (defined(__GNUC__) && __GNUC__ < 13) || \
+#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \
     (defined(__clang_major__) && __clang_major__ < 13)
 #define C10_RETURN_MOVE_IF_OLD_COMPILER 1
 #else
diff --git a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
index 64479ba36f1..9aa08c265bd 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
@@ -12,7 +12,7 @@
 #include <iosfwd>
 #include <ostream>
 
-#if defined(__CUDACC__) && !defined(USE_ROCM)
+#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
 #include <cuda_bf16.h>
 #endif
 
@@ -46,7 +46,7 @@ struct alignas(2) BFloat16 {
   /* implicit */ inline C10_HOST_DEVICE BFloat16(float value);
   inline C10_HOST_DEVICE operator float() const;
 
-#if defined(__CUDACC__) && !defined(USE_ROCM)
+#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
   inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value);
   explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const;
 #endif
@@ -124,8 +124,9 @@ C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 /// Constructors
 inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
     :
-#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \
-    __CUDA_ARCH__ >= 800
+#if defined(__CUDACC__) &&                                                   \
+    (!defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || \
+     defined(USE_ROCM) && (TORCH_HIP_VERSION >= 702))
       x(__bfloat16_as_ushort(__float2bfloat16(value)))
 #elif defined(__SYCL_DEVICE_ONLY__) && \
     defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
@@ -139,7 +140,7 @@ inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
 
 /// Implicit conversions
 inline C10_HOST_DEVICE BFloat16::operator float() const {
-#if defined(__CUDACC__) && !defined(USE_ROCM)
+#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
   return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
 #elif defined(__SYCL_DEVICE_ONLY__) && \
     defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
@@ -149,7 +150,7 @@ inline C10_HOST_DEVICE BFloat16::operator float() const {
 #endif
 }
 
-#if defined(__CUDACC__) && !defined(USE_ROCM)
+#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
 inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) {
   x = *reinterpret_cast<const unsigned short*>(&value);
 }
diff --git a/torch_pin.py b/torch_pin.py
index 3575d9a376d..0c5cd50fe6d 100644
--- a/torch_pin.py
+++ b/torch_pin.py
@@ -1,2 +1,2 @@
-TORCH_VERSION = "2.11.0"
+TORCH_VERSION = "2.12.0"
 # NIGHTLY_VERSION = "dev20260318" Temporarily pinning to stable release candidate. Revert https://github.com/pytorch/executorch/pull/18287