From 0befdc792142c42b1bc72e435a6ab5e5cf4362f7 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@fb.com>
Date: Wed, 20 May 2026 07:41:25 -0700
Subject: [PATCH] Revert "Bump PyTorch pins to 2.12 (#19643)"

This reverts commit d66a37c3f709c48f397f03f1e8b8ba80be8e4d62.
---
 .ci/docker/build.sh                           |  5 --
 .ci/docker/ci_commit_pins/pytorch.txt         |  2 +-
 .ci/docker/common/install_cache.sh            |  3 -
 .ci/docker/common/install_pytorch.sh          |  8 +-
 .ci/docker/ubuntu/Dockerfile                  |  3 -
 .ci/scripts/utils.sh                          |  7 +-
 .github/workflows/mlx.yml                     |  4 +-
 backends/arm/_passes/arm_pass.py              | 38 ---------
 .../test_per_channel_conversion.py            | 15 ++--
 .../llama3_2_vision/text_decoder/model.py     | 14 ++--
 .../text_decoder/test/test_text_decoder.py    |  1 +
 .../models/parakeet/export_parakeet_tdt.py    |  4 +-
 extension/llm/modules/attention.py            |  8 +-
 install_requirements.py                       |  4 +-
 .../c10/torch/headeronly/macros/Macros.h      | 79 ++++---------------
 .../c10/torch/headeronly/util/BFloat16.h      | 13 ++-
 torch_pin.py                                  |  2 +-
 17 files changed, 51 insertions(+), 159 deletions(-)

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 24c4efce24a..0a4c6a6f757 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -97,10 +97,6 @@ esac
 TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt)
 BUILD_DOCS=1
 
-if [[ "${GCC_VERSION:-}" == "11" && -z "${SKIP_PYTORCH:-}" ]]; then
-  PYTORCH_BUILD_MAX_JOBS=6
-fi
-
 # Copy requirements-lintrunner.txt from root to here
 cp ../../requirements-lintrunner.txt ./
 
@@ -113,7 +109,6 @@ docker build \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
   --build-arg "TORCH_VERSION=${TORCH_VERSION}" \
-  --build-arg "PYTORCH_BUILD_MAX_JOBS=${PYTORCH_BUILD_MAX_JOBS:-}" \
   --build-arg "BUCK2_VERSION=${BUCK2_VERSION}" \
   --build-arg "LINTRUNNER=${LINTRUNNER:-}" \
   --build-arg "BUILD_DOCS=${BUILD_DOCS}" \
diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index 242371cbebe..f6e39a63b92 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-release/2.12
+release/2.11
\ No newline at end of file
diff --git a/.ci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh
index 82be8697320..7b7d39994ca 100755
--- a/.ci/docker/common/install_cache.sh
+++ b/.ci/docker/common/install_cache.sh
@@ -76,9 +76,6 @@ init_sccache() {
   # This is the remote cache bucket
   export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
   export SCCACHE_S3_KEY_PREFIX=executorch
-  export SCCACHE_REGION=us-east-1
-  export AWS_REGION=us-east-1
-  export AWS_DEFAULT_REGION=us-east-1
   export SCCACHE_IDLE_TIMEOUT=0
   export SCCACHE_ERROR_LOG=/tmp/sccache_error.log
   export RUST_LOG=sccache::server=error
diff --git a/.ci/docker/common/install_pytorch.sh b/.ci/docker/common/install_pytorch.sh
index 3c80d093ab2..548a24f885d 100755
--- a/.ci/docker/common/install_pytorch.sh
+++ b/.ci/docker/common/install_pytorch.sh
@@ -27,12 +27,6 @@ install_pytorch_and_domains() {
   chown -R ci-user .
 
   export _GLIBCXX_USE_CXX11_ABI=1
-  if [[ "$(uname -m)" == "aarch64" ]]; then
-    export BUILD_IGNORE_SVE_UNAVAILABLE=1
-  fi
-  if [[ -n "${PYTORCH_BUILD_MAX_JOBS:-}" ]]; then
-    export MAX_JOBS="${PYTORCH_BUILD_MAX_JOBS}"
-  fi
   # Then build and install PyTorch
   conda_run python setup.py bdist_wheel
   pip_install "$(echo dist/*.whl)"
@@ -40,7 +34,7 @@ install_pytorch_and_domains() {
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=release/2.11
   export TORCHAUDIO_VERSION
-  TORCHVISION_VERSION=release/0.27
+  TORCHVISION_VERSION=release/0.26
   export TORCHVISION_VERSION
 
   install_domains
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index 9a5b2536df0..0e2d7e48eb9 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -62,12 +62,9 @@ RUN bash ./install_cache.sh && rm install_cache.sh utils.sh
 ENV SCCACHE_BUCKET ossci-compiler-cache-circleci-v2
 ENV SCCACHE_S3_KEY_PREFIX executorch
 ENV SCCACHE_REGION us-east-1
-ENV AWS_REGION us-east-1
-ENV AWS_DEFAULT_REGION us-east-1
 
 ARG TORCH_VERSION
 ARG SKIP_PYTORCH
-ARG PYTORCH_BUILD_MAX_JOBS
 COPY ./common/install_pytorch.sh install_pytorch.sh
 COPY ./common/utils.sh utils.sh
 RUN if [ -z "${SKIP_PYTORCH}" ]; then bash ./install_pytorch.sh; fi && rm install_pytorch.sh utils.sh
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
index b312d0ede83..486745f4bf6 100644
--- a/.ci/scripts/utils.sh
+++ b/.ci/scripts/utils.sh
@@ -107,7 +107,7 @@ install_pytorch_and_domains() {
   local torch_release=$(cat version.txt)
   # Download key must match the upload key below (basename of dist/*.whl,
   # which always carries setup.py's resolved +gitHASH). Branch-ref pins
-  # like `release/2.12` would otherwise produce `+gitrelease` here and
+  # like `release/2.11` would otherwise produce `+gitrelease` here and
   # never hit the cache.
   local torch_short_hash=$(git rev-parse --short=7 HEAD)
   local torch_wheel_path="cached_artifacts/pytorch/executorch/pytorch_wheels/${system_name}/${python_version}"
@@ -132,9 +132,6 @@ install_pytorch_and_domains() {
     # (e.g. executorch's requirements-ci.txt).
     pip install -r requirements-build.txt
     git submodule update --init --recursive
-    if [[ "$(uname -m)" == "aarch64" ]]; then
-      export BUILD_IGNORE_SVE_UNAVAILABLE=1
-    fi
     USE_DISTRIBUTED=1 python setup.py bdist_wheel
     pip install "$(echo dist/*.whl)"
 
@@ -178,7 +175,7 @@ install_pytorch_and_domains() {
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=release/2.11
   export TORCHAUDIO_VERSION
-  TORCHVISION_VERSION=release/0.27
+  TORCHVISION_VERSION=release/0.26
   export TORCHVISION_VERSION
 
   install_domains
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
index c4be146f862..d429db16053 100644
--- a/.github/workflows/mlx.yml
+++ b/.github/workflows/mlx.yml
@@ -120,10 +120,10 @@ jobs:
           --prompt-len 4 \
           --max-new-tokens 5 2>&1)
         echo "$OUTPUT"
-        if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 94, 253, 88, 227\]"; then
+        if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 167, 81, 167, 81\]"; then
           echo "Success: Qwen 3.5 MoE MLX export + inference completed with expected output"
         else
-          echo "Failed: unexpected output (expected [167, 94, 253, 88, 227])"
+          echo "Failed: unexpected output (expected [167, 167, 81, 167, 81])"
           exit 1
         fi
         echo "::endgroup::"
diff --git a/backends/arm/_passes/arm_pass.py b/backends/arm/_passes/arm_pass.py
index add0f3aeb20..1a1a179f456 100644
--- a/backends/arm/_passes/arm_pass.py
+++ b/backends/arm/_passes/arm_pass.py
@@ -9,14 +9,12 @@
 from abc import abstractmethod
 from typing import Any, List, Optional, Set, Type
 
-import torch
 from executorch.backends.arm.constants import DISALLOW_TFA_META_KEY
 from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
 from torch.fx import GraphModule
 from torch.fx.passes.infra.pass_base import PassResult
-from torch.utils import _pytree as pytree
 
 
 class ArmPass(ExportPass):
@@ -81,13 +79,6 @@ def get_name(pass_) -> str:
             )
 
     def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False):
-        if (
-            op == exir_ops.edge.aten.bmm.default
-            and isinstance(meta, NodeMetadata)
-            and len(meta.data.get("input_qparams", {})) > 0
-        ):
-            return self._call_quantized_bmm_without_fake_kernel(op, args, kwargs, meta)
-
         if not updated:
             return super().call_operator(op, args, kwargs, meta)
 
@@ -100,35 +91,6 @@ def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False)
         new_meta["stack_trace"] = f"{old_stack_trace}\n{traceback.format_stack()[-2]}"
         return super().call_operator(op, args, kwargs, NodeMetadata(new_meta))
 
-    def _call_quantized_bmm_without_fake_kernel(
-        self,
-        op,
-        args: tuple[ProxyValue, ...],
-        kwargs: dict[str, Any],
-        meta: NodeMetadata,
-    ) -> ProxyValue:
-        old_val = meta.data["val"]
-        output_qparams = meta.data.get("output_qparams", {})
-        dtype = (
-            next(iter(output_qparams.values())).dtype
-            if len(output_qparams) > 0
-            else old_val.dtype
-        )
-        res_data = torch.empty_like(old_val, dtype=dtype)
-
-        args_proxy, kwargs_proxy = pytree.tree_map_only(
-            ProxyValue, lambda x: x.proxy, (args, kwargs)
-        )
-        res_proxy = self.tracer.create_proxy(
-            "call_function",
-            op,
-            args_proxy,
-            kwargs_proxy,
-        )
-        res_proxy.node.meta.update(meta.data)
-        self.tracer.set_metadata(res_proxy.node, res_data)
-        return ProxyValue(res_data, res_proxy)
-
     def call_submodule(
         self, graph_module: GraphModule, inputs: tuple[Any, ...]
     ) -> PassResult:
diff --git a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
index 706d8ed3e14..b3034ff17ed 100644
--- a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
+++ b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
@@ -169,19 +169,14 @@ def test_per_channel_convolution(self, _, use_qat: bool):
                 atol=1.0,
             )
 
-            conv_nodes = [
-                node
-                for node in exported_program.graph.nodes
-                if node.target == exir_ops.edge.aten.convolution.default
-            ]
-            assert len(conv_nodes) == 1
-
-            conv_node = conv_nodes[0]
+            nodes = list(exported_program.graph.nodes)
+
             assert (
-                conv_node.args[1].target
+                nodes[8].target
                 == exir_ops.edge.quantized_decomposed.dequantize_per_channel.default
             )
             assert (
-                conv_node.args[2].target
+                nodes[9].target
                 == exir_ops.edge.quantized_decomposed.dequantize_per_channel.default
             )
+            assert nodes[10].target == exir_ops.edge.aten.convolution.default
diff --git a/examples/models/llama3_2_vision/text_decoder/model.py b/examples/models/llama3_2_vision/text_decoder/model.py
index 9f15f777045..8f3a620affc 100644
--- a/examples/models/llama3_2_vision/text_decoder/model.py
+++ b/examples/models/llama3_2_vision/text_decoder/model.py
@@ -181,19 +181,19 @@ def get_example_kwarg_inputs(self):
             return None
 
     def get_dynamic_shapes(self):
-        static = torch.export.Dim.STATIC
+        batch_size = 1
         dim_seq_len = torch.export.Dim("token_dim", min=1, max=self.max_seq_len)
         # Hardcoding # of tiles to be 2. image tokens per tile is 1601.
         if self.use_kv_cache:
             dynamic_shapes = {
-                "tokens": {0: static, 1: dim_seq_len},
-                "encoder_input": {0: static, 1: static, 2: static},
-                "encoder_mask": {0: static, 1: dim_seq_len, 2: static},
-                "mask": {0: static, 1: dim_seq_len, 2: static},
-                "input_pos": {0: static, 1: dim_seq_len},
+                "tokens": {0: batch_size, 1: dim_seq_len},
+                "encoder_input": None,
+                "encoder_mask": {0: 1, 1: dim_seq_len, 2: None},
+                "mask": {0: batch_size, 1: dim_seq_len, 2: None},
+                "input_pos": {0: batch_size, 1: dim_seq_len},
             }
         else:
             dynamic_shapes = {
-                "tokens": {0: static, 1: dim_seq_len},
+                "tokens": {0: batch_size, 1: dim_seq_len},
             }
         return dynamic_shapes
diff --git a/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py b/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py
index 0ef7b298139..4af637212a8 100644
--- a/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py
+++ b/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py
@@ -69,6 +69,7 @@ def test_llama3_2_text_decoder_aoti(self) -> None:
                 encoder,
                 model.get_example_inputs(),
                 kwargs=model.get_example_kwarg_inputs(),
+                dynamic_shapes=model.get_dynamic_shapes(),
                 strict=True,
             )
         with tempfile.TemporaryDirectory() as tmpdir:
diff --git a/examples/models/parakeet/export_parakeet_tdt.py b/examples/models/parakeet/export_parakeet_tdt.py
index 75943f6c4ae..6a18cd58218 100644
--- a/examples/models/parakeet/export_parakeet_tdt.py
+++ b/examples/models/parakeet/export_parakeet_tdt.py
@@ -360,8 +360,8 @@ def export_all(
         preprocessor_wrapper,
         (sample_audio, sample_length),
         dynamic_shapes={
-            # min=10 frames = 0.1 sec @ 16kHz, max aligned with encoder limit.
-            "audio": {0: Dim.AUTO(min=1600, max=max_audio_samples)},
+            # min=1600 samples = 0.1 sec @ 16kHz, max aligned with encoder limit
+            "audio": {0: Dim("audio_len", min=1600, max=max_audio_samples)},
             "length": {},
         },
         strict=False,
diff --git a/extension/llm/modules/attention.py b/extension/llm/modules/attention.py
index 8869553875e..f9446ea3aa7 100644
--- a/extension/llm/modules/attention.py
+++ b/extension/llm/modules/attention.py
@@ -302,9 +302,11 @@ def false_fn(y):
             k, v = calculate_kv(y)
         else:
             # Expecting the k, v returning here to be the same size of self.kv_cache
-            # In eager, we expect this predicate to specialize. In export, keep it
-            # as a tensor predicate so AOTI does not introduce unbacked symbols.
-            k, v, cache_pos = torch.cond(torch.isnan(y).all(), true_fn, false_fn, (y,))
+            # In eager, we expect this predicate to specialize. In export, this will
+            # become a SymBool so it's not specialized.
+            k, v, cache_pos = torch.cond(
+                torch.isnan(y).all().item(), true_fn, false_fn, (y,)
+            )
             # Update key-value cache
             self.kv_cache.k_cache.copy_(k)
             self.kv_cache.v_cache.copy_(v)
diff --git a/install_requirements.py b/install_requirements.py
index 53204ffd3ee..b30068cbdb8 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -49,7 +49,7 @@ def install_requirements(use_pytorch_nightly):
         # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
         # that we don't need to set any version number there because they have already
         # been installed on CI before this step, so pip won't reinstall them
-        ("torch==2.12.0" if use_pytorch_nightly else "torch"),
+        ("torch==2.11.0" if use_pytorch_nightly else "torch"),
     ]
 
     # Install the requirements for core ExecuTorch package.
@@ -112,7 +112,7 @@ def install_optional_example_requirements(use_pytorch_nightly):
 
     print("Installing torch domain libraries")
     DOMAIN_LIBRARIES = [
-        ("torchvision==0.27.0" if use_pytorch_nightly else "torchvision"),
+        ("torchvision==0.26.0" if use_pytorch_nightly else "torchvision"),
         ("torchaudio==2.11.0" if use_pytorch_nightly else "torchaudio"),
     ]
     # Then install domain libraries
diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
index cef99df3f56..63aa0d20d8e 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
@@ -325,88 +325,41 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 #define C10_HIP_HOST_DEVICE
 #endif
 
+#if defined(USE_ROCM)
 // C10_WARP_SIZE is only allowed for device code.
-// Host code dynamically-sized launch configs _must_ use at::cuda::warp_size().
-// Host or device statically-sized arrays _must_ use either
-// C10_WARP_SIZE_UPPER_BOUND or C10_WARP_SIZE_LOWER_BOUND, as needed.
-//
+// Host code _must_ use at::cuda::warp_size()
 // HIP header used to define warpSize as a constexpr that was either 32 or 64
 // depending on the target device, and then always set it to 64 for host code.
-// For a time, that allowed C10_WARP_SIZE to be defined like so:
-//
-// #ifdef USE_ROCM
-// #define C10_WARP_SIZE warpSize
-// #else
-// #define C10_WARP_SIZE 32
-// #endif
-//
-// In ROCm 7, warpSize is no longer constexpr, matching CUDA behavior.
-// We can now only use warpSize for C10_WARP_SIZE in device code and this is
-// enforced by using __device__ in its definition.  In host code where
-// C10_WARP_SIZE was previously used as a compile-time constant, this will now
-// cause a compile-time error.
-//
-// If an array was previously expected to be sized at compile-time using
-// C10_WARP_SIZE, users must now use either C10_WARP_SIZE_UPPER_BOUND or
-// C10_WARP_SIZE_LOWER_BOUND depending on the situation.
-//
-// If C10_WARP_SIZE was previously used to determine kernel launch sizes, users
-// must now use at::cuda::warp_size() for the dynamic runtime query.
-//
-// Unfortunately, C10_WARP_SIZE has been public and available for both host and
-// device since approximately 2019, so forcing it to be device-only would break
-// existing code in the wild.
-#if defined(USE_ROCM)
+// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
+// set it to something unreasonable to trigger obvious host code errors.
+
 namespace at::cuda {
 TORCH_CUDA_CPP_API int warp_size();
 }
-#if defined(__HIPCC__)
-static __host__ inline int C10_WARP_SIZE_INTERNAL() {
+#ifdef __HIPCC__
+static inline int __host__ C10_WARP_SIZE_INTERNAL() {
   return at::cuda::warp_size();
 }
-// NOTE: __device__ C10_WARP_SIZE_INTERNAL
-// For __SPIRV__, we must use dynamic warpSize. When not targeting __SPIRV__,
-// we can use constexpr. This matches prior behavior. We preserve this for
-// backward compatibility instead of forcing old code to use dynamic warpSize
-// and losing constexpr. However, compiling for --offload-arch=amdgcnspirv
-// could expose where C10_WARP_SIZE was used incorrectly where the dynamic
-// warpSize is not allowed.
-#if defined(__SPIRV__)
-static __device__ inline int C10_WARP_SIZE_INTERNAL() {
-  return warpSize;
-}
-#else // __SPIRV__
-static __device__ inline constexpr int C10_WARP_SIZE_INTERNAL() {
+
+static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
 #if defined(__GFX9__)
   return 64;
 #else // __GFX9__
   return 32;
 #endif // __GFX9__
 }
-#endif // __SPIRV__
-#if defined(__SPIRV__)
-#define C10_WARP_SIZE_LOWER_BOUND 32
-#define C10_WARP_SIZE_UPPER_BOUND 64
-#elif defined(__GFX9__)
-#define C10_WARP_SIZE_LOWER_BOUND 64
-#define C10_WARP_SIZE_UPPER_BOUND 64
-#else
-#define C10_WARP_SIZE_LOWER_BOUND 32
-#define C10_WARP_SIZE_UPPER_BOUND 32
-#endif
-#else // !__HIPCC__
+#else // __HIPCC__
 static inline int C10_WARP_SIZE_INTERNAL() {
   return at::cuda::warp_size();
 }
-#define C10_WARP_SIZE_LOWER_BOUND 32
-#define C10_WARP_SIZE_UPPER_BOUND 64
 #endif // __HIPCC__
+
 #define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
-#else // !USE_ROCM
+#define C10_WARP_SIZE_STATIC 64
+
+#else // defined(USE_ROCM)
 #define C10_WARP_SIZE 32
-#define C10_WARP_SIZE_LOWER_BOUND 32
-#define C10_WARP_SIZE_UPPER_BOUND 32
-#endif // USE_ROCM
+#endif
 
 #if defined(_MSC_VER) && _MSC_VER <= 1900
 #define __func__ __FUNCTION__
@@ -676,7 +629,7 @@ __host__ __device__
 // This macro is used to find older C++ compilers
 // that don't support move optimization for return values.
 
-#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \
+#if (defined(__GNUC__) && __GNUC__ < 13) || \
     (defined(__clang_major__) && __clang_major__ < 13)
 #define C10_RETURN_MOVE_IF_OLD_COMPILER 1
 #else
diff --git a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
index 9aa08c265bd..64479ba36f1 100644
--- a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
+++ b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
@@ -12,7 +12,7 @@
 #include <iosfwd>
 #include <ostream>
 
-#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
+#if defined(__CUDACC__) && !defined(USE_ROCM)
 #include <cuda_bf16.h>
 #endif
 
@@ -46,7 +46,7 @@ struct alignas(2) BFloat16 {
   /* implicit */ inline C10_HOST_DEVICE BFloat16(float value);
   inline C10_HOST_DEVICE operator float() const;
 
-#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
+#if defined(__CUDACC__) && !defined(USE_ROCM)
   inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value);
   explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const;
 #endif
@@ -124,9 +124,8 @@ C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 /// Constructors
 inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
     :
-#if defined(__CUDACC__) &&                                                   \
-    (!defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || \
-     defined(USE_ROCM) && (TORCH_HIP_VERSION >= 702))
+#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \
+    __CUDA_ARCH__ >= 800
       x(__bfloat16_as_ushort(__float2bfloat16(value)))
 #elif defined(__SYCL_DEVICE_ONLY__) && \
     defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
@@ -140,7 +139,7 @@ inline C10_HOST_DEVICE BFloat16::BFloat16(float value)
 
 /// Implicit conversions
 inline C10_HOST_DEVICE BFloat16::operator float() const {
-#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
+#if defined(__CUDACC__) && !defined(USE_ROCM)
   return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
 #elif defined(__SYCL_DEVICE_ONLY__) && \
     defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
@@ -150,7 +149,7 @@ inline C10_HOST_DEVICE BFloat16::operator float() const {
 #endif
 }
 
-#if defined(__CUDACC__) && (!defined(USE_ROCM) || (TORCH_HIP_VERSION >= 702))
+#if defined(__CUDACC__) && !defined(USE_ROCM)
 inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) {
   x = *reinterpret_cast<const unsigned short*>(&value);
 }
diff --git a/torch_pin.py b/torch_pin.py
index 0c5cd50fe6d..3575d9a376d 100644
--- a/torch_pin.py
+++ b/torch_pin.py
@@ -1,2 +1,2 @@
-TORCH_VERSION = "2.12.0"
+TORCH_VERSION = "2.11.0"
 # NIGHTLY_VERSION = "dev20260318" Temporarily pinning to stable release candidate. Revert https://github.com/pytorch/executorch/pull/18287