From df140b35b042e79658b92a1585c079d0749df050 Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Tue, 19 May 2026 23:20:20 +0000 Subject: [PATCH 1/6] Add NS via cusolvermp to wheel build Signed-off-by: Kirthi Shankar Sivamani --- build_tools/wheel_utils/Dockerfile.aarch | 13 ++++++++++++- build_tools/wheel_utils/Dockerfile.x86 | 13 ++++++++++++- build_tools/wheel_utils/build_wheels.sh | 4 ++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/build_tools/wheel_utils/Dockerfile.aarch b/build_tools/wheel_utils/Dockerfile.aarch index c040dadcdb..4f6635f507 100644 --- a/build_tools/wheel_utils/Dockerfile.aarch +++ b/build_tools/wheel_utils/Dockerfile.aarch @@ -35,12 +35,23 @@ RUN dnf clean all RUN dnf -y install glog.aarch64 glog-devel.aarch64 RUN dnf -y install libnccl libnccl-devel libnccl-static +# expose system libs for TE CMake build. +RUN dnf -y install \ + libcusolvermp0-cuda-${CUDA_MAJOR} libcusolvermp0-devel-cuda-${CUDA_MAJOR} && \ + dnf clean all +RUN mkdir -p /opt/nvidia/cusolvermp && \ + ln -s /usr/include/libcusolvermp/${CUDA_MAJOR} /opt/nvidia/cusolvermp/include && \ + ln -s /usr/lib64/libcusolvermp/${CUDA_MAJOR} /opt/nvidia/cusolvermp/lib && \ + echo "/usr/lib64/libcusolvermp/${CUDA_MAJOR}" > /etc/ld.so.conf.d/999_nvidia_cusolvermp.conf && \ + ldconfig + ENV PATH="/usr/local/cuda/bin:${PATH}" -ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/opt/nvidia/cusolvermp/lib:${LD_LIBRARY_PATH}" ENV CUDA_HOME=/usr/local/cuda ENV CUDA_ROOT=/usr/local/cuda ENV CUDA_PATH=/usr/local/cuda ENV CUDADIR=/usr/local/cuda +ENV CUSOLVERMP_HOME=/opt/nvidia/cusolvermp ENV NVTE_RELEASE_BUILD=1 CMD ["/bin/bash", "-c", "bash /TransformerEngine/build_tools/wheel_utils/build_wheels.sh manylinux_2_28_aarch64 $BUILD_METAPACKAGE $BUILD_COMMON $BUILD_PYTORCH $BUILD_JAX $CUDA_MAJOR"] diff --git a/build_tools/wheel_utils/Dockerfile.x86 b/build_tools/wheel_utils/Dockerfile.x86 index 2728b6b7c1..b01e443910 100644 --- a/build_tools/wheel_utils/Dockerfile.x86 +++ b/build_tools/wheel_utils/Dockerfile.x86 @@ -35,12 +35,23 @@ RUN dnf clean all RUN dnf -y install glog.x86_64 glog-devel.x86_64 RUN dnf -y install libnccl libnccl-devel libnccl-static +# expose system libs for TE CMake build. +RUN dnf -y install \ + libcusolvermp0-cuda-${CUDA_MAJOR} libcusolvermp0-devel-cuda-${CUDA_MAJOR} && \ + dnf clean all +RUN mkdir -p /opt/nvidia/cusolvermp && \ + ln -s /usr/include/libcusolvermp/${CUDA_MAJOR} /opt/nvidia/cusolvermp/include && \ + ln -s /usr/lib64/libcusolvermp/${CUDA_MAJOR} /opt/nvidia/cusolvermp/lib && \ + echo "/usr/lib64/libcusolvermp/${CUDA_MAJOR}" > /etc/ld.so.conf.d/999_nvidia_cusolvermp.conf && \ + ldconfig + ENV PATH="/usr/local/cuda/bin:${PATH}" -ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/opt/nvidia/cusolvermp/lib:${LD_LIBRARY_PATH}" ENV CUDA_HOME=/usr/local/cuda ENV CUDA_ROOT=/usr/local/cuda ENV CUDA_PATH=/usr/local/cuda ENV CUDADIR=/usr/local/cuda +ENV CUSOLVERMP_HOME=/opt/nvidia/cusolvermp ENV NVTE_RELEASE_BUILD=1 CMD ["/bin/bash", "-c", "bash /TransformerEngine/build_tools/wheel_utils/build_wheels.sh manylinux_2_28_x86_64 $BUILD_METAPACKAGE $BUILD_COMMON $BUILD_PYTORCH $BUILD_JAX $CUDA_MAJOR"] diff --git a/build_tools/wheel_utils/build_wheels.sh b/build_tools/wheel_utils/build_wheels.sh index e9ec854dba..74a4f21dee 100644 --- a/build_tools/wheel_utils/build_wheels.sh +++ b/build_tools/wheel_utils/build_wheels.sh @@ -25,6 +25,10 @@ git submodule update --init --recursive # Install deps /opt/python/cp310-cp310/bin/pip install cmake pybind11[global] ninja setuptools wheel +# Enable optional build features. cuSolverMp is provided by the build image +# (see Dockerfile.x86 / Dockerfile.aarch), which also sets CUSOLVERMP_HOME. +export NVTE_WITH_CUSOLVERMP=1 + if $BUILD_METAPACKAGE ; then cd /TransformerEngine NVTE_BUILD_METAPACKAGE=1 /opt/python/cp310-cp310/bin/python setup.py bdist_wheel 2>&1 | tee /wheelhouse/logs/metapackage.txt From 50f17532ce88d65043ab6117bf71d4d396585ba7 Mon Sep 17 00:00:00 2001 From: ksivamani Date: Tue, 2 Jun 2026 16:58:25 -0400 Subject: [PATCH 2/6] Build dep runtime Signed-off-by: ksivamani --- build_tools/utils.py | 11 +++++++++-- setup.py | 4 ++++ transformer_engine/common/__init__.py | 1 + 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/build_tools/utils.py b/build_tools/utils.py index d0f5eab425..f2548b4de6 100644 --- a/build_tools/utils.py +++ b/build_tools/utils.py @@ -14,7 +14,7 @@ import sys import platform from pathlib import Path -from importlib.metadata import version as get_version +from importlib.metadata import PackageNotFoundError, distribution, version as get_version from subprocess import CalledProcessError from typing import List, Optional, Tuple, Union @@ -292,10 +292,17 @@ def cuda_version() -> Tuple[int, ...]: version_str = get_version("nvidia-cuda-runtime-cu12") version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit()) return version_tuple - except importlib.metadata.PackageNotFoundError: + except PackageNotFoundError: raise RuntimeError("Could neither find NVCC executable nor CUDA runtime Python package.") +def cusolvermp_pypi_package_name(cuda_major: Optional[int] = None) -> str: + """PyPI package providing cuSolverMp runtime libraries for a CUDA major version.""" + if cuda_major is None: + cuda_major = cuda_version()[0] + return f"nvidia-cusolvermp-cu{cuda_major}" + + def get_frameworks() -> List[str]: """DL frameworks to build support for""" _frameworks: List[str] = [] diff --git a/setup.py b/setup.py index ec277b6349..3d9e1de349 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ from build_tools.utils import ( cuda_archs, cuda_version, + cusolvermp_pypi_package_name, get_frameworks, remove_dups, min_python_version_str, @@ -112,6 +113,9 @@ def setup_requirements() -> Tuple[List[str], List[str]]: ] test_reqs: List[str] = ["pytest>=8.2.1"] + if bool(int(os.getenv("NVTE_WITH_CUSOLVERMP", "0"))): + install_reqs.append(cusolvermp_pypi_package_name()) + # Framework-specific requirements if not bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))): if "pytorch" in frameworks: diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py index 40933f17a9..55235fe94c 100644 --- a/transformer_engine/common/__init__.py +++ b/transformer_engine/common/__init__.py @@ -369,6 +369,7 @@ def _load_core_library(): _, _CUDNN_LIB_CTYPES = _load_cuda_library("cudnn") system_nvrtc, _NVRTC_LIB_CTYPES = _load_cuda_library("nvrtc") system_curand, _CURAND_LIB_CTYPES = _load_cuda_library("curand") + _, _CUSOLVERMP_LIB_CTYPES = _load_cuda_library_from_python("cusolverMp", strict=False) # This additional step is necessary to be able to install TE wheels # and import TE (without any guards) in an environment where the cuda From ccaccd5cb2829ecff6491e443d1e8fd3cf64e60c Mon Sep 17 00:00:00 2001 From: Kirthi Shankar Sivamani Date: Wed, 3 Jun 2026 19:13:23 +0000 Subject: [PATCH 3/6] Fix Signed-off-by: Kirthi Shankar Sivamani --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 3d9e1de349..7f6b51c148 100644 --- a/setup.py +++ b/setup.py @@ -110,12 +110,10 @@ def setup_requirements() -> Tuple[List[str], List[str]]: "pydantic", "importlib-metadata>=1.0", "packaging", + cusolvermp_pypi_package_name(), ] test_reqs: List[str] = ["pytest>=8.2.1"] - if bool(int(os.getenv("NVTE_WITH_CUSOLVERMP", "0"))): - install_reqs.append(cusolvermp_pypi_package_name()) - # Framework-specific requirements if not bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))): if "pytorch" in frameworks: From 799318095884085c39b76562ff71b5796cc66a57 Mon Sep 17 00:00:00 2001 From: ksivamani Date: Fri, 5 Jun 2026 16:57:11 -0400 Subject: [PATCH 4/6] fix Signed-off-by: ksivamani --- build_tools/utils.py | 9 +++++++++ setup.py | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/build_tools/utils.py b/build_tools/utils.py index f2548b4de6..dd8812fed2 100644 --- a/build_tools/utils.py +++ b/build_tools/utils.py @@ -296,6 +296,15 @@ def cuda_version() -> Tuple[int, ...]: raise RuntimeError("Could neither find NVCC executable nor CUDA runtime Python package.") +def cublas_pypi_install_requirement(cuda_major: Optional[int] = None) -> Optional[str]: + """Pip install requirement for cuBLAS PyPI package, if a minimum version is needed.""" + if cuda_major is None: + cuda_major = cuda_version()[0] + if cuda_major == 13: + return "nvidia-cublas>=13.3.0.5" + return None + + def cusolvermp_pypi_package_name(cuda_major: Optional[int] = None) -> str: """PyPI package providing cuSolverMp runtime libraries for a CUDA major version.""" if cuda_major is None: diff --git a/setup.py b/setup.py index 7f6b51c148..cfaaef47be 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ from build_tools.utils import ( cuda_archs, cuda_version, + cublas_pypi_install_requirement, cusolvermp_pypi_package_name, get_frameworks, remove_dups, @@ -112,6 +113,9 @@ def setup_requirements() -> Tuple[List[str], List[str]]: "packaging", cusolvermp_pypi_package_name(), ] + cublas_req = cublas_pypi_install_requirement() + if cublas_req is not None: + install_reqs.append(cublas_req) test_reqs: List[str] = ["pytest>=8.2.1"] # Framework-specific requirements From 94e09d80734531971b9db858f6d8de3c1d687d62 Mon Sep 17 00:00:00 2001 From: ksivamani Date: Tue, 9 Jun 2026 16:50:39 -0400 Subject: [PATCH 5/6] Fix Signed-off-by: ksivamani --- transformer_engine/common/__init__.py | 29 ++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py index 55235fe94c..fd2d146616 100644 --- a/transformer_engine/common/__init__.py +++ b/transformer_engine/common/__init__.py @@ -255,6 +255,29 @@ def _nvidia_cudart_include_dir() -> str: return str(include_dir) if include_dir.exists() else "" +@functools.lru_cache(maxsize=None) +def _is_cusolvermp_installed_in_system() -> bool: + """Check if cuSolverMp is registered in the system library cache.""" + + if platform.system() != "Linux": + return False + + try: + result = subprocess.run( + ["ldconfig", "-p"], + capture_output=True, + text=True, + check=False, + ) + except (OSError, subprocess.SubprocessError): + return False + + if result.returncode != 0: + return False + + return any("cusolvermp" in line.lower() for line in result.stdout.splitlines()) + + @functools.lru_cache(maxsize=None) def _load_cuda_library_from_python(lib_name: str, strict: bool = False): """ @@ -369,7 +392,11 @@ def _load_core_library(): _, _CUDNN_LIB_CTYPES = _load_cuda_library("cudnn") system_nvrtc, _NVRTC_LIB_CTYPES = _load_cuda_library("nvrtc") system_curand, _CURAND_LIB_CTYPES = _load_cuda_library("curand") - _, _CUSOLVERMP_LIB_CTYPES = _load_cuda_library_from_python("cusolverMp", strict=False) + _CUSOLVERMP_LIB_CTYPES = None + if not _is_cusolvermp_installed_in_system() and any( + _is_package_installed(p) for p in ("nvidia-cusolvermp-cu12", "nvidia-cusolvermp-cu13") + ): + _, _CUSOLVERMP_LIB_CTYPES = _load_cuda_library_from_python("cusolverMp", strict=False) # This additional step is necessary to be able to install TE wheels # and import TE (without any guards) in an environment where the cuda From 34f828f9cf08b52f6f9dff93550f2eb77b6e8c6c Mon Sep 17 00:00:00 2001 From: ksivamani Date: Tue, 9 Jun 2026 16:52:29 -0400 Subject: [PATCH 6/6] rm prev cublas req Signed-off-by: ksivamani --- build_tools/utils.py | 9 --------- setup.py | 4 ---- 2 files changed, 13 deletions(-) diff --git a/build_tools/utils.py b/build_tools/utils.py index dd8812fed2..f2548b4de6 100644 --- a/build_tools/utils.py +++ b/build_tools/utils.py @@ -296,15 +296,6 @@ def cuda_version() -> Tuple[int, ...]: raise RuntimeError("Could neither find NVCC executable nor CUDA runtime Python package.") -def cublas_pypi_install_requirement(cuda_major: Optional[int] = None) -> Optional[str]: - """Pip install requirement for cuBLAS PyPI package, if a minimum version is needed.""" - if cuda_major is None: - cuda_major = cuda_version()[0] - if cuda_major == 13: - return "nvidia-cublas>=13.3.0.5" - return None - - def cusolvermp_pypi_package_name(cuda_major: Optional[int] = None) -> str: """PyPI package providing cuSolverMp runtime libraries for a CUDA major version.""" if cuda_major is None: diff --git a/setup.py b/setup.py index cfaaef47be..7f6b51c148 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,6 @@ from build_tools.utils import ( cuda_archs, cuda_version, - cublas_pypi_install_requirement, cusolvermp_pypi_package_name, get_frameworks, remove_dups, @@ -113,9 +112,6 @@ def setup_requirements() -> Tuple[List[str], List[str]]: "packaging", cusolvermp_pypi_package_name(), ] - cublas_req = cublas_pypi_install_requirement() - if cublas_req is not None: - install_reqs.append(cublas_req) test_reqs: List[str] = ["pytest>=8.2.1"] # Framework-specific requirements