Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 75 additions & 82 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,72 +22,65 @@ jobs:
- name: 'Dependencies'
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
apt-get install -y git python3.9 pip cudnn9-cuda-12 ccache
pip install cmake==3.21.0 pybind11[global] ninja
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive
- name: ccache
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
- uses: actions/cache@v4
with:
path: /root/.ccache
key: ccache-${{ runner.os }}-core-${{ github.ref }}-${{ github.sha }}
restore-keys: |
ccache-${{ runner.os }}-core-${{ github.ref }}-
ccache-${{ runner.os }}-core-
- name: 'Build'
run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v
run: NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v
env:
NVTE_FRAMEWORK: none
MAX_JOBS: 1
SCCACHE_GHA_ENABLED: "true"
CCACHE_DIR: /root/.ccache
CCACHE_MAXSIZE: 5G
- name: 'Sanity check'
run: python3 -c "import transformer_engine"
working-directory: /
pytorch:
name: 'PyTorch'
runs-on: ubuntu-latest
container:
image: ghcr.io/ptrendx/te_gha_pytorch:latest
options: --user root
steps:
- name: Move /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"

- name: Maximize build space
uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
with:
root-reserve-mb: 5120
temp-reserve-mb: 32
swap-size-mb: 10240
remove-dotnet: 'true'
remove-android: 'true'
remove-haskell: 'true'
remove-codeql: 'true'
build-mount-path: '/var/lib/docker/'

- name: Restore /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"

- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive

- name: Start named container
run: |
docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 sleep infinity

- name: 'Dependencies'
run: |
docker exec builder bash -c '\
apt-get update && \
apt-get install -y git python3.9 pip cudnn9-cuda-12 && \
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript && \
apt-get clean \
'

- name: "Disk space check after dependencies"
run: df -lh
- uses: actions/cache@v4
with:
path: /root/.ccache
key: ccache-${{ runner.os }}-pytorch-${{ github.ref }}-${{ github.sha }}
restore-keys: |
ccache-${{ runner.os }}-pytorch-${{ github.ref }}-
ccache-${{ runner.os }}-pytorch-
- name: "Disk space check after dependencies and ccache"
run: df -lh
- name: 'Build'
run: docker exec builder bash -c 'pip install --no-build-isolation . -v --no-deps'
run: |
export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')")
export CUDNN_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn')")
export PATH=$CUDA_PATH/bin:$PATH
export NVTE_BUILD_USE_NVIDIA_WHEELS=1
NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v
env:
NVTE_FRAMEWORK: pytorch
MAX_JOBS: 1
MAX_JOBS: 2
CCACHE_DIR: /root/.ccache
CCACHE_MAXSIZE: 5G
- name: 'Sanity check'
run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py'
run: python3 tests/pytorch/test_sanity_import.py
jax:
name: 'JAX'
runs-on: ubuntu-latest
Expand All @@ -96,65 +89,65 @@ jobs:
options: --user root
steps:
- name: 'Dependencies'
run: pip install cmake==3.21.0 pybind11[global]
run: |
pip install cmake==3.21.0 pybind11[global]
apt-get update && apt-get install -y ccache
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive
- name: ccache
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
- uses: actions/cache@v4
with:
path: /root/.ccache
key: ccache-${{ runner.os }}-jax-${{ github.ref }}-${{ github.sha }}
restore-keys: |
ccache-${{ runner.os }}-jax-${{ github.ref }}-
ccache-${{ runner.os }}-jax-
- name: 'Build'
run: |
NVTE_CCACHE_BIN=sccache NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v
NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v
env:
NVTE_FRAMEWORK: jax
MAX_JOBS: 1
SCCACHE_GHA_ENABLED: "true"
CCACHE_DIR: /root/.ccache
CCACHE_MAXSIZE: 5G
- name: 'Sanity check'
run: python3 tests/jax/test_sanity_import.py
all:
name: 'All'
runs-on: ubuntu-latest
container:
image: ghcr.io/ptrendx/te_gha_all:latest
options: --user root
steps:
- name: Move /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"

- name: Maximize build space
uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
with:
root-reserve-mb: 5120
temp-reserve-mb: 32
swap-size-mb: 10240
remove-dotnet: 'true'
remove-android: 'true'
remove-haskell: 'true'
remove-codeql: 'true'
build-mount-path: '/var/lib/docker/'

- name: Restore /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"

- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive

- name: Start named container
run: |
docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d ghcr.io/nvidia/jax:jax sleep infinity

- name: 'Dependencies'
run: |
docker exec builder bash -c '\
pip install cmake==3.21.0 pybind11[global] einops onnxscript && \
pip install torch --no-cache-dir --index-url https://download.pytorch.org/whl/cu130
'
- name: "Disk space check after dependencies"
run: df -lh
- uses: actions/cache@v4
with:
path: /root/.ccache
key: ccache-${{ runner.os }}-all-${{ github.ref }}-${{ github.sha }}
restore-keys: |
ccache-${{ runner.os }}-all-${{ github.ref }}-
ccache-${{ runner.os }}-all-
- name: "Disk space check after dependencies and ccache"
run: df -lh
- name: 'Build'
run: docker exec builder bash -c 'pip install --no-cache-dir --no-build-isolation . -v --no-deps'
run: |
export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')")
export CUDNN_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn')")
export PATH=$CUDA_PATH/bin:$PATH
export NVTE_BUILD_USE_NVIDIA_WHEELS=1
NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v
env:
NVTE_FRAMEWORK: all
MAX_JOBS: 1
- name: 'Sanity check'
run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py'
MAX_JOBS: 2
CCACHE_DIR: /root/.ccache
CCACHE_MAXSIZE: 5G
- name: 'Sanity check (pytorch)'
run: python3 tests/pytorch/test_sanity_import.py
- name: 'Sanity check (jax)'
run: python3 tests/jax/test_sanity_import.py
31 changes: 31 additions & 0 deletions build_tools/ci/Dockerfile.all
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
FROM ubuntu:24.04

# Container dependencies
RUN apt-get update && apt-get install -y git python3 python3-pip ccache

ENV PIP_BREAK_SYSTEM_PACKAGES=1

# Python build dependencies
RUN pip install cmake ninja pybind11 numpy packaging

# PyTorch (CUDA 13.0)
RUN pip install torch --index-url https://download.pytorch.org/whl/cu130

# JAX with CUDA 13 support
RUN pip install "jax[cuda13]"

# NVIDIA CUDA toolkit wheels
RUN pip install \
    "nvidia-cuda-nvcc<13.1" \
    "nvidia-cuda-cccl<13.1" \
    "nvidia-cuda-crt<13.1" \
    "nvidia-nvvm<13.1" \
    "nvidia-cuda-profiler-api<13.1" \
    "nvidia-nvml-dev<13.1"

# Create unversioned symlinks for CUDA libraries so the linker can resolve
# -lcudart / -lcublas / -lnccl (the pip wheels only ship versioned filenames).
RUN CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") && \
    ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so && \
    ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so && \
    ln -s $CUDA_PATH/../nccl/lib/libnccl.so.2 $CUDA_PATH/../nccl/lib/libnccl.so && \
    ln -s $CUDA_PATH/lib $CUDA_PATH/lib64

# Same for cuDNN: the wheel (pulled in as a torch dependency) ships only
# libcudnn.so.9, but the build links -lcudnn and needs an unversioned alias.
RUN CUDNN_LIB=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn/lib')") && \
    ln -s $CUDNN_LIB/libcudnn.so.9 $CUDNN_LIB/libcudnn.so
28 changes: 28 additions & 0 deletions build_tools/ci/Dockerfile.pytorch
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
FROM ubuntu:24.04

# Container dependencies
RUN apt-get update && apt-get install -y git python3 python3-pip ccache

ENV PIP_BREAK_SYSTEM_PACKAGES=1

# Python build dependencies
RUN pip install cmake ninja pybind11 numpy packaging

# PyTorch (CUDA 13.0)
RUN pip install torch --index-url https://download.pytorch.org/whl/cu130

# NVIDIA CUDA toolkit wheels
RUN pip install \
    "nvidia-cuda-nvcc<13.1" \
    "nvidia-cuda-cccl<13.1" \
    "nvidia-cuda-crt<13.1" \
    "nvidia-nvvm<13.1" \
    "nvidia-cuda-profiler-api<13.1" \
    "nvidia-nvml-dev<13.1"

# Create unversioned symlinks for CUDA libraries so the linker can resolve
# -lcudart / -lcublas / -lnccl (the pip wheels only ship versioned filenames).
RUN CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") && \
    ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so && \
    ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so && \
    ln -s $CUDA_PATH/../nccl/lib/libnccl.so.2 $CUDA_PATH/../nccl/lib/libnccl.so && \
    ln -s $CUDA_PATH/lib $CUDA_PATH/lib64

# Same for cuDNN: the wheel (pulled in as a torch dependency) ships only
# libcudnn.so.9, but the build links -lcudnn and needs an unversioned alias.
RUN CUDNN_LIB=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn/lib')") && \
    ln -s $CUDNN_LIB/libcudnn.so.9 $CUDNN_LIB/libcudnn.so
Comment on lines +22 to +28
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing libcudnn.so unversioned symlink

The symlink block creates unversioned aliases for libcudart, libcublas, and libnccl (needed for -lcudart, -lcublas, -lnccl at link time), but no equivalent symlink for libcudnn.so. The logging.h header unconditionally includes <cudnn.h>, and the build links against cudnn. If the cudnn pip wheel (typically pulled in as a torch dependency) only ships a versioned filename such as libcudnn.so.9, the linker will fail to resolve -lcudnn without an unversioned symlink.

Consider adding a similar symlink for cudnn, e.g.:

CUDNN_LIB=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn/lib')") && \
    ln -s $CUDNN_LIB/libcudnn.so.9 $CUDNN_LIB/libcudnn.so

The same applies to Dockerfile.all.

11 changes: 10 additions & 1 deletion build_tools/jax.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@

import setuptools

from .utils import get_cuda_include_dirs, all_files_in_dir, debug_build_enabled
from .utils import (
get_cuda_include_dirs,
all_files_in_dir,
debug_build_enabled,
get_cuda_library_dirs,
)
from typing import List


Expand Down Expand Up @@ -92,6 +97,9 @@ def setup_jax_extension(
]
)

# Library dirs
library_dirs = get_cuda_library_dirs()

# Compile flags
cxx_flags = ["-O3"]
if debug_build_enabled():
Expand All @@ -109,4 +117,5 @@ def setup_jax_extension(
include_dirs=[str(path) for path in include_dirs],
extra_compile_args=cxx_flags,
libraries=["nccl"],
library_dirs=[str(path) for path in library_dirs],
)
34 changes: 34 additions & 0 deletions build_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,33 @@ def get_cuda_include_dirs() -> Tuple[str, str]:
]


@functools.lru_cache(maxsize=None)
def get_cuda_library_dirs() -> List[Path]:
    """Return the directories containing CUDA libraries from pip wheels.

    Returns an empty list when a system CUDA toolkit is detected (all
    libraries are then bundled inside the top-level CUDA directory), unless
    ``NVTE_BUILD_USE_NVIDIA_WHEELS=1`` forces use of the pip wheels.

    Returns:
        List of ``<wheel>/lib`` directories under the ``nvidia`` namespace
        package, one per installed CUDA component wheel.

    Raises:
        RuntimeError: If the pip wheels are required but the ``nvidia``
            namespace package is not installed.
    """
    force_wheels = bool(int(os.getenv("NVTE_BUILD_USE_NVIDIA_WHEELS", "0")))
    # If CUDA is installed via the toolkit, all libraries are bundled
    # inside the top-level CUDA directory, so no extra dirs are needed.
    if not force_wheels and cuda_toolkit_include_path() is not None:
        return []

    # Use pip wheels to include all libraries.
    try:
        import nvidia
    except ModuleNotFoundError as e:
        # Chain the original error so the missing-module cause is visible.
        raise RuntimeError("CUDA not found.") from e

    # A regular package exposes __file__; a namespace package only __path__.
    if nvidia.__file__ is not None:
        cuda_root = Path(nvidia.__file__).parent
    else:
        cuda_root = Path(nvidia.__path__[0])  # namespace package
    return [
        subdir / "lib"
        for subdir in cuda_root.iterdir()
        if subdir.is_dir() and (subdir / "lib").is_dir()
    ]


@functools.lru_cache(maxsize=None)
def cuda_archs() -> str:
archs = os.getenv("NVTE_CUDA_ARCHS")
Expand Down Expand Up @@ -292,6 +319,13 @@ def cuda_version() -> Tuple[int, ...]:
version_str = get_version("nvidia-cuda-runtime-cu12")
version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit())
return version_tuple
except:
pass
Comment on lines +322 to +323
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bare except: swallows all exceptions

The bare except: clause silently catches everything including KeyboardInterrupt, SystemExit, and MemoryError, making debugging harder if the failure isn't PackageNotFoundError. It should be narrowed to the expected exception type:

Suggested change
except:
pass
except importlib.metadata.PackageNotFoundError:
pass


try:
version_str = get_version("nvidia-cuda-runtime")
version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit())
return version_tuple
except importlib.metadata.PackageNotFoundError:
raise RuntimeError("Could neither find NVCC executable nor CUDA runtime Python package.")

Expand Down
4 changes: 2 additions & 2 deletions transformer_engine/common/util/logging.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
#include <cudnn.h>
#include <nvrtc.h>

#include "nccl.h"

#ifdef NVTE_WITH_CUBLASMP
#include <cublasmp.h>

#include "nccl.h"
#endif // NVTE_WITH_CUBLASMP

#include <iostream>
Expand Down
1 change: 1 addition & 0 deletions transformer_engine/jax/csrc/extensions/cgemm_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#ifndef TRANSFORMER_ENGINE_JAX_CGEMM_HELPER_H_
#define TRANSFORMER_ENGINE_JAX_CGEMM_HELPER_H_

#include <nccl.h>
#include <unistd.h>

#include <chrono>
Expand Down
Loading