47 commits
7ff51dc
add workflow to run full tests
tohtana Jan 17, 2026
9b556a5
Test with -n 1 to debug parallel execution issues
tohtana Jan 17, 2026
96d3674
Merge: set TORCH_CUDA_ARCH_LIST=8.9 and use -n 1 for debugging
tohtana Jan 17, 2026
e827532
Fix bf16 checkpoint optimizer state and muon test
tohtana Jan 17, 2026
fa213d0
Skip aio tests that hang in CI environment
tohtana Jan 17, 2026
9195771
Skip more hanging ops tests in CI
tohtana Jan 17, 2026
0e52d3a
Fix ulysses PEFT test to use mpu object instead of global groups
tohtana Jan 17, 2026
b77f9c8
Skip pipeline parallelism tests that timeout in CI
tohtana Jan 17, 2026
29eff21
Skip CPU adam tests that timeout in CI
tohtana Jan 17, 2026
e0e1cab
Skip zenflow tests that timeout in CI
tohtana Jan 17, 2026
70d90c5
Skip pipeline checkpoint tests that timeout in CI
tohtana Jan 17, 2026
5af1f37
Skip test_multiple_models.py that timeouts
tohtana Jan 17, 2026
a3a9f8e
Run tests in parallel with -n 4 instead of sequential
tohtana Jan 17, 2026
02d5ed7
Skip onebit tests that timeout with pipeline config
tohtana Jan 17, 2026
0fd0a10
Skip test_ds_initialize.py tests that timeout
tohtana Jan 17, 2026
a6fb0cb
Skip test_zero_leaf_module.py tests that timeout
tohtana Jan 17, 2026
35ec1e2
Skip test_zero_tensor_fragment.py tests that timeout
tohtana Jan 17, 2026
78d0580
Skip test_mup_optimizers.py tests that timeout
tohtana Jan 17, 2026
6077494
Skip test_user_args.py shell quoting edge cases
tohtana Jan 17, 2026
ad2a74d
Skip nvme checkpointing tests (no nvme device in CI)
tohtana Jan 17, 2026
bcdfe3d
Enable async I/O tests with DS_DISABLE_REUSE_DIST_ENV
tohtana Jan 17, 2026
ee61faa
Remove test ignores to validate DS_DISABLE_REUSE_DIST_ENV fix
tohtana Jan 17, 2026
bfa4831
Fix: Use /mnt/aio/pytest subdirectory for basetemp
tohtana Jan 17, 2026
6b8290a
fix(pipeline): set _running_engine_backward for non-last stage backward
tohtana Jan 17, 2026
3d137c6
Skip GDS tests in CI (no GPUDirect Storage hardware)
tohtana Jan 18, 2026
30a80d0
Install pdsh for launcher tests
tohtana Jan 18, 2026
c6e6008
Add pdsh, skip zenflow tests (timeout)
tohtana Jan 18, 2026
dfc7834
fix: BF16_Optimizer selection and compatibility issues
tohtana Jan 18, 2026
505ffa6
fix: skip empty parameters in gradient reduction
tohtana Jan 18, 2026
a02bc6e
fix(test): add bf16 model with fp32 grad_accum to supported configs
tohtana Jan 18, 2026
12a6e95
ci: increase parallel test workers to 8
tohtana Jan 18, 2026
121c7e0
ci: enable zenflow tests
tohtana Jan 18, 2026
5daced1
ci: skip launcher tests requiring SSH
tohtana Jan 18, 2026
274d361
Skip zenflow tests due to pre-existing Stage 3 bugs
tohtana Jan 18, 2026
ba296eb
Skip ZenFlow torch adam test (CUDA/fork incompatibility)
tohtana Jan 18, 2026
c993e84
Mark manual dist init tests as sequential to avoid port conflicts
tohtana Jan 18, 2026
ded0436
Add debug test for RowParallel numerical differences
tohtana Jan 18, 2026
ee5e166
Update debug workflow to run testRowParallel with multiple seeds and …
tohtana Jan 18, 2026
248cfd1
Debug: Run testRowParallel and sequential tests with multiple seeds
tohtana Jan 18, 2026
667157f
Debug: Fix testRowParallel selection and use assert_close for diagnos…
tohtana Jan 18, 2026
7a5eebd
Debug: Also update testColumnParallel to use assert_close
tohtana Jan 18, 2026
b9a5e99
Fix autoTP test numerical tolerance with assert_close
tohtana Jan 18, 2026
0675504
Fix Evoformer compilation (#7760)
sdvillal Jan 18, 2026
a4500e6
fix format
tohtana Jan 18, 2026
e668921
Temp: Add CUTLASS and run only Evoformer tests
tohtana Jan 18, 2026
f61ed51
Fix: Remove --forked from Evoformer test to avoid CUDA fork issue
tohtana Jan 18, 2026
dca165b
Add CUTLASS support and mark Evoformer test as sequential
tohtana Jan 18, 2026
106 changes: 106 additions & 0 deletions .github/workflows/aws-torch-latest-full.yml
@@ -0,0 +1,106 @@
################################################################################
# DeepSpeed CI - AWS L40S GPU Tests (PyTorch Latest - Full Unit Tests)
#
# Migrated from nv-torch-latest-v100.yml which ran on deprecated V100 cluster.
# Runs the full unit test suite (tests/unit/) on AWS self-hosted runners.
# Manual trigger only (workflow_dispatch).
################################################################################

name: aws-torch-latest-full

on:
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
name: Unit Tests (Full)
runs-on: [self-hosted, gpu-ci, gpu-l40s, l40s-4gpu, aws]
timeout-minutes: 60

container:
image: nvidia/cuda:12.6.3-devel-ubuntu22.04
# Mount /mnt/aio for async I/O tests (O_DIRECT requires native filesystem, not overlayfs)
options: --gpus all --shm-size "32G" -v /mnt/aio:/mnt/aio

env:
TORCH_VER: "2.7"
CUDA_VER: "12.6"
# Disable reuse_dist_env to prevent pool worker cleanup hangs in full test runs
DS_DISABLE_REUSE_DIST_ENV: "1"

steps:
- name: Install system dependencies
run: |
apt-get update && apt-get install -y git git-lfs libaio-dev python3 python3-pip
git lfs install
ln -sf /usr/bin/python3 /usr/bin/python

- name: Checkout repository
uses: actions/checkout@v4
with:
lfs: true

- name: Install PyTorch
run: |
pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
git checkout 981c276
git rev-parse --short HEAD
pip install .

- name: Install Python dependencies
run: |
pip install --upgrade pip
pip install -r requirements/requirements.txt
pip install -r requirements/requirements-dev.txt
pip install -r requirements/requirements-deepcompile.txt
pip install pytest-timeout pytest-instafail

- name: Check environment
run: |
echo "=== GPU Information ==="
nvidia-smi
echo ""
echo "=== CUDA Version ==="
nvcc --version
echo ""
echo "=== Python/PyTorch Info ==="
python --version
python -c "import torch; print(f'PyTorch: {torch.__version__}')"
python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
python -c "import torch; print(f'CUDA devices: {torch.cuda.device_count()}')"
python -c "import torch; print(f'BF16 support: {torch.cuda.is_bf16_supported()}')"

- name: Install DeepSpeed
run: |
# Initialize CUDA before install so setup.py can detect NCCL version
python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
# Use --no-build-isolation so setup.py can access pre-installed PyTorch
pip install --no-build-isolation .[dev,1bit,autotuning,deepcompile]
ds_report
# Debug: Check captured torch_info values
python -c "from deepspeed.git_version_info import torch_info; print(f'torch_info: {torch_info}')"

- name: Python environment
run: |
pip list

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
# Use -n 1 to run tests sequentially to avoid parallel execution issues
# Use /mnt/aio/pytest as basetemp for O_DIRECT support in aio tests
rm -rf /mnt/aio/pytest
pytest -x --instafail --timeout 600 --forked -n 1 --basetemp=/mnt/aio/pytest unit/ --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
rm -rf /mnt/aio/pytest
pytest --instafail --timeout 600 --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/ --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
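For reference, the two-pass strategy above (one main run, then a separate pass for tests marked `sequential`) can be approximated locally. This is a minimal sketch, assuming the same `/mnt/aio` mount for O_DIRECT support and the pytest-xdist/pytest-forked plugins from requirements-dev:

```shell
# Sketch of a local reproduction of the CI two-pass test strategy
# (assumes a native filesystem mounted at /mnt/aio, as in the workflow).
export DS_DISABLE_REUSE_DIST_ENV=1   # avoid pool-worker cleanup hangs
cd tests
rm -rf /mnt/aio/pytest
pytest --forked -n 1 --basetemp=/mnt/aio/pytest unit/
rm -rf /mnt/aio/pytest
pytest --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/
```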
64 changes: 55 additions & 9 deletions .github/workflows/aws-torch-latest.yml
@@ -42,24 +42,28 @@ jobs:
- '!tests/unit/inference/v2/**'

unit-tests:
name: Unit Tests (V1)
name: Unit Tests (Full)
needs: check-paths
if: needs.check-paths.outputs.should_run == 'true'
runs-on: [self-hosted, gpu-ci, gpu-l40s, l40s-4gpu, aws]
timeout-minutes: 60
timeout-minutes: 180

container:
image: nvidia/cuda:12.6.3-devel-ubuntu22.04
options: --gpus all --shm-size "32G"
# Mount /mnt/aio for async I/O tests (O_DIRECT requires native filesystem, not overlayfs)
options: --gpus all --shm-size "32G" -v /mnt/aio:/mnt/aio

env:
TORCH_VER: "2.7"
CUDA_VER: "12.6"
CUTLASS_PATH: /opt/cutlass
# Disable reuse_dist_env to prevent pool worker cleanup hangs in full test runs
DS_DISABLE_REUSE_DIST_ENV: "1"

steps:
- name: Install system dependencies
run: |
apt-get update && apt-get install -y git git-lfs libaio-dev python3 python3-pip
apt-get update && apt-get install -y git git-lfs libaio-dev pdsh python3 python3-pip
git lfs install
ln -sf /usr/bin/python3 /usr/bin/python

@@ -68,16 +72,30 @@ jobs:
with:
lfs: true

- name: Install CUTLASS
run: |
git clone --depth 1 --branch v3.5.1 https://github.com/NVIDIA/cutlass.git /opt/cutlass
echo "CUTLASS installed at /opt/cutlass"
ls -la /opt/cutlass/include/ | head -10

- name: Install PyTorch
run: |
pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
git checkout 981c276
pip install .

- name: Install Python dependencies
run: |
pip install --upgrade pip
pip install -r requirements/requirements.txt
pip install -r requirements/requirements-dev.txt
pip install -r requirements/requirements-deepcompile.txt
pip install pytest-timeout pytest-instafail

- name: Check environment
run: |
@@ -93,17 +111,45 @@
python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
python -c "import torch; print(f'CUDA devices: {torch.cuda.device_count()}')"
python -c "import torch; print(f'BF16 support: {torch.cuda.is_bf16_supported()}')"
echo ""
echo "=== CUTLASS ==="
echo "CUTLASS_PATH: $CUTLASS_PATH"
ls -la $CUTLASS_PATH/include/ | head -5

- name: Install DeepSpeed
run: |
# Initialize CUDA before install so setup.py can detect NCCL version
python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
# Use --no-build-isolation so setup.py can access pre-installed PyTorch
pip install --no-build-isolation .
pip install --no-build-isolation .[dev,1bit,autotuning,deepcompile]
ds_report
# Debug: Check captured torch_info values
python -c "from deepspeed.git_version_info import torch_info; print(f'torch_info: {torch_info}')"

- name: Run unit tests
- name: Python environment
run: |
pip list

- name: Unit tests
run: |
pytest -n 4 --forked --verbose tests/unit/v1/ --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
export TORCH_CUDA_ARCH_LIST="8.9"
cd tests
# Skip tests requiring unavailable hardware or known issues:
# - nvme checkpointing: no nvme device
# - GDS tests: no GPUDirect Storage support
# - launcher user_args: pdsh requires SSH server
# - zenflow: Stage 3 tests have pre-existing bugs + CUDA/fork issues
rm -rf /mnt/aio/pytest
pytest --instafail --timeout 600 --forked -n 8 --basetemp=/mnt/aio/pytest unit/ \
--ignore=unit/runtime/zero/test_nvme_checkpointing.py \
--ignore=unit/ops/aio/test_gds.py \
--ignore=unit/launcher/test_user_args.py \
--ignore=unit/runtime/zenflow \
--ignore=unit/ops/adam/test_zf_torch_adam.py \
--torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
rm -rf /mnt/aio/pytest
pytest --instafail --timeout 600 --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/ \
--ignore=unit/runtime/zero/test_nvme_checkpointing.py \
--ignore=unit/ops/aio/test_gds.py \
--ignore=unit/launcher/test_user_args.py \
--ignore=unit/runtime/zenflow \
--ignore=unit/ops/adam/test_zf_torch_adam.py \
--torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}
4 changes: 3 additions & 1 deletion deepspeed/runtime/base_optimizer.py
@@ -110,7 +110,9 @@ def scale_if_loss(self, value: Any) -> Any:
return self.external_loss_scale * value
if self.torch_autocast_gradscaler:
return self.torch_autocast_gradscaler.scale(value)
return self.loss_scaler.scale_loss(value)
# Only call loss_scaler if it exists (not present in BF16_Optimizer)
if hasattr(self, 'loss_scaler') and self.loss_scaler is not None:
return self.loss_scaler.scale_loss(value)

return value

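The effect of the new guard is that optimizers without a loss scaler (such as `BF16_Optimizer`) now return the loss unchanged instead of raising `AttributeError`. A standalone sketch of the fall-through; the class below is illustrative, not DeepSpeed code:

```python
# Illustrative stand-in for an optimizer with no loss scaling (e.g. bf16),
# mirroring the guarded fall-through added to scale_if_loss above.
class _NoScalerOptimizer:
    custom_loss_scaler = False
    torch_autocast_gradscaler = None
    loss_scaler = None  # bf16 training does no loss scaling

    def scale_if_loss(self, value):
        if self.torch_autocast_gradscaler:
            return self.torch_autocast_gradscaler.scale(value)
        if getattr(self, 'loss_scaler', None) is not None:
            return self.loss_scaler.scale_loss(value)
        return value  # no scaler present: pass the loss through unchanged

assert _NoScalerOptimizer().scale_if_loss(1.25) == 1.25
```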
5 changes: 5 additions & 0 deletions deepspeed/runtime/bf16_optimizer.py
@@ -59,6 +59,11 @@ def __init__(self,
], f"BF16Optimizer: Unsupported gradient accumulation data type: {grad_acc_dtype}"
self.grad_acc_dtype = grad_acc_dtype

# BF16 doesn't use loss scaling, but these attributes are needed for API compatibility
self.custom_loss_scaler = False
self.external_loss_scale = None
self.torch_autocast_gradscaler = None

self.immediate_grad_update = bfloat16_config.immediate_grad_update

self.clip_grad = clip_grad
4 changes: 3 additions & 1 deletion deepspeed/runtime/constants.py
@@ -144,7 +144,9 @@
BFLOAT16_OPTIMIZER_STATES_DEFAULT = False

# DDP variant of BFLOAT16
DDP_BFLOAT16 = "bf16"
# DDP variant: bf16 model with bf16 grad accumulation (uses FP16_Optimizer in bf16 mode)
# Must be different from BFLOAT16 to allow proper optimizer selection
DDP_BFLOAT16 = "ddp_bf16"

#########################################
# FP16 support
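For context, the two bf16 modes the comment distinguishes map to different optimizer paths. A hypothetical config fragment for the `BFLOAT16` path is sketched below; the `data_types.grad_accum_dtype` key is assumed from DeepSpeed's config schema:

```python
# Hypothetical DeepSpeed config selecting the BF16_Optimizer path:
# bf16 weights with fp32 gradient accumulation. With grad_accum_dtype
# set to "bf16" instead, the engine takes the DDP_BFLOAT16 path
# (FP16_Optimizer running in bf16 mode), which the renamed constant
# now keeps distinct from BFLOAT16.
ds_config = {
    "train_batch_size": 8,
    "bf16": {"enabled": True},
    "data_types": {"grad_accum_dtype": "fp32"},
}
```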
9 changes: 7 additions & 2 deletions deepspeed/runtime/engine.py
@@ -2934,6 +2934,11 @@ def _get_gradients_for_reduction(self):
if not param.requires_grad:
continue

# Skip empty parameters (numel=0) as they contribute nothing to gradient reduction
# and cause issues with flatten/unflatten operations
if param.numel() == 0:
continue

if param.grad is None:
# In cases where there is an imbalance of empty grads across
# ranks we must create empty grads, this will ensure that every
@@ -3414,7 +3419,7 @@ def _load_checkpoint(self,
if self.optimizer is not None and hasattr(self.optimizer, 'refresh_fp32_params'):
self.optimizer.refresh_fp32_params()
else:
has_zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled()
has_zero_optimizer_state = self.zero_optimization()
if load_optimizer_states and self.optimizer is not None and not has_zero_optimizer_state:
if self.has_moe_layers:
largest_group_name = groups._get_max_expert_size_name()
@@ -3883,7 +3888,7 @@ def _save_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_parame

save_path = self._get_ckpt_name(save_dir, tag)

zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled()
zero_optimizer_state = self.zero_optimization()

save_frozen_param = self.zero_optimization_partition_gradients() and not exclude_frozen_parameters

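To see why zero-element parameters are worth filtering, note that they contribute nothing to the flat buffer used for reduction, so skipping them leaves the result unchanged while avoiding flatten/unflatten edge cases. A small sketch using torch's internal flatten helper for illustration (the tensors are made up):

```python
import torch
from torch._utils import _flatten_dense_tensors

# Empty (numel == 0) tensors add nothing to the flattened buffer, so they
# can be dropped before flattening, as the gradient-reduction fix does.
grads = [torch.ones(3), torch.empty(0), torch.ones(2)]
non_empty = [g for g in grads if g.numel() > 0]
flat = _flatten_dense_tensors(non_empty)
assert flat.numel() == 5  # only the non-empty tensors are represented
```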
23 changes: 16 additions & 7 deletions deepspeed/runtime/pipe/engine.py
@@ -852,13 +852,22 @@ def _exec_backward_pass(self, buffer_id):
# manually call because we don't call optimizer.backward()
self.optimizer.clear_lp_grads()

# This handles either a single tensor or tuple of tensors.
if isinstance(outputs, tuple):
out_tensors = [t for t in outputs if t.is_floating_point()]
assert len(out_tensors) == len(grad_tensors)
torch.autograd.backward(tensors=out_tensors, grad_tensors=grad_tensors)
else:
torch.autograd.backward(tensors=(outputs, ), grad_tensors=(grad_tensors, ))
# Set _running_engine_backward to avoid RuntimeError in post-backward hook
# when needs_scaler=True (the hook checks this flag to skip error checking)
self._running_engine_backward = True
try:
# Use tensor.backward(gradient) style which is now supported by DeepSpeed.
# This properly integrates with DeepSpeed's hooks and loss scaling.
if isinstance(outputs, tuple):
out_tensors = [t for t in outputs if t.is_floating_point()]
assert len(out_tensors) == len(grad_tensors)
# For multiple tensors, use retain_graph for all but the last
for i, (out, grad) in enumerate(zip(out_tensors, grad_tensors)):
out.backward(gradient=grad, retain_graph=(i < len(out_tensors) - 1))
else:
outputs.backward(gradient=grad_tensors)
finally:
self._running_engine_backward = False

if self.using_bf16_optimizer and not self.is_last_stage():
# manually call because we don't call optimizer.backward()
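The per-tensor loop above relies on `retain_graph` because successive backward calls share parts of the same autograd graph, and the graph is freed after the first call unless retained. A standalone illustration (tensors and shapes are made up):

```python
import torch

# Two outputs share the intermediate node h; every backward call except
# the last must retain the graph, mirroring the loop in _exec_backward_pass.
x = torch.randn(4, requires_grad=True)
h = x * 2                      # shared intermediate node
outputs = [h + 1, h * 3]
grad_tensors = [torch.ones(4), torch.ones(4)]
for i, (out, grad) in enumerate(zip(outputs, grad_tensors)):
    # Without retain_graph on the first call, the second backward would
    # raise: "Trying to backward through the graph a second time".
    out.backward(gradient=grad, retain_graph=(i < len(outputs) - 1))
# Gradients accumulate: d(h+1)/dx + d(3h)/dx = 2 + 6 = 8 per element.
assert torch.allclose(x.grad, torch.full((4,), 8.0))
```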
9 changes: 8 additions & 1 deletion docs/_tutorials/ds4sci_evoformerattention.md
@@ -13,15 +13,22 @@ tags: training inference

### 3.1 Installation

`DS4Sci_EvoformerAttention` is released as part of DeepSpeed >= 0.10.3. `DS4Sci_EvoformerAttention` is implemented based on [CUTLASS](https://github.com/NVIDIA/cutlass). You need to clone the CUTLASS repository and specify the path to it in the environment variable `CUTLASS_PATH`.
`DS4Sci_EvoformerAttention` is released as part of DeepSpeed >= 0.10.3.

`DS4Sci_EvoformerAttention` is implemented based on [CUTLASS](https://github.com/NVIDIA/cutlass). You need to clone the CUTLASS repository and specify the path to it in the environment variable `CUTLASS_PATH`.
CUTLASS setup detection can be skipped by setting ```CUTLASS_PATH="DS_IGNORE_CUTLASS_DETECTION"```, which is useful if your compiler environment is already properly set up (e.g., building in a conda environment with CUTLASS and the CUDA compilers installed).
Alternatively, the CUTLASS location can be inferred automatically from PyPI's [nvidia-cutlass](https://pypi.org/project/nvidia-cutlass/) package by setting ```CUTLASS_PATH="DS_USE_CUTLASS_PYTHON_BINDINGS"```. Note that this is discouraged, as ```nvidia-cutlass``` is no longer maintained and is outdated.

You can always simply clone CUTLASS and set ```CUTLASS_PATH``` yourself:
```shell
git clone https://github.com/NVIDIA/cutlass
export CUTLASS_PATH=/path/to/cutlass
```
The kernels will be compiled when `DS4Sci_EvoformerAttention` is called for the first time.

`DS4Sci_EvoformerAttention` requires GPUs with compute capability 7.0 or higher (NVIDIA V100 or later) and a minimum CUDA version of 11.3; CUDA 11.7 or later is recommended for better performance. Note that the backward kernel currently performs worse on V100 than on A100.
The extension checks both requirements and fails if either is not met. To disable the check, for example when cross-compiling on a system without GPUs, set the environment variable ```DS_IGNORE_CUDA_DETECTION=TRUE```
together with ```DS_EVOFORMER_GPU_ARCH={70|75|80}```, which selects the target GPU architecture (80 being the highest supported, covering NVIDIA Ampere and later).
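For instance, a cross-compiling setup on a GPU-less build machine might export (values illustrative):

```shell
# Cross-compiling without GPUs present (illustrative values):
export DS_IGNORE_CUDA_DETECTION=TRUE
export DS_EVOFORMER_GPU_ARCH=80   # target NVIDIA Ampere and later
```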

### 3.2 Unit test and benchmark
