Skip to content
Draft
5 changes: 5 additions & 0 deletions benchmarks/autosp/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
*.log
*.pyc
logs
*.
*.pt
52 changes: 52 additions & 0 deletions benchmarks/autosp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# AutoSP Setup Guide
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a brief description of the files in this folder


Quick-start guide for setting up AutoSP and running its benchmarks.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Include a brief description that this example demonstrates usage in HF accelerate.


### Install dependencies

```bash
pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
```

```bash
pip install \
transformers==4.50.3 \
tokenizers \
huggingface-hub \
safetensors \
datasets \
accelerate \
scipy \
tqdm \
pyyaml
```

### Install DeepSpeed

```bash
pip install --no-build-isolation git+https://github.com/neeldani/DeepSpeed.git@autosp
```

## Benchmarking

See the `benchmarks/autosp/` directory for the benchmarking scripts:

```bash
cd benchmarks/autosp
```

#### Run AutoSP on 2 GPUs
```bash
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Include screenshot/snippet of expected output, this is helpful for users.

./run_autosp.sh --compile autosp --batch-size 1 --seq-length 64 --sp-size 2 --num-layers 1 --steps 1 --deterministic
```

#### Run eager-mode Ulysses on 2 GPUs
```bash
./run_autosp.sh --compile eager --batch-size 1 --seq-length 64 --sp-size 2 --num-layers 1 --steps 1 --deterministic
```

#### Run torch.compile'd Ulysses on 2 GPUs
```bash
./run_autosp.sh --compile compile --batch-size 1 --seq-length 64 --sp-size 2 --num-layers 1 --steps 1 --deterministic
```
21 changes: 21 additions & 0 deletions benchmarks/autosp/configs/autosp_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{

"bf16": {
"enabled": true
},

"zero_optimization": {
"stage": 0
},
"compile": {
"deepcompile": true,
"passes": ["autosp"]
},
"gradient_accumulation_steps": 1,
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": 1,
"train_micro_batch_size_per_gpu": 1,
"wall_clock_breakdown": false,
"sequence_parallel_size": 2
}
16 changes: 16 additions & 0 deletions benchmarks/autosp/configs/autosp_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
deepspeed_multinode_launcher: standard
deepspeed_config_file: configs/autosp_config.json
distributed_type: DEEPSPEED
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
14 changes: 14 additions & 0 deletions benchmarks/autosp/configs/torchcompile_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"bf16": {
"enabled": true
},
"zero_optimization":{
"stage": 0
},
"gradient_accumulation_steps": 1,
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
16 changes: 16 additions & 0 deletions benchmarks/autosp/configs/torchcompile_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
deepspeed_multinode_launcher: standard
deepspeed_config_file: configs/torchcompile_config.json
distributed_type: DEEPSPEED
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
137 changes: 137 additions & 0 deletions benchmarks/autosp/correctness/correctness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#!/bin/bash

# Correctness test suite for autosp vs baseline compiled DS-Ulysses.
#
# For each (sp_size, dp_size) x zero_stage configuration:
# 1. Runs baseline (--compile compile) for N steps
# 2. Runs autosp (--compile autosp) for N steps
# 3. Compares per-rank losses with validator.py
#
# Usage:
# ./correctness.sh # Default configs
# ./correctness.sh 2,1 2,2 4,1 # Custom sp,dp pairs

# Absolute path of the directory holding this script.
# CDPATH is cleared and cd's stdout is discarded so that a user-defined CDPATH
# cannot make cd print an extra path into the captured value. A failure is
# fatal: an empty SCRIPT_DIR would turn OUTPUT_DIR into "/output", which the
# EXIT-time cleanup removes with rm -rf.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null && pwd)" || {
  echo "Error: unable to determine the script directory" >&2
  exit 1
}
# Scratch directory that receives the per-test JSON result files.
OUTPUT_DIR="${SCRIPT_DIR}/output"
# Number of training steps handed to every run through --steps.
STEPS=5

#######################################
# Removes the scratch output directory and everything below it.
# Globals:   OUTPUT_DIR (read)
# Arguments: none
#######################################
cleanup() {
  # ':?' aborts rather than running rm -rf on an empty or unset path, and '--'
  # ends option parsing so the path can never be taken for a flag.
  rm -rf -- "${OUTPUT_DIR:?OUTPUT_DIR is not set}"
}
# Run cleanup whenever the script exits.
trap cleanup EXIT

# sp,dp pairs to test, filled from the positional args (e.g. 2,1 2,2 4,1)
declare -a CONFIGS=()

#######################################
# Appends the requested sp,dp pairs to CONFIGS; with no arguments the default
# matrix is used instead.
# Globals:   CONFIGS (written)
# Arguments: zero or more "sp,dp" pairs, each two comma-separated digit runs
# Outputs:   an error message on stderr for a malformed pair
# Returns:   0 on success; exits the script with status 1 on a malformed pair
#######################################
parse_configs() {
  local arg

  if (( $# == 0 )); then
    CONFIGS=("1,1" "2,1" "4,1" "8,1")
    return 0
  fi

  for arg in "$@"; do
    if [[ ! "${arg}" =~ ^[0-9]+,[0-9]+$ ]]; then
      # Diagnostics belong on stderr so they never mix with the test report.
      echo "Error: invalid config '${arg}'. Expected format: sp,dp (e.g. 2,1)" >&2
      exit 1
    fi
    CONFIGS+=("${arg}")
  done
}

parse_configs "$@"

# Values passed as --zero-stage; each sp,dp pair is run once per entry.
ZERO_STAGES=(0 1)

# Running tallies, plus one report line per finished test.
PASS_COUNT=0 FAIL_COUNT=0 TOTAL_COUNT=0
RESULTS=()

# Print the test matrix up front, before any run starts.
printf '%s\n' \
  "" \
  "================================================================" \
  " AutoSP Correctness Test Suite" \
  "================================================================" \
  " Configs (sp,dp): ${CONFIGS[*]}" \
  " Zero stages: ${ZERO_STAGES[*]}" \
  " Steps: ${STEPS}" \
  " Output dir: ${OUTPUT_DIR}" \
  "================================================================" \
  ""

#######################################
# Records a passing test.
# Globals:   RESULTS (appended), PASS_COUNT (incremented)
# Arguments: $1 - test name
#######################################
record_pass() {
  RESULTS+=(" ${1}: PASS")
  # Plain assignment instead of ((PASS_COUNT++)): the post-increment form
  # returns status 1 when the old value is 0.
  PASS_COUNT=$((PASS_COUNT + 1))
}

#######################################
# Records a failing test.
# Globals:   RESULTS (appended), FAIL_COUNT (incremented)
# Arguments: $1 - test name
#            $2 - optional reason, shown in parentheses after FAIL
#######################################
record_fail() {
  local name=$1
  local reason=${2:-}

  if [[ -n "${reason}" ]]; then
    RESULTS+=(" ${name}: FAIL (${reason})")
  else
    RESULTS+=(" ${name}: FAIL")
  fi
  FAIL_COUNT=$((FAIL_COUNT + 1))
}

#######################################
# Runs correctness_run.py once for the given compile mode.
# Globals:   SCRIPT_DIR (read), STEPS (read)
# Arguments: $1 - value for --compile (compile or autosp)
#            $2 - sp size, $3 - dp size, $4 - zero stage
#            $5 - path of the JSON file the run should write
# Returns:   the exit status of python3
#######################################
run_training() {
  local compile_mode=$1
  local sp=$2
  local dp=$3
  local zero=$4
  local output_file=$5

  python3 "${SCRIPT_DIR}/correctness_run.py" \
    --compile "${compile_mode}" \
    --sp-size "${sp}" \
    --dp-size "${dp}" \
    --zero-stage "${zero}" \
    --steps "${STEPS}" \
    --output-file "${output_file}"
}

#######################################
# Runs one baseline-vs-autosp comparison and records its outcome.
# Globals:   OUTPUT_DIR, SCRIPT_DIR (read); TOTAL_COUNT (incremented);
#            RESULTS, PASS_COUNT, FAIL_COUNT (via record_pass/record_fail)
# Arguments: $1 - sp size, $2 - dp size, $3 - zero stage
# Outputs:   progress lines on stdout
# Returns:   0 always; the outcome is reported through the counters
#######################################
run_test_case() {
  local sp=$1
  local dp=$2
  local zero=$3
  local test_name="sp${sp}_dp${dp}_zero${zero}"
  local test_dir="${OUTPUT_DIR}/${test_name}"

  TOTAL_COUNT=$((TOTAL_COUNT + 1))

  echo "----------------------------------------------------------------"
  echo " Test: sp_size=${sp}, dp_size=${dp}, zero_stage=${zero}"
  echo "----------------------------------------------------------------"

  # Without the directory neither run can write its JSON file, so report that
  # directly instead of letting it surface as a training error.
  if ! mkdir -p -- "${test_dir}"; then
    echo " FAIL: Could not create output directory ${test_dir}"
    record_fail "${test_name}" "output directory error"
    echo ""
    return 0
  fi

  # --- Baseline (compiled DS-Ulysses) ---
  echo " [1/3] Running baseline (--compile compile) ..."
  if ! run_training compile "${sp}" "${dp}" "${zero}" "${test_dir}/baseline.json"; then
    echo " FAIL: Baseline training failed"
    record_fail "${test_name}" "baseline training error"
    echo ""
    return 0
  fi

  # --- AutoSP ---
  echo " [2/3] Running autosp (--compile autosp) ..."
  if ! run_training autosp "${sp}" "${dp}" "${zero}" "${test_dir}/autosp.json"; then
    echo " FAIL: AutoSP training failed"
    record_fail "${test_name}" "autosp training error"
    echo ""
    return 0
  fi

  # --- Validate ---
  echo " [3/3] Validating per-rank losses ..."
  if python3 "${SCRIPT_DIR}/validator.py" \
    --baseline "${test_dir}/baseline.json" \
    --autosp "${test_dir}/autosp.json"; then
    record_pass "${test_name}"
  else
    record_fail "${test_name}"
  fi

  echo ""
}

# Every sp,dp pair is exercised once per zero stage.
for config in "${CONFIGS[@]}"; do
  sp_size="${config%%,*}"
  dp_size="${config##*,}"

  for zero_stage in "${ZERO_STAGES[@]}"; do
    run_test_case "${sp_size}" "${dp_size}" "${zero_stage}"
  done
done

# ---- Summary ----
# One line per recorded result, then the pass/fail totals.
printf '%s\n' \
  "================================================================" \
  " SUMMARY" \
  "================================================================"
if (( ${#RESULTS[@]} > 0 )); then
  printf '%s\n' "${RESULTS[@]}"
fi
printf '%s\n' \
  "" \
  " Passed: ${PASS_COUNT}/${TOTAL_COUNT} Failed: ${FAIL_COUNT}/${TOTAL_COUNT}" \
  "================================================================"

# The exit status mirrors the outcome: 1 if any test failed, 0 otherwise.
if (( FAIL_COUNT > 0 )); then
  exit 1
fi
exit 0
Loading