Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
6b57108
docs: add DeepseekV4Template for DeepSeek V4 chat template encoding
meichangsu1 Apr 28, 2026
a3d201b
wip
meichangsu1 Apr 29, 2026
6b3df6f
wip
meichangsu1 Apr 29, 2026
229d2e0
refactor: remove debug logging from expert parallel implementation
meichangsu1 May 7, 2026
0e00035
refactor: remove sync_after_backward mechanism from expert parallel
meichangsu1 May 8, 2026
03289b5
feat: add configurable distributed timeout via TWINKLE_DIST_TIMEOUT_S…
meichangsu1 May 11, 2026
26433b8
fix: patch accelerate FSDP2 state dict loading to handle unsharded bu…
meichangsu1 May 11, 2026
9e7efb0
fix: handle missing parameter in FSDP2 load state dict patch
meichangsu1 May 11, 2026
4464251
feat(cookbook): add environment variable parsing utilities and enhanc…
meichangsu1 May 11, 2026
715df92
feat: add FSDP debug logging and early return for CUDA devices
meichangsu1 May 11, 2026
7ef1499
wip
meichangsu1 May 11, 2026
d087a0e
feat: enhance FSDP debug logging with train_debug utility and detaile…
meichangsu1 May 11, 2026
d206828
wip
meichangsu1 May 11, 2026
cb86c20
wip
meichangsu1 May 11, 2026
694379e
refactor(debug): enhance debug logging with timestamps and file output
meichangsu1 May 11, 2026
43bad86
wip
meichangsu1 May 11, 2026
8605169
fix: improve FSDP2 and native FSDP pretrained loading with EP support
meichangsu1 May 11, 2026
f7f6179
fix: correct indentation and add debug logging for FSDP expert parall…
meichangsu1 May 11, 2026
291f342
feat(moe): add expert parallelism debug tracing
meichangsu1 May 11, 2026
e44eb8e
feat(moe): add debug tag to all-to-all operations for improved tracing
meichangsu1 May 11, 2026
c4a9bd8
fix: preserve autograd graph and fix dtype mismatch in EP and FSDP
meichangsu1 May 11, 2026
dce440a
wi p
meichangsu1 May 11, 2026
ee339b3
refactor(moe): remove debug tracing from EP utilities
meichangsu1 May 12, 2026
b0ef503
wip
meichangsu1 May 12, 2026
fcfea94
fix: optimize memory usage in FSDP expert parameter broadcasting
meichangsu1 May 12, 2026
b616a63
fix native fsdp
kevssim May 12, 2026
0905ebd
Merge remote-tracking branch 'origin/main' into ep_lora
kevssim May 15, 2026
9494caa
fix(ep): fall back to plural shared_experts for dsv4 compatibility
kevssim May 15, 2026
c71a238
test(ep-lora): add P0 spike for ParamWrapper + FSDP2 DTensor compatib…
kevssim May 15, 2026
05e95ec
feat(ep-lora): trigger EP slicing before PEFT adapter patching
kevssim May 15, 2026
494198d
feat(ep-lora): add EP-aware PEFT save and load handling
kevssim May 15, 2026
3f7f434
test(ep-lora): add EP LoRA cookbooks and validation scripts
kevssim May 15, 2026
0cbb0fc
test(ep-lora): handle nested text configs in validation scripts
kevssim May 15, 2026
6ac03b2
delete
kevssim May 20, 2026
d795c77
fix
kevssim May 20, 2026
0c47857
wip
kevssim May 21, 2026
77dc099
cookboook
kevssim May 21, 2026
8caeda9
Merge remote-tracking branch 'origin/main' into ep_lora
kevssim May 21, 2026
cf0b79f
lint
kevssim May 21, 2026
0ed0e8f
WIP
kevssim May 21, 2026
442262d
wip
kevssim May 21, 2026
9e70d58
cookbook
kevssim May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ sh INSTALL_MEGATRON.sh
| Training Type | Model Framework | Cookbook Path |
| ------------------------------------ | --------------- | ----------------------------------------------------- |
| FSDP finetuning | transformers | [Script](cookbook/transformers/fsdp2.py) |
| FSDP MoE finetuning | transformers | [Script](cookbook/transformers/fsdp2_moe.py) |
| EP FSDP MoE finetuning | transformers | [Script](cookbook/transformers/ep_fsdp_qwen3_moe.py) |
| EP FSDP2 LoRA finetuning | transformers | [Script](cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.py) |
| SP FSDP finetuning | transformers | [Script](cookbook/transformers/sp_fsdp_dense.py) |
| pp/tp/cp finetuning | megatron | [Script](cookbook/megatron/tp.py) |
| pp/tp/cp MoE finetuning | megatron | [Script](cookbook/megatron/tp_moe.py) |
Expand Down
148 changes: 148 additions & 0 deletions cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
"""EP + FSDP2 + LoRA SFT cookbook for DeepSeek-V4.

Run on 4 GPUs:
torchrun --nproc-per-node=4 cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py
"""
import os
from pathlib import Path

from peft import LoraConfig
from transformers import AutoConfig

import twinkle
from twinkle import DeviceMesh, Platform, get_device_placement, get_logger
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.model import TransformersModel
from twinkle.preprocessor import SelfCognitionProcessor

# Module-level logger used by train() for progress and configuration output.
logger = get_logger()

# ---------------------------------------------------------------------------
# Run configuration. Every value below is read from the process environment
# and falls back to the literal default when the variable is not set.
# ---------------------------------------------------------------------------
MODEL_ID = os.getenv('DSV4_MODEL_ID', 'ms://deepseek-ai/DeepSeek-V4')
DATASET_ID = os.getenv('DATASET_ID', 'ms://swift/self-cognition')
TEMPLATE_ID = os.getenv('TEMPLATE_ID', 'DeepseekV4Template')
BATCH_SIZE = int(os.getenv('BATCH_SIZE', '4'))
GRAD_ACCUM_STEPS = int(os.getenv('GRAD_ACCUM_STEPS', '4'))
# Metrics are logged whenever the step counter is a multiple of this value.
LOG_INTERVAL = GRAD_ACCUM_STEPS
LR = float(os.getenv('LR', '1e-4'))
MAX_GRAD_NORM = float(os.getenv('MAX_GRAD_NORM', '1.0'))
LORA_R = int(os.getenv('LORA_R', '8'))
LORA_ALPHA = int(os.getenv('LORA_ALPHA', '32'))
# Boolean switches are enabled only by the exact string '1'.
ENABLE_EP = os.getenv('ENABLE_EP', '1') == '1'
OUTPUT_DIR = os.getenv('OUTPUT_DIR', './output_dsv4')
# An unset or empty RESUME_FROM_CHECKPOINT both collapse to None (no resume).
RESUME_FROM_CHECKPOINT = os.getenv('RESUME_FROM_CHECKPOINT') or None
RESUME_ONLY_MODEL = os.getenv('RESUME_ONLY_MODEL', '0') == '1'
IGNORE_DATA_SKIP = os.getenv('IGNORE_DATA_SKIP', '0') == '1'
ADAPTER_NAME = os.getenv('ADAPTER_NAME', 'default')

# The mesh sizes are fixed at 4 to match the 4-GPU launch command shown in
# the module docstring; the device type comes from the detected platform.
device_mesh = DeviceMesh.from_sizes(
    fsdp_size=4, dp_size=1, ep_size=4,
    device_type=Platform.get_platform().device_prefix(),
)
twinkle.initialize(mode='local', global_device_mesh=device_mesh)


def _build_lora_config(enable_ep: bool):
    """Build the ``LoraConfig`` used for this run.

    Both modes apply LoRA to every linear module except ``o_a_proj`` using the
    rank/alpha taken from ``LORA_R`` and ``LORA_ALPHA``.

    Args:
        enable_ep: When truthy, the expert ``gate_up_proj``/``down_proj``
            parameters are additionally targeted via ``target_parameters``.

    Returns:
        The ``LoraConfig`` instance for the selected mode.
    """
    lora_kwargs = dict(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        target_modules='all-linear',
        exclude_modules=['o_a_proj'],
    )
    # Expert weights are bare nn.Parameters, so PEFT can only train them via
    # target_parameters/ParamWrapper, which re-parametrizes the weights on the
    # fly during forward. That path is not stable under plain FSDP2, hence the
    # expert parameters are only targeted when expert parallelism is enabled;
    # otherwise regular module LoRA is used and experts stay untrained.
    if enable_ep:
        lora_kwargs['target_parameters'] = ['mlp.experts.gate_up_proj', 'mlp.experts.down_proj']
    return LoraConfig(**lora_kwargs)


def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
    """Save the adapter, optimizer state and data progress as a checkpoint.

    Args:
        model: Model whose ``save`` method writes the checkpoint.
        checkpoint_name: Name passed to ``model.save`` for this checkpoint.
        dataloader: Source of the ``consumed_train_samples`` counter that is
            stored alongside the weights.

    Returns:
        Whatever ``model.save`` returns (logged by the caller as the saved
        checkpoint location).
    """
    samples_seen = dataloader.get_state()['consumed_train_samples']
    return model.save(
        name=checkpoint_name,
        output_dir=OUTPUT_DIR,
        adapter_name=ADAPTER_NAME,
        save_optimizer=True,
        consumed_train_samples=samples_seen,
    )


def train():
    """Run the DeepSeek-V4 LoRA fine-tuning loop end to end.

    Steps: load the model config, build the encoded dataset and dataloader,
    create the native-FSDP model (with expert parallelism toggled by
    ``ENABLE_EP``), attach the LoRA adapter, optimizer and LR scheduler,
    optionally resume from ``RESUME_FROM_CHECKPOINT``, iterate once over the
    dataloader, and finally save ``checkpoint-final``.
    """
    # --- Model config: switch off `use_cache` on the text config, which may
    # be nested under `text_config` or be the top-level config itself.
    config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
    lm_config = getattr(config, 'text_config', config)
    if hasattr(lm_config, 'use_cache'):
        lm_config.use_cache = False

    # --- Data pipeline.
    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID))
    dataset.set_template(TEMPLATE_ID, model_id=MODEL_ID)
    dataset.map(SelfCognitionProcessor('twinkle', 'ModelScope'))
    dataset.encode(batched=True)
    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, device_mesh=device_mesh)

    # --- Model, adapter, optimizer and scheduler.
    expert_parallel_cfg = {
        'enabled': ENABLE_EP,
        'router_dtype': 'fp32',
        'keep_router_logits': False,
    }
    model = TransformersModel(
        model_id=MODEL_ID,
        config=config,
        device_mesh=device_mesh,
        strategy='native_fsdp',
        memory_efficient_init=True,
        fsdp_config={'expert_parallel': expert_parallel_cfg},
    )
    model.add_adapter_to_model(
        ADAPTER_NAME, _build_lora_config(ENABLE_EP), gradient_accumulation_steps=GRAD_ACCUM_STEPS)
    model.set_optimizer('AdamW', lr=LR, foreach=False)
    model.set_lr_scheduler(
        scheduler_cls='CosineWarmupScheduler',
        num_warmup_steps=5,
        num_training_steps=len(dataloader),
    )

    # --- Optional resume: restore model state first, then fast-forward the
    # dataloader unless the user asked to ignore the data skip.
    if RESUME_FROM_CHECKPOINT:
        resume_dir = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
        adapter_kwargs = {'adapter_name': ADAPTER_NAME} if ADAPTER_NAME else {}
        progress = model.resume_from_checkpoint(
            str(resume_dir), resume_only_model=RESUME_ONLY_MODEL, **adapter_kwargs)
        if not IGNORE_DATA_SKIP:
            dataloader.resume_from_checkpoint(progress['consumed_train_samples'])

    logger.info(get_device_placement())
    logger.info(model.get_train_configs())
    logger.info(
        f'Total steps: {len(dataloader)}, batch_size={BATCH_SIZE}, grad_accum={GRAD_ACCUM_STEPS}, '
        f'enable_ep={ENABLE_EP}, output_dir={OUTPUT_DIR}')

    # --- Training loop.
    adapter_optim = model.optimizer_group[ADAPTER_NAME]
    for batch in dataloader:
        # The dataloader may hand back a callable instead of a ready batch;
        # call it to obtain the actual inputs.
        inputs = batch() if callable(batch) else batch
        model.forward_backward(inputs=inputs)
        model.clip_grad_and_step(max_grad_norm=MAX_GRAD_NORM, gradient_accumulation_steps=GRAD_ACCUM_STEPS)

        cur_step = adapter_optim.cur_step
        if cur_step <= 0 or cur_step % LOG_INTERVAL != 0:
            continue
        # Metrics can likewise come back as a callable that must be resolved.
        metric = model.calculate_metric(is_training=True)
        if callable(metric):
            metric = metric()
        logger.info(f'Current is step {cur_step} of {len(dataloader)}, metric: {metric}')

    final_checkpoint = save_checkpoint(model, 'checkpoint-final', dataloader)
    logger.info(f'Saved final adapter to {final_checkpoint}')


# Script entry point: only train when executed directly (e.g. via torchrun).
if __name__ == '__main__':
    train()
16 changes: 16 additions & 0 deletions cookbook/transformers/ep_fsdp2_lora_deepseek_v4.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -euo pipefail

# Launcher for the EP + FSDP2 + LoRA DeepSeek-V4 cookbook.
#   ENABLE_EP=1 -> expert LoRA is trained through target_parameters.
#   ENABLE_EP=0 -> plain FSDP2 LoRA; expert parameters are not trained.
#
# Each setting keeps a caller-supplied value and only receives the default
# below when it is unset or empty.
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3}"
: "${NPROC_PER_NODE:=4}"
: "${ENABLE_EP:=1}"
: "${BATCH_SIZE:=4}"
: "${GRAD_ACCUM_STEPS:=4}"
: "${OUTPUT_DIR:=./output_dsv4}"
export CUDA_VISIBLE_DEVICES NPROC_PER_NODE ENABLE_EP BATCH_SIZE GRAD_ACCUM_STEPS OUTPUT_DIR

# The script path is relative, so this must be started from the repository root.
train_script="cookbook/transformers/ep_fsdp2_lora_deepseek_v4.py"
torchrun --nproc-per-node="${NPROC_PER_NODE}" "${train_script}"
148 changes: 148 additions & 0 deletions cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
"""EP + FSDP2 + LoRA SFT cookbook for Qwen3.5-MoE.

Run on 4 GPUs:
torchrun --nproc-per-node=4 cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.py
"""
import os
from pathlib import Path

from peft import LoraConfig
from transformers import AutoConfig

import twinkle
from twinkle import DeviceMesh, Platform, get_device_placement, get_logger
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.model import TransformersModel
from twinkle.preprocessor import SelfCognitionProcessor

# Module-level logger used by train() for progress and configuration output.
logger = get_logger()

# ---------------------------------------------------------------------------
# Run configuration. Every value below is read from the process environment
# and falls back to the literal default when the variable is not set.
# ---------------------------------------------------------------------------
MODEL_ID = os.getenv('QWEN3_MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B')
DATASET_ID = os.getenv('DATASET_ID', 'ms://swift/self-cognition')
TEMPLATE_ID = os.getenv('TEMPLATE_ID', 'Qwen3_5Template')
BATCH_SIZE = int(os.getenv('BATCH_SIZE', '4'))
GRAD_ACCUM_STEPS = int(os.getenv('GRAD_ACCUM_STEPS', '4'))
# Metrics are logged whenever the step counter is a multiple of this value.
LOG_INTERVAL = GRAD_ACCUM_STEPS
LR = float(os.getenv('LR', '1e-4'))
MAX_GRAD_NORM = float(os.getenv('MAX_GRAD_NORM', '1.0'))
LORA_R = int(os.getenv('LORA_R', '8'))
LORA_ALPHA = int(os.getenv('LORA_ALPHA', '32'))
# Boolean switches are enabled only by the exact string '1'.
ENABLE_EP = os.getenv('ENABLE_EP', '1') == '1'
OUTPUT_DIR = os.getenv('OUTPUT_DIR', './output')
# An unset or empty RESUME_FROM_CHECKPOINT both collapse to None (no resume).
RESUME_FROM_CHECKPOINT = os.getenv('RESUME_FROM_CHECKPOINT') or None
RESUME_ONLY_MODEL = os.getenv('RESUME_ONLY_MODEL', '0') == '1'
IGNORE_DATA_SKIP = os.getenv('IGNORE_DATA_SKIP', '0') == '1'
ADAPTER_NAME = os.getenv('ADAPTER_NAME', 'default')

# The mesh sizes are fixed at 4 to match the 4-GPU launch command shown in
# the module docstring; the device type comes from the detected platform.
device_mesh = DeviceMesh.from_sizes(
    fsdp_size=4, dp_size=1, ep_size=4,
    device_type=Platform.get_platform().device_prefix(),
)
twinkle.initialize(mode='local', global_device_mesh=device_mesh)


def _build_lora_config(enable_ep: bool):
    """Build the ``LoraConfig`` used for this run.

    Both modes apply LoRA to every linear module using the rank/alpha taken
    from ``LORA_R`` and ``LORA_ALPHA``.

    Args:
        enable_ep: When truthy, the expert ``gate_up_proj``/``down_proj``
            parameters are additionally targeted via ``target_parameters``.

    Returns:
        The ``LoraConfig`` instance for the selected mode.
    """
    lora_kwargs = dict(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        target_modules='all-linear',
    )
    # Expert weights are bare nn.Parameters, so PEFT can only train them via
    # target_parameters/ParamWrapper, which re-parametrizes the weights on the
    # fly during forward. That path is not stable under plain FSDP2, hence the
    # expert parameters are only targeted when expert parallelism is enabled;
    # otherwise regular module LoRA is used and experts stay untrained.
    if enable_ep:
        lora_kwargs['target_parameters'] = ['mlp.experts.gate_up_proj', 'mlp.experts.down_proj']
    return LoraConfig(**lora_kwargs)


def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
    """Save the adapter, optimizer state and data progress as a checkpoint.

    Args:
        model: Model whose ``save`` method writes the checkpoint.
        checkpoint_name: Name passed to ``model.save`` for this checkpoint.
        dataloader: Source of the ``consumed_train_samples`` counter that is
            stored alongside the weights.

    Returns:
        Whatever ``model.save`` returns (logged by the caller as the saved
        checkpoint location).
    """
    samples_seen = dataloader.get_state()['consumed_train_samples']
    return model.save(
        name=checkpoint_name,
        output_dir=OUTPUT_DIR,
        adapter_name=ADAPTER_NAME,
        save_optimizer=True,
        consumed_train_samples=samples_seen,
    )


def train():
    """Run the Qwen3.5-MoE LoRA fine-tuning loop end to end.

    Steps: load the model config, build the encoded dataset and dataloader,
    create the native-FSDP model (with expert parallelism toggled by
    ``ENABLE_EP``), attach the LoRA adapter, optimizer and LR scheduler,
    optionally resume from ``RESUME_FROM_CHECKPOINT``, iterate once over the
    dataloader, and finally save ``checkpoint-final``.

    Raises:
        ValueError: Propagated from ``dataset.set_template`` when the fallback
            template is rejected as well.
    """
    # Switch off `use_cache` on the text config, which may be nested under
    # `text_config` or be the top-level config itself.
    config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
    text_config = getattr(config, 'text_config', config)
    if hasattr(text_config, 'use_cache'):
        text_config.use_cache = False

    dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID))
    fallback_template = 'Qwen3_5Template'
    try:
        dataset.set_template(TEMPLATE_ID, model_id=MODEL_ID)
    except ValueError as err:
        # Keep the best-effort fallback, but make it visible: training with a
        # different chat template than the one requested must not go unnoticed.
        logger.warning(
            f'Template {TEMPLATE_ID!r} was rejected ({err}); falling back to {fallback_template!r}.')
        dataset.set_template(fallback_template, model_id=MODEL_ID)
    dataset.map(SelfCognitionProcessor('twinkle', 'ModelScope'))
    dataset.encode(batched=True)
    dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, device_mesh=device_mesh)

    model = TransformersModel(
        model_id=MODEL_ID,
        config=config,
        device_mesh=device_mesh,
        strategy='native_fsdp',
        fsdp_config={
            'expert_parallel': {
                'enabled': ENABLE_EP,
                'router_dtype': 'fp32',
                'keep_router_logits': False,
            }
        },
    )
    lora_cfg = _build_lora_config(ENABLE_EP)
    model.add_adapter_to_model(ADAPTER_NAME, lora_cfg, gradient_accumulation_steps=GRAD_ACCUM_STEPS)
    model.set_optimizer('AdamW', lr=LR, foreach=False)
    model.set_lr_scheduler(
        scheduler_cls='CosineWarmupScheduler',
        num_warmup_steps=5,
        num_training_steps=len(dataloader),
    )

    # Optional resume: restore model state first, then fast-forward the
    # dataloader unless the user asked to ignore the data skip.
    if RESUME_FROM_CHECKPOINT:
        checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
        kwargs = {}
        if ADAPTER_NAME:
            kwargs['adapter_name'] = ADAPTER_NAME
        progress = model.resume_from_checkpoint(
            str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
        if not IGNORE_DATA_SKIP:
            dataloader.resume_from_checkpoint(progress['consumed_train_samples'])

    logger.info(get_device_placement())
    logger.info(model.get_train_configs())
    logger.info(
        f'Total steps: {len(dataloader)}, batch_size={BATCH_SIZE}, grad_accum={GRAD_ACCUM_STEPS}, '
        f'enable_ep={ENABLE_EP}, output_dir={OUTPUT_DIR}')

    optimizer_group = model.optimizer_group[ADAPTER_NAME]
    for batch in dataloader:
        # The dataloader may hand back a callable instead of a ready batch;
        # call it to obtain the actual inputs.
        if callable(batch):
            batch = batch()
        model.forward_backward(inputs=batch)
        model.clip_grad_and_step(max_grad_norm=MAX_GRAD_NORM, gradient_accumulation_steps=GRAD_ACCUM_STEPS)
        cur_step = optimizer_group.cur_step
        if cur_step > 0 and cur_step % LOG_INTERVAL == 0:
            # Metrics can likewise come back as a callable that must be resolved.
            metric = model.calculate_metric(is_training=True)
            if callable(metric):
                metric = metric()
            logger.info(f'Current is step {cur_step} of {len(dataloader)}, metric: {metric}')

    final_checkpoint = save_checkpoint(model, 'checkpoint-final', dataloader)
    logger.info(f'Saved final adapter to {final_checkpoint}')


# Script entry point: only train when executed directly (e.g. via torchrun).
if __name__ == '__main__':
    train()
16 changes: 16 additions & 0 deletions cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -euo pipefail

# Launcher for the EP + FSDP2 + LoRA Qwen3.5-MoE cookbook.
#   ENABLE_EP=1 -> expert LoRA is trained through target_parameters.
#   ENABLE_EP=0 -> plain FSDP2 LoRA; expert parameters are not trained.
#
# Each setting keeps a caller-supplied value and only receives the default
# below when it is unset or empty.
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3}"
: "${NPROC_PER_NODE:=4}"
: "${ENABLE_EP:=1}"
: "${BATCH_SIZE:=4}"
: "${GRAD_ACCUM_STEPS:=4}"
: "${OUTPUT_DIR:=./output_qwen3_5_moe}"
export CUDA_VISIBLE_DEVICES NPROC_PER_NODE ENABLE_EP BATCH_SIZE GRAD_ACCUM_STEPS OUTPUT_DIR

# The script path is relative, so this must be started from the repository root.
train_script="cookbook/transformers/ep_fsdp2_lora_qwen3_5_moe.py"
torchrun --nproc-per-node="${NPROC_PER_NODE}" "${train_script}"
Loading
Loading