Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
97a47de
test: add checkpoint robustness functional tests
adil-a Mar 25, 2026
1a2e726
test: add optional cross-TP checkpoint robustness phase
adil-a Mar 25, 2026
47197f0
test: add GPT-OSS 20B checkpoint robustness, vLLM PEFT + smoke test mode
adil-a Mar 26, 2026
1ea0108
test: add GPT-OSS PEFT vLLM smoke test with native LoRA
adil-a Mar 26, 2026
5c7877e
test: default to 8 GPUs, add cross-TP to Llama SFT
adil-a Mar 26, 2026
b3077df
refactor: consolidate checkpoint robustness scripts per model
adil-a Mar 26, 2026
7ef62d5
test: add Nemotron Nano V3 checkpoint robustness and vLLM smoke tests
adil-a Mar 31, 2026
0462084
test: add cross-cutting checkpoint robustness features
adil-a Mar 31, 2026
229bb84
test: add checkpoint robustness configs for 12 new models
adil-a Mar 31, 2026
ce81ee2
test: limit dataset samples to 500, add memory thresholds
adil-a Apr 1, 2026
5928505
Merge remote-tracking branch 'origin/main' into adil-a/checkpoint-rob…
adil-a Apr 1, 2026
39a413e
test: tighten thresholds, add cross-TP, fix configs for validated models
adil-a Apr 1, 2026
2f1a5a9
docs: update checkpoint robustness STATUS with all test results
adil-a Apr 1, 2026
e14e246
docs: comprehensive STATUS update with all results and TODOs
adil-a Apr 1, 2026
b170d69
test: add memory thresholds for Llama, GPT-OSS, Nemotron Nano V3
adil-a Apr 1, 2026
c7fa2a9
Merge remote-tracking branch 'origin/main' into adil-a/checkpoint-rob…
adil-a Apr 2, 2026
86a7c09
test: add --resume_loss_threshold CLI flag for configurable resume to…
adil-a Apr 2, 2026
cb54125
test: Phi-4 TP=2 passing, add --resume_loss_threshold flag
adil-a Apr 2, 2026
3e47111
test: add --hf_device_map_auto flag for large model Phase 4 HF loading
adil-a Apr 2, 2026
aaba6f9
test: multi-node checkpoint robustness (120B, 49B, Embed 1B)
adil-a Apr 2, 2026
cefbc18
test: move vLLM deployment tests to separate PR
adil-a Apr 2, 2026
58e9dc7
test: integrate checkpoint robustness into CI pipeline
adil-a Apr 2, 2026
4edad09
test: remove checkpoint robustness shell scripts from PR
adil-a Apr 2, 2026
cba941f
test: remove unused shell script runner
adil-a Apr 2, 2026
d77ea17
refactor: move ci.checkpoint_robustness parsing into test script
adil-a Apr 2, 2026
1b74e66
fix: remove num_samples_limit from common args, add per-model
adil-a Apr 2, 2026
7a597c9
fix: move all dataset limit args to per-model ci config
adil-a Apr 2, 2026
10639bc
fix: remove memory thresholds from ci.checkpoint_robustness
adil-a Apr 3, 2026
29d6450
fix: set ci.time to 00:45:00 for checkpoint robustness models
adil-a Apr 3, 2026
c0fea99
Organize nightly llm_finetune recipes
thomasdhc Apr 3, 2026
4edc6de
Revert some ci test updates for testing purposes
thomasdhc Apr 3, 2026
dbaf93b
Reorganize finetune sh
thomasdhc Apr 3, 2026
51b7099
Add banners
thomasdhc Apr 3, 2026
5731a7b
Update test stage name based on robustness check and give 5 extra min…
thomasdhc Apr 3, 2026
4e17910
Bump time for recipes
thomasdhc Apr 3, 2026
0901bf4
Set trust remote code to False for checkpoint robustness
thomasdhc Apr 3, 2026
0c3893c
Bump for recipe runtimes
thomasdhc Apr 3, 2026
4ded46a
Improve pytest logging readability
thomasdhc Apr 3, 2026
7398055
Set trust remote code for certain recipes
thomasdhc Apr 3, 2026
289764d
Update torchrun output log config
thomasdhc Apr 6, 2026
f8a3c99
Merge branch 'main' into adil-a/checkpoint-robustness-test
thomasdhc Apr 6, 2026
321ac05
Add recipe: TrainFinetuneRecipeForNextTokenPrediction for new recipes
thomasdhc Apr 6, 2026
3bec651
Merge branch 'main' into adil-a/checkpoint-robustness-test
thomasdhc Apr 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion examples/llm_finetune/baichuan/baichuan_2_7b_squad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,15 @@ lr_scheduler:

# CI-only settings; consumed by the test pipeline, not by training itself.
# Dotted keys under checkpoint_robustness override recipe config paths.
ci:
  recipe_owner: adil-a
  time: "00:45:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) parse `5e-3` as a STRING
    # (float tag requires a dot in the mantissa) — confirm the consumer
    # coerces it, or write 5.0e-3.
    hf_kl_threshold: 5e-3
    distributed.tp_size: 2
    cross_tp_size: 2
    cross_tp_kl_threshold: 5e-3
    tokenizer_name: baichuan-inc/Baichuan2-7B-Chat
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,14 @@ lr_scheduler:

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
ci:
  recipe_owner: adil-a
  time: "00:45:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 5e-3
    # NOTE(review): presumably required because the Baichuan2 Hub repo ships
    # custom modeling code — security-sensitive flag; verify it is needed.
    trust_remote_code: true
    distributed.tp_size: 2
    tokenizer_name: baichuan-inc/Baichuan2-7B-Chat
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
Expand Down
12 changes: 9 additions & 3 deletions examples/llm_finetune/gemma/gemma_3_270m_squad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,18 @@ optimizer:
weight_decay: 0
# min_lr: 1.0e-5

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
ci:
  recipe_owner: HuiyingLi
  time: "00:20:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 6e-3
    tokenizer_name: google/gemma-3-270m
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
# entity: <your_wandb_entity>
# name: <your_wandb_exp_name>
# save_dir: <your_wandb_save_dir>

ci:
recipe_owner: HuiyingLi
12 changes: 9 additions & 3 deletions examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,18 @@ optimizer:
weight_decay: 0
# min_lr: 1.0e-5

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test. Looser KL threshold (8e-3 vs 6e-3 for the
# non-PEFT recipe) — presumably to accommodate LoRA numerics; confirm.
ci:
  recipe_owner: HuiyingLi
  time: "00:20:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 8e-3
    tokenizer_name: google/gemma-3-270m
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
# entity: <your_wandb_entity>
# name: <your_wandb_exp_name>
# save_dir: <your_wandb_save_dir>

ci:
recipe_owner: HuiyingLi
8 changes: 8 additions & 0 deletions examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,12 @@ optimizer:

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
ci:
  recipe_owner: hemildesai
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  # NOTE(review): `true` looks odd for a "multiplier" — confirm the expected
  # type (boolean toggle vs. numeric factor).
  node_multiplier: true
  checkpoint_robustness:
    hf_kl_threshold: 5e-2
    tokenizer_name: openai/gpt-oss-20b
    check_phantom_keys: true
    no_check_resume: true
    # NOTE(review): other recipes in this change use
    # `dataset.limit_dataset_samples` (matching the dataset section's key) —
    # confirm `num_samples_limit` is the key the test script recognizes.
    dataset.num_samples_limit: 500
    validation_dataset.num_samples_limit: 500
9 changes: 9 additions & 0 deletions examples/llm_finetune/gpt_oss/gpt_oss_20b_peft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,15 @@ optimizer:
weight_decay: 0.0
# min_lr: 1.0e-5

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
# NOTE(review): no `recipe_owner` here, unlike sibling recipes — confirm
# whether the CI pipeline requires it.
ci:
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 5e-2
    tokenizer_name: openai/gpt-oss-20b
    no_check_resume: true
    # NOTE(review): other recipes use `dataset.limit_dataset_samples` —
    # confirm `num_samples_limit` is the key the test script recognizes.
    dataset.num_samples_limit: 500
    validation_dataset.num_samples_limit: 500

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
Expand Down
15 changes: 12 additions & 3 deletions examples/llm_finetune/llama3_2/llama3_2_1b_hellaswag.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,21 @@ optimizer:
weight_decay: 0
# min_lr: 1.0e-5

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
ci:
  recipe_owner: akoumpa
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 5e-3
    # NOTE(review): this 1B recipe's robustness test swaps the model to
    # Llama-3.2-3B — confirm deliberate (larger-model coverage?) and not a
    # copy-paste slip.
    model.pretrained_model_name_or_path: meta-llama/Llama-3.2-3B-Instruct
    tokenizer_name: meta-llama/Llama-3.2-3B-Instruct
    cross_tp_size: 2
    cross_tp_kl_threshold: 5e-3
    # NOTE(review): other recipes use `dataset.limit_dataset_samples` —
    # confirm `num_samples_limit` is the key the test script recognizes.
    dataset.num_samples_limit: 500
    validation_dataset.num_samples_limit: 500

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
# entity: <your_wandb_entity>
# name: <your_wandb_exp_name>
# save_dir: <your_wandb_save_dir>

ci:
recipe_owner: akoumpa
10 changes: 10 additions & 0 deletions examples/llm_finetune/llama3_2/llama3_2_1b_hellaswag_peft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,16 @@ optimizer:
weight_decay: 0
# min_lr: 1.0e-5

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
# NOTE(review): no `recipe_owner` here, unlike sibling recipes — confirm
# whether the CI pipeline requires it.
ci:
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 5e-3
    # NOTE(review): 1B recipe overridden to the 3B model for this test —
    # confirm deliberate (see matching non-PEFT recipe).
    model.pretrained_model_name_or_path: meta-llama/Llama-3.2-3B-Instruct
    tokenizer_name: meta-llama/Llama-3.2-3B-Instruct
    check_fused_qkv_keys: true
    # NOTE(review): other recipes use `dataset.limit_dataset_samples` —
    # confirm `num_samples_limit` is the key the test script recognizes.
    dataset.num_samples_limit: 500
    validation_dataset.num_samples_limit: 500

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
Expand Down
112 changes: 112 additions & 0 deletions examples/llm_finetune/mistral/ministral3_3b_squad.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Copyright (c) 2026, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# To run this recipe, please use the following command:
# torchrun --nproc-per-node=8 recipes/llm_finetune/finetune.py --config examples/llm_finetune/mistral/ministral3_3b_squad.yaml
# Adjust --nproc-per-node to the number of GPUs available on your host machine.


# Full-parameter finetune of Ministral-3 3B on SQuAD.
recipe: TrainFinetuneRecipeForNextTokenPrediction

step_scheduler:
  global_batch_size: 64
  local_batch_size: 8
  ckpt_every_steps: 50
  val_every_steps: 50 # will run every x number of gradient steps
  max_steps: 100

dist_env:
  backend: nccl
  # NOTE(review): a 1-minute collective timeout is aggressive — presumably a
  # fail-fast setting for CI; confirm before using interactively.
  timeout_minutes: 1

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true  # per-rank seed offsets for distributed runs

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: mistralai/Ministral-3-3B-Instruct-2512

# torch.compile configuration
compile:
  enabled: false
  mode: "default" # Options: "default", "reduce-overhead", "max-autotune"
  fullgraph: false
  dynamic: true # Set to false for better performance with fixed shapes
  backend: null # Use default backend (inductor)

distributed:
  strategy: fsdp2
  # NOTE(review): plain `none` is the STRING "none" in YAML, not null —
  # confirm the config loader interprets it as "auto"/unset as intended.
  dp_size: none
  tp_size: 1
  cp_size: 1

sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: train

packed_sequence:
  packed_sequence_size: 0  # 0 disables sequence packing

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
  shuffle: false

validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: validation
  limit_dataset_samples: 64  # keep validation cheap

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater

optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  eps: 1e-8
  lr: 1.0e-5
  weight_decay: 0

lr_scheduler:
  lr_decay_style: cosine
  min_lr: 1.0e-6

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
# NOTE(review): no `recipe_owner` here, unlike most sibling recipes —
# confirm whether the CI pipeline requires it.
ci:
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) parse `5e-3` as a STRING
    # (float tag requires a dot in the mantissa) — confirm the consumer
    # coerces it, or write 5.0e-3.
    hf_kl_threshold: 5e-3
    distributed.tp_size: 2
    tokenizer_name: mistralai/Ministral-3-3B-Instruct-2512
    cross_tp_size: 2
    cross_tp_kl_threshold: 5e-3
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
#   project: <your_wandb_project>
#   entity: <your_wandb_entity>
#   name: <your_wandb_exp_name>
#   save_dir: <your_wandb_save_dir>
127 changes: 127 additions & 0 deletions examples/llm_finetune/mistral/ministral3_3b_squad_peft.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Copyright (c) 2026, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# To run this recipe, please use the following command:
# torchrun --nproc-per-node=8 recipes/llm_finetune/finetune.py --config examples/llm_finetune/mistral/ministral3_3b_squad_peft.yaml
# Adjust --nproc-per-node to the number of GPUs available on your host machine.


# LoRA (PEFT) finetune of Ministral-3 3B on SQuAD.
recipe: TrainFinetuneRecipeForNextTokenPrediction

step_scheduler:
  global_batch_size: 256
  local_batch_size: 8
  ckpt_every_steps: 50
  val_every_steps: 50 # will run every x number of gradient steps
  max_steps: 100

dist_env:
  backend: nccl
  # NOTE(review): a 1-minute collective timeout is aggressive — presumably a
  # fail-fast setting for CI; confirm before using interactively.
  timeout_minutes: 1

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true  # per-rank seed offsets for distributed runs

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: mistralai/Ministral-3-3B-Instruct-2512
  torch_dtype: bf16

# NOTE(review): checkpointing disabled in a recipe exercised by the
# checkpoint-robustness CI — presumably the test enables/overrides it;
# verify against the test script.
checkpoint:
  enabled: false
  checkpoint_dir: checkpoints/
  model_save_format: safetensors # torch_save or safetensors
  save_consolidated: false # saves the model in a consolidated safetensors format. Requires model_save_format to be safetensors.

peft:
  _target_: nemo_automodel.components._peft.lora.PeftConfig
  match_all_linear: True
  dim: 8
  alpha: 32
  use_triton: True
  # dtype needs a fix to resolve to type instead of string
  # lora_dtype: torch.bfloat16

# torch.compile configuration
compile:
  enabled: false
  mode: "default" # Options: "default", "reduce-overhead", "max-autotune"
  fullgraph: false
  dynamic: true # Set to false for better performance with fixed shapes
  backend: null # Use default backend (inductor)

distributed:
  strategy: fsdp2
  # NOTE(review): plain `none` is the STRING "none" in YAML, not null —
  # confirm the config loader interprets it as "auto"/unset as intended.
  dp_size: none
  tp_size: 1
  cp_size: 1

sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: train

packed_sequence:
  # Set packed_sequence_size > 0 to run with packed sequences
  packed_sequence_size: 0

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
  shuffle: false

validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: validation
  limit_dataset_samples: 64  # keep validation cheap

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater

optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  eps: 1e-8
  lr: 1.0e-5
  weight_decay: 0

lr_scheduler:
  lr_decay_style: cosine
  min_lr: 1.0e-6

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
# NOTE(review): no `recipe_owner` here, unlike most sibling recipes —
# confirm whether the CI pipeline requires it.
ci:
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) parse `5e-3` as a STRING
    # (float tag requires a dot in the mantissa) — confirm the consumer
    # coerces it, or write 5.0e-3.
    hf_kl_threshold: 5e-3
    distributed.tp_size: 2
    tokenizer_name: mistralai/Ministral-3-3B-Instruct-2512
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
#   project: <your_wandb_project>
#   entity: <your_wandb_entity>
#   name: <your_wandb_exp_name>
#   save_dir: <your_wandb_save_dir>
Loading
Loading