Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
97a47de
test: add checkpoint robustness functional tests
adil-a Mar 25, 2026
1a2e726
test: add optional cross-TP checkpoint robustness phase
adil-a Mar 25, 2026
47197f0
test: add GPT-OSS 20B checkpoint robustness, vLLM PEFT + smoke test mode
adil-a Mar 26, 2026
1ea0108
test: add GPT-OSS PEFT vLLM smoke test with native LoRA
adil-a Mar 26, 2026
5c7877e
test: default to 8 GPUs, add cross-TP to Llama SFT
adil-a Mar 26, 2026
b3077df
refactor: consolidate checkpoint robustness scripts per model
adil-a Mar 26, 2026
7ef62d5
test: add Nemotron Nano V3 checkpoint robustness and vLLM smoke tests
adil-a Mar 31, 2026
0462084
test: add cross-cutting checkpoint robustness features
adil-a Mar 31, 2026
229bb84
test: add checkpoint robustness configs for 12 new models
adil-a Mar 31, 2026
ce81ee2
test: limit dataset samples to 500, add memory thresholds
adil-a Apr 1, 2026
5928505
Merge remote-tracking branch 'origin/main' into adil-a/checkpoint-rob…
adil-a Apr 1, 2026
39a413e
test: tighten thresholds, add cross-TP, fix configs for validated models
adil-a Apr 1, 2026
2f1a5a9
docs: update checkpoint robustness STATUS with all test results
adil-a Apr 1, 2026
e14e246
docs: comprehensive STATUS update with all results and TODOs
adil-a Apr 1, 2026
b170d69
test: add memory thresholds for Llama, GPT-OSS, Nemotron Nano V3
adil-a Apr 1, 2026
c7fa2a9
Merge remote-tracking branch 'origin/main' into adil-a/checkpoint-rob…
adil-a Apr 2, 2026
86a7c09
test: add --resume_loss_threshold CLI flag for configurable resume to…
adil-a Apr 2, 2026
cb54125
test: Phi-4 TP=2 passing, add --resume_loss_threshold flag
adil-a Apr 2, 2026
3e47111
test: add --hf_device_map_auto flag for large model Phase 4 HF loading
adil-a Apr 2, 2026
aaba6f9
test: multi-node checkpoint robustness (120B, 49B, Embed 1B)
adil-a Apr 2, 2026
cefbc18
test: move vLLM deployment tests to separate PR
adil-a Apr 2, 2026
58e9dc7
test: integrate checkpoint robustness into CI pipeline
adil-a Apr 2, 2026
4edad09
test: remove checkpoint robustness shell scripts from PR
adil-a Apr 2, 2026
cba941f
test: remove unused shell script runner
adil-a Apr 2, 2026
d77ea17
refactor: move ci.checkpoint_robustness parsing into test script
adil-a Apr 2, 2026
1b74e66
fix: remove num_samples_limit from common args, add per-model
adil-a Apr 2, 2026
7a597c9
fix: move all dataset limit args to per-model ci config
adil-a Apr 2, 2026
10639bc
fix: remove memory thresholds from ci.checkpoint_robustness
adil-a Apr 3, 2026
29d6450
fix: set ci.time to 00:45:00 for checkpoint robustness models
adil-a Apr 3, 2026
c0fea99
Organize nightly llm_finetune recipes
thomasdhc Apr 3, 2026
4edc6de
Revert some ci test updates for testing purposes
thomasdhc Apr 3, 2026
dbaf93b
Reorganize finetune sh
thomasdhc Apr 3, 2026
51b7099
Add banners
thomasdhc Apr 3, 2026
5731a7b
Update test stage name based on robustness check and give 5 extra min…
thomasdhc Apr 3, 2026
4e17910
Bump time for recipes
thomasdhc Apr 3, 2026
0901bf4
Set trust remote code to False for checkpoint robustness
thomasdhc Apr 3, 2026
0c3893c
Bump for recipe runtimes
thomasdhc Apr 3, 2026
4ded46a
Improve pytest logging readability
thomasdhc Apr 3, 2026
7398055
Set trust remote code for certain recipes
thomasdhc Apr 3, 2026
289764d
Update torchrun output log config
thomasdhc Apr 6, 2026
f8a3c99
Merge branch 'main' into adil-a/checkpoint-robustness-test
thomasdhc Apr 6, 2026
321ac05
Add recipe: TrainFinetuneRecipeForNextTokenPrediction for new recipes
thomasdhc Apr 6, 2026
3bec651
Merge branch 'main' into adil-a/checkpoint-robustness-test
thomasdhc Apr 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion examples/llm_finetune/baichuan/baichuan_2_7b_squad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,15 @@ lr_scheduler:

# CI-only settings; consumed by the test pipeline, not by training itself.
# Dotted keys under checkpoint_robustness override recipe config paths.
ci:
  recipe_owner: adil-a
  time: "00:45:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) parse `5e-3` as a STRING
    # (float tag requires a dot in the mantissa) — confirm the consumer
    # coerces it, or write 5.0e-3.
    hf_kl_threshold: 5e-3
    distributed.tp_size: 2
    cross_tp_size: 2
    cross_tp_kl_threshold: 5e-3
    tokenizer_name: baichuan-inc/Baichuan2-7B-Chat
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,14 @@ lr_scheduler:

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
ci:
  recipe_owner: adil-a
  time: "00:45:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 5e-3
    # NOTE(review): presumably required because the Baichuan2 Hub repo ships
    # custom modeling code — security-sensitive flag; verify it is needed.
    trust_remote_code: true
    distributed.tp_size: 2
    tokenizer_name: baichuan-inc/Baichuan2-7B-Chat
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
Expand Down
12 changes: 9 additions & 3 deletions examples/llm_finetune/gemma/gemma_3_270m_squad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,18 @@ optimizer:
weight_decay: 0
# min_lr: 1.0e-5

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
ci:
  recipe_owner: HuiyingLi
  time: "00:20:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 6e-3
    tokenizer_name: google/gemma-3-270m
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
# entity: <your_wandb_entity>
# name: <your_wandb_exp_name>
# save_dir: <your_wandb_save_dir>

ci:
recipe_owner: HuiyingLi
12 changes: 9 additions & 3 deletions examples/llm_finetune/gemma/gemma_3_270m_squad_peft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,18 @@ optimizer:
weight_decay: 0
# min_lr: 1.0e-5

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test. Looser KL threshold (8e-3 vs 6e-3 for the
# non-PEFT recipe) — presumably to accommodate LoRA numerics; confirm.
ci:
  recipe_owner: HuiyingLi
  time: "00:20:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 8e-3
    tokenizer_name: google/gemma-3-270m
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
# entity: <your_wandb_entity>
# name: <your_wandb_exp_name>
# save_dir: <your_wandb_save_dir>

ci:
recipe_owner: HuiyingLi
8 changes: 8 additions & 0 deletions examples/llm_finetune/gpt_oss/gpt_oss_20b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,12 @@ optimizer:

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
ci:
  recipe_owner: hemildesai
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  # NOTE(review): `true` looks odd for a "multiplier" — confirm the expected
  # type (boolean toggle vs. numeric factor).
  node_multiplier: true
  checkpoint_robustness:
    hf_kl_threshold: 5e-2
    tokenizer_name: openai/gpt-oss-20b
    check_phantom_keys: true
    no_check_resume: true
    # NOTE(review): other recipes in this change use
    # `dataset.limit_dataset_samples` (matching the dataset section's key) —
    # confirm `num_samples_limit` is the key the test script recognizes.
    dataset.num_samples_limit: 500
    validation_dataset.num_samples_limit: 500
9 changes: 9 additions & 0 deletions examples/llm_finetune/gpt_oss/gpt_oss_20b_peft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,15 @@ optimizer:
weight_decay: 0.0
# min_lr: 1.0e-5

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
# NOTE(review): no `recipe_owner` here, unlike sibling recipes — confirm
# whether the CI pipeline requires it.
ci:
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 5e-2
    tokenizer_name: openai/gpt-oss-20b
    no_check_resume: true
    # NOTE(review): other recipes use `dataset.limit_dataset_samples` —
    # confirm `num_samples_limit` is the key the test script recognizes.
    dataset.num_samples_limit: 500
    validation_dataset.num_samples_limit: 500

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
Expand Down
15 changes: 12 additions & 3 deletions examples/llm_finetune/llama3_2/llama3_2_1b_hellaswag.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,21 @@ optimizer:
weight_decay: 0
# min_lr: 1.0e-5

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
ci:
  recipe_owner: akoumpa
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 5e-3
    # NOTE(review): this 1B recipe's robustness test swaps the model to
    # Llama-3.2-3B — confirm deliberate (larger-model coverage?) and not a
    # copy-paste slip.
    model.pretrained_model_name_or_path: meta-llama/Llama-3.2-3B-Instruct
    tokenizer_name: meta-llama/Llama-3.2-3B-Instruct
    cross_tp_size: 2
    cross_tp_kl_threshold: 5e-3
    # NOTE(review): other recipes use `dataset.limit_dataset_samples` —
    # confirm `num_samples_limit` is the key the test script recognizes.
    dataset.num_samples_limit: 500
    validation_dataset.num_samples_limit: 500

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
# entity: <your_wandb_entity>
# name: <your_wandb_exp_name>
# save_dir: <your_wandb_save_dir>

ci:
recipe_owner: akoumpa
10 changes: 10 additions & 0 deletions examples/llm_finetune/llama3_2/llama3_2_1b_hellaswag_peft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,16 @@ optimizer:
weight_decay: 0
# min_lr: 1.0e-5

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
# NOTE(review): no `recipe_owner` here, unlike sibling recipes — confirm
# whether the CI pipeline requires it.
ci:
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    hf_kl_threshold: 5e-3
    # NOTE(review): 1B recipe overridden to the 3B model for this test —
    # confirm deliberate (see matching non-PEFT recipe).
    model.pretrained_model_name_or_path: meta-llama/Llama-3.2-3B-Instruct
    tokenizer_name: meta-llama/Llama-3.2-3B-Instruct
    check_fused_qkv_keys: true
    # NOTE(review): other recipes use `dataset.limit_dataset_samples` —
    # confirm `num_samples_limit` is the key the test script recognizes.
    dataset.num_samples_limit: 500
    validation_dataset.num_samples_limit: 500

# Uncomment and configure for W&B logging
# wandb:
# project: <your_wandb_project>
Expand Down
112 changes: 112 additions & 0 deletions examples/llm_finetune/mistral/ministral3_3b_squad.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Copyright (c) 2026, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# To run this recipe, please use the following command:
# torchrun --nproc-per-node=8 recipes/llm_finetune/finetune.py --config examples/llm_finetune/mistral/ministral3_3b_squad.yaml
# Adjust --nproc-per-node to the number of GPUs available on your host machine.


# Full-parameter finetune of Ministral-3 3B on SQuAD.
recipe: TrainFinetuneRecipeForNextTokenPrediction

step_scheduler:
  global_batch_size: 64
  local_batch_size: 8
  ckpt_every_steps: 50
  val_every_steps: 50 # will run every x number of gradient steps
  max_steps: 100

dist_env:
  backend: nccl
  # NOTE(review): a 1-minute collective timeout is aggressive — presumably a
  # fail-fast setting for CI; confirm before using interactively.
  timeout_minutes: 1

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true  # per-rank seed offsets for distributed runs

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: mistralai/Ministral-3-3B-Instruct-2512

# torch.compile configuration
compile:
  enabled: false
  mode: "default" # Options: "default", "reduce-overhead", "max-autotune"
  fullgraph: false
  dynamic: true # Set to false for better performance with fixed shapes
  backend: null # Use default backend (inductor)

distributed:
  strategy: fsdp2
  # NOTE(review): plain `none` is the STRING "none" in YAML, not null —
  # confirm the config loader interprets it as "auto"/unset as intended.
  dp_size: none
  tp_size: 1
  cp_size: 1

sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: train

packed_sequence:
  packed_sequence_size: 0  # 0 disables sequence packing

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
  shuffle: false

validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: validation
  limit_dataset_samples: 64  # keep validation cheap

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater

optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  eps: 1e-8
  lr: 1.0e-5
  weight_decay: 0

lr_scheduler:
  lr_decay_style: cosine
  min_lr: 1.0e-6

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
# NOTE(review): no `recipe_owner` here, unlike most sibling recipes —
# confirm whether the CI pipeline requires it.
ci:
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) parse `5e-3` as a STRING
    # (float tag requires a dot in the mantissa) — confirm the consumer
    # coerces it, or write 5.0e-3.
    hf_kl_threshold: 5e-3
    distributed.tp_size: 2
    tokenizer_name: mistralai/Ministral-3-3B-Instruct-2512
    cross_tp_size: 2
    cross_tp_kl_threshold: 5e-3
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
#   project: <your_wandb_project>
#   entity: <your_wandb_entity>
#   name: <your_wandb_exp_name>
#   save_dir: <your_wandb_save_dir>
127 changes: 127 additions & 0 deletions examples/llm_finetune/mistral/ministral3_3b_squad_peft.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Copyright (c) 2026, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# To run this recipe, please use the following command:
# torchrun --nproc-per-node=8 recipes/llm_finetune/finetune.py --config examples/llm_finetune/mistral/ministral3_3b_squad_peft.yaml
# Adjust --nproc-per-node to the number of GPUs available on your host machine.


# LoRA (PEFT) finetune of Ministral-3 3B on SQuAD.
recipe: TrainFinetuneRecipeForNextTokenPrediction

step_scheduler:
  global_batch_size: 256
  local_batch_size: 8
  ckpt_every_steps: 50
  val_every_steps: 50 # will run every x number of gradient steps
  max_steps: 100

dist_env:
  backend: nccl
  # NOTE(review): a 1-minute collective timeout is aggressive — presumably a
  # fail-fast setting for CI; confirm before using interactively.
  timeout_minutes: 1

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true  # per-rank seed offsets for distributed runs

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: mistralai/Ministral-3-3B-Instruct-2512
  torch_dtype: bf16

# NOTE(review): checkpointing disabled in a recipe exercised by the
# checkpoint-robustness CI — presumably the test enables/overrides it;
# verify against the test script.
checkpoint:
  enabled: false
  checkpoint_dir: checkpoints/
  model_save_format: safetensors # torch_save or safetensors
  save_consolidated: false # saves the model in a consolidated safetensors format. Requires model_save_format to be safetensors.

peft:
  _target_: nemo_automodel.components._peft.lora.PeftConfig
  match_all_linear: True
  dim: 8
  alpha: 32
  use_triton: True
  # dtype needs a fix to resolve to type instead of string
  # lora_dtype: torch.bfloat16

# torch.compile configuration
compile:
  enabled: false
  mode: "default" # Options: "default", "reduce-overhead", "max-autotune"
  fullgraph: false
  dynamic: true # Set to false for better performance with fixed shapes
  backend: null # Use default backend (inductor)

distributed:
  strategy: fsdp2
  # NOTE(review): plain `none` is the STRING "none" in YAML, not null —
  # confirm the config loader interprets it as "auto"/unset as intended.
  dp_size: none
  tp_size: 1
  cp_size: 1

sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: train

packed_sequence:
  # Set packed_sequence_size > 0 to run with packed sequences
  packed_sequence_size: 0

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
  shuffle: false

validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: validation
  limit_dataset_samples: 64  # keep validation cheap

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater

optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  eps: 1e-8
  lr: 1.0e-5
  weight_decay: 0

lr_scheduler:
  lr_decay_style: cosine
  min_lr: 1.0e-6

# CI-only settings; dotted keys override recipe config paths for the
# checkpoint-robustness test.
# NOTE(review): no `recipe_owner` here, unlike most sibling recipes —
# confirm whether the CI pipeline requires it.
ci:
  time: "00:15:00"  # job wall-clock limit (HH:MM:SS)
  checkpoint_robustness:
    # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) parse `5e-3` as a STRING
    # (float tag requires a dot in the mantissa) — confirm the consumer
    # coerces it, or write 5.0e-3.
    hf_kl_threshold: 5e-3
    distributed.tp_size: 2
    tokenizer_name: mistralai/Ministral-3-3B-Instruct-2512
    dataset.limit_dataset_samples: 500
    validation_dataset.limit_dataset_samples: 500

# Uncomment and configure for W&B logging
# wandb:
#   project: <your_wandb_project>
#   entity: <your_wandb_entity>
#   name: <your_wandb_exp_name>
#   save_dir: <your_wandb_save_dir>
Loading
Loading