 from pathlib import Path
 
 from _test_utils.examples.run_command import extend_cmd_parts, run_example_command
-from _test_utils.torch.distributed.utils import get_free_port
 from _test_utils.torch.puzzletron.utils import create_and_save_small_hf_model
-from _test_utils.torch.transformers_models import get_tiny_qwen3, get_tiny_tokenizer
+from _test_utils.torch.transformers_models import create_tiny_qwen3_dir, get_tiny_tokenizer
 
 from modelopt.torch.puzzletron.anymodel import convert_model
 
 
 def test_distill_and_convert(tmp_path: Path, num_gpus):
-    # vocab_size=128 ensures divisibility by any TP size up to 128
-    teacher_hf_path = tmp_path / "tiny_qwen3"
-    get_tiny_tokenizer().save_pretrained(teacher_hf_path)
-    get_tiny_qwen3(vocab_size=128).save_pretrained(teacher_hf_path)
-
-    tp_size = num_gpus
+    teacher_hf_path = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True)
     train_iters = 5
     distill_output_dir = tmp_path / "distill_output"
     distill_cmd_parts = extend_cmd_parts(
-        ["torchrun", f"--nproc_per_node={tp_size}", "distill.py", "--use_mock_data"],
+        ["torchrun", f"--nproc_per_node={num_gpus}", "distill.py", "--use_mock_data"],
         student_hf_path=teacher_hf_path,
         teacher_hf_path=teacher_hf_path,
         output_dir=distill_output_dir,
-        tp_size=tp_size,
+        tp_size=num_gpus,
+        pp_size=1,
         seq_length=32,
         mbs=1,
         gbs=4,
@@ -88,41 +83,24 @@ def test_distill_puzzletron_anymodel(tmp_path: Path, num_gpus):
         tmp_path
     )
 
-    output_dir = tmp_path / "distill_output"
-
-    tp_size = num_gpus
     train_iters = 5
-
-    cmd_parts = [
-        "torchrun",
-        f"--nproc_per_node={tp_size}",
-        "--master-addr",
-        "127.0.0.1",
-        "--master-port",
-        str(get_free_port()),
-        "distill.py",
-        "--use_mock_data",
-    ]
-    extend_cmd_parts(
-        cmd_parts,
+    output_dir = tmp_path / "distill_output"
+    cmd_parts = extend_cmd_parts(
+        ["torchrun", f"--nproc_per_node={num_gpus}", "distill.py", "--use_mock_data"],
         student_hf_path=student_anymodel_dir,
         teacher_hf_path=teacher_hf_dir,
         output_dir=output_dir,
-        tp_size=tp_size,
+        tp_size=num_gpus,
         pp_size=1,
-        seq_length=128,
-        split="99,1,0",
+        seq_length=32,
         mbs=1,
         gbs=4,
         train_iters=train_iters,
-        lr=0.0001,
-        min_lr=1e-5,
         lr_warmup_iters=2,
-        eval_interval=100,
-        eval_iters=0,
-        log_interval=5,
+        eval_interval=5,
+        eval_iters=1,
+        log_interval=1,
     )
-
     run_example_command(cmd_parts, example_path="megatron_bridge")
 
     run_config_path = output_dir / "checkpoints" / f"iter_{train_iters:07d}" / "run_config.yaml"
@@ -135,20 +113,13 @@ def _prepare_puzzletron_anymodel_student_and_teacher(tmp_path: Path) -> tuple[Pa
     teacher_hf_dir = tmp_path / "teacher_hf"
 
     tokenizer = get_tiny_tokenizer()
-    vocab_size = 128  # must be divisible by TP size
 
     create_and_save_small_hf_model(
-        output_path=str(student_hf_dir),
-        tokenizer=tokenizer,
-        hf_model_name="Qwen/Qwen3-0.6B",
-        vocab_size=vocab_size,
+        output_path=str(student_hf_dir), tokenizer=tokenizer, hf_model_name="Qwen/Qwen3-0.6B"
     )
 
     create_and_save_small_hf_model(
-        output_path=str(teacher_hf_dir),
-        tokenizer=tokenizer,
-        hf_model_name="Qwen/Qwen3-0.6B",
-        vocab_size=vocab_size,
+        output_path=str(teacher_hf_dir), tokenizer=tokenizer, hf_model_name="Qwen/Qwen3-0.6B"
     )
 
     student_anymodel_dir = tmp_path / "student_anymodel"
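
The first hunk folds the inline tiny-model setup (saving `get_tiny_tokenizer()` and `get_tiny_qwen3(vocab_size=128)` into `tmp_path / "tiny_qwen3"`) into a shared `create_tiny_qwen3_dir` helper. The helper's body is not part of this diff; below is a minimal sketch, assuming it simply packages the removed inline lines and returns the model directory, as its call site `create_tiny_qwen3_dir(tmp_path, with_tokenizer=True)` suggests.

```python
from pathlib import Path

# Hypothetical sketch only: the real implementation lives in
# _test_utils.torch.transformers_models, where get_tiny_tokenizer and
# get_tiny_qwen3 are already defined.
def create_tiny_qwen3_dir(tmp_path: Path, with_tokenizer: bool = False) -> Path:
    model_dir = tmp_path / "tiny_qwen3"
    if with_tokenizer:
        get_tiny_tokenizer().save_pretrained(model_dir)
    # vocab_size=128 keeps the embedding divisible by any TP size up to 128,
    # per the comment on the removed lines.
    get_tiny_qwen3(vocab_size=128).save_pretrained(model_dir)
    return model_dir
```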
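Both tests now build their launch command through `extend_cmd_parts`, dropping the explicit `--master-addr`/`--master-port`/`get_free_port()` wiring (torchrun's single-node rendezvous defaults suffice here). The helper's flag format is not visible in this diff; a plausible sketch, assuming each keyword argument is rendered as a `--key=value` CLI flag:

```python
# Hypothetical sketch of _test_utils.examples.run_command.extend_cmd_parts.
def extend_cmd_parts(cmd_parts: list[str], **kwargs) -> list[str]:
    # Append every keyword argument as a --key=value flag.
    for key, value in kwargs.items():
        cmd_parts.append(f"--{key}={value}")
    # Returning the list it mutated would explain why both call styles in this
    # diff work: the old in-place extend_cmd_parts(cmd_parts, ...) and the new
    # cmd_parts = extend_cmd_parts([...], ...).
    return cmd_parts
```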