Commit a4d691f

refactoring fsdp2 tests
Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent 96c123e · commit a4d691f

2 files changed: 69 additions & 86 deletions

tests/pytorch/distributed/run_fsdp2_model.py

Lines changed: 10 additions & 18 deletions
@@ -9,12 +9,8 @@
 import argparse

 import transformer_engine.pytorch as te
-from transformer_engine.common.recipe import (
-    Format,
-    DelayedScaling,
-    Float8CurrentScaling,
-    MXFP8BlockScaling,
-)
+import transformer_engine.common.recipe
+from transformer_engine.common.recipe import Format

 import torch
 import torch.distributed as dist
@@ -43,7 +39,10 @@ def _parse_args(argv=None, namespace=None):
     parser.add_argument("--seq-length", type=int, default=128, help="Sequence length of input")
     parser.add_argument("--params-dtype", type=str, default="float32", help="Parameter dtype.")
     parser.add_argument(
-        "--fp8-init", action="store_true", default=False, help="Initialize primary weights in FP8."
+        "--fp8-init",
+        action="store_true",
+        default=False,
+        help="Initialize primary weights in FP8.",
     )
     parser.add_argument(
         "--recipe",
@@ -111,14 +110,7 @@ def get_te_layer_from_string(layer_name):


 def get_recipe_from_string(recipe, fp8_format=Format.HYBRID):
-    if recipe == "delayed_scaling":
-        return DelayedScaling(fp8_format=fp8_format, amax_history_len=16, amax_compute_algo="max")
-    elif recipe == "current_scaling":
-        return Float8CurrentScaling(fp8_format=fp8_format)
-    elif recipe == "mx_fp8_block_scaling":
-        return MXFP8BlockScaling(fp8_format=fp8_format)
-    else:
-        raise ValueError(f"Unknown quantizer type: {recipe}")
+    return getattr(transformer_engine.common.recipe, recipe)(fp8_format=fp8_format)


 def init_te_model(config):
@@ -292,13 +284,13 @@ def _train(args):
         build_model_context_args["enabled"] = True
         build_model_context_args["recipe"] = fp8_recipe

-    dist_print(f"Memory before model init: {torch.cuda.memory_allocated(device)/1e6} MB")
+    dist_print(f"Memory before model init: {torch.cuda.memory_allocated(device) / 1e6} MB")
     # Create the model on the meta/cuda device as per args
     with build_model_context(**build_model_context_args):
         model, inp_shape, out_shape = init_te_model(args)
     dist_print(
         f"Memory after model init on device {args.device}:"
-        f" {torch.cuda.memory_allocated(device)/1e6} MB"
+        f" {torch.cuda.memory_allocated(device) / 1e6} MB"
     )

     # Creating a DeviceMesh for fully_shard
@@ -319,7 +311,7 @@ def _train(args):
     dist_print(f" Sharded parameters materialized and initialized on cuda device.")

     dist_print(
-        f"FSDP2 model in cuda, memory allocated: {torch.cuda.memory_allocated(device)/1e6} MB"
+        f"FSDP2 model in cuda, memory allocated: {torch.cuda.memory_allocated(device) / 1e6} MB"
     )

     optimizer = optim.Adam(model.parameters(), lr=1e-3)
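
Note on the get_recipe_from_string change above: the explicit if/elif dispatch over snake_case names is replaced by a getattr lookup on transformer_engine.common.recipe, so the --recipe value must now be the recipe class name itself (e.g. DelayedScaling), DelayedScaling no longer receives the amax_history_len/amax_compute_algo arguments the old branch supplied, and an unknown name now surfaces as an AttributeError rather than the previous ValueError. A minimal, self-contained sketch of the lookup pattern — the "recipes" namespace below is a hypothetical stand-in for transformer_engine.common.recipe so the snippet runs without the library:

# Sketch of the getattr-based recipe dispatch; "recipes" is a stand-in module.
from dataclasses import dataclass
from types import SimpleNamespace


@dataclass
class DelayedScaling:
    fp8_format: str = "HYBRID"


@dataclass
class Float8CurrentScaling:
    fp8_format: str = "HYBRID"


recipes = SimpleNamespace(
    DelayedScaling=DelayedScaling,
    Float8CurrentScaling=Float8CurrentScaling,
)


def get_recipe_from_string(recipe, fp8_format="HYBRID"):
    """Look up a recipe class by its name and instantiate it."""
    try:
        recipe_cls = getattr(recipes, recipe)
    except AttributeError as exc:
        raise ValueError(f"Unknown recipe: {recipe}") from exc
    return recipe_cls(fp8_format=fp8_format)


print(get_recipe_from_string("DelayedScaling"))
# -> DelayedScaling(fp8_format='HYBRID')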

tests/pytorch/distributed/test_torch_fsdp2.py

Lines changed: 59 additions & 68 deletions
@@ -3,18 +3,47 @@
 # See LICENSE for license information.

 import os
-import pytest
 import subprocess
 from pathlib import Path
-import transformer_engine.pytorch as te

+import pytest
 import torch

+import transformer_engine.pytorch as te
+from transformer_engine.pytorch import fp8

-fp8_available, reason_for_no_fp8 = te.is_fp8_available(return_reason=True)
-mxfp8_available, reason_for_no_mxfp8 = te.is_mxfp8_available(return_reason=True)
 NUM_PROCS: int = torch.cuda.device_count()

+# Each entry: (recipe_class_name, hydra_overrides, check_fn)
+_FP8_RECIPE_CONFIGS = [
+    ("DelayedScaling", fp8.check_fp8_support),
+    ("Float8CurrentScaling", fp8.check_fp8_support),
+    ("Float8BlockScaling", fp8.check_fp8_block_scaling_support),
+    ("MXFP8BlockScaling", fp8.check_mxfp8_support),
+    ["NVFP4BlockScaling", fp8.check_nvfp4_support],
+]
+
+
+def _parametrize_fp8_recipes():
+    """Generate pytest.param objects with xfail marks for unsupported FP8 recipes."""
+    params = []
+    for name, check_fn in _FP8_RECIPE_CONFIGS:
+        supported, reason = check_fn()
+        params.append(
+            pytest.param(
+                name,
+                id=name,
+                marks=pytest.mark.xfail(condition=not supported, reason=reason),
+            )
+        )
+    return params
+
+
+@pytest.fixture(params=_parametrize_fp8_recipes())
+def fp_recipe(request):
+    """Parametrized fixture providing FP8 recipe Hydra overrides for each supported TE recipe."""
+    return request.param
+

 def _run_test(fp_init, sharding_dims, recipe, layer_type):
     test_path = Path(__file__).parent.resolve() / "run_fsdp2_model.py"
@@ -32,28 +61,17 @@ def _run_test(fp_init, sharding_dims, recipe, layer_type):
     test_cmd += ["--recipe", recipe]
     test_cmd += ["--layer-type", layer_type]

-    result = subprocess.run(test_cmd, env=os.environ, check=True)
+    subprocess.run(test_cmd, env=os.environ, check=True)


 @pytest.mark.skipif(NUM_PROCS < 4, reason="Requires 4+ GPUs")
 @pytest.mark.skipif(NUM_PROCS % 2 != 0, reason="Requires even number of GPUs")
 @pytest.mark.skipif(not te.torch_version() >= (2, 4, 0), reason="Requires PyTorch 2.4.0+")
 @pytest.mark.parametrize("sharding_dims", ([NUM_PROCS], [2, NUM_PROCS // 2]))
 @pytest.mark.parametrize("fp8_init", (False, True))
-@pytest.mark.parametrize("recipe", ("delayed_scaling", "current_scaling", "mx_fp8_block_scaling"))
 @pytest.mark.parametrize("layer_type", ("LayerNormLinear", "TransformerLayer"))
-def test_distributed(fp8_init, sharding_dims, recipe, layer_type):
-
-    # Skip invalid configurations
-    if torch.cuda.device_count() < 4:
-        pytest.skip("FSDP2 test requires at least 4 GPUs")
-
-    if recipe == "mx_fp8_block_scaling" and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    elif not fp8_available:
-        pytest.skip(reason_for_no_fp8)
-
-    _run_test(fp8_init, sharding_dims, recipe, layer_type)
+def test_distributed(fp8_init, sharding_dims, fp_recipe, layer_type):
+    _run_test(fp8_init, sharding_dims, fp_recipe, layer_type)


 ## ── FusedAdam + FSDP2 tests ─────────────────────────────────────────
@@ -77,80 +95,48 @@ def _run_fused_adam_test(test_name, recipe="delayed_scaling"):


 @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs")
-@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
-@pytest.mark.parametrize("recipe", ("delayed_scaling", "current_scaling", "mx_fp8_block_scaling"))
-def test_fsdp2_fused_adam_fp8_master_weights(recipe):
+def test_fsdp2_fused_adam_fp8_master_weights(fp_recipe):
     """FusedAdam(master_weights=True) + FSDP2 + quantized_model_init."""
-    if recipe == "mx_fp8_block_scaling" and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    _run_fused_adam_test("fused_adam_fp8_master_weights", recipe)
+    _run_fused_adam_test("fused_adam_fp8_master_weights", fp_recipe)


 @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs")
-@pytest.mark.parametrize("recipe", ("delayed_scaling", "current_scaling", "mx_fp8_block_scaling"))
-def test_fsdp2_fused_adam_bf16(recipe):
+def test_fsdp2_fused_adam_bf16(fp_recipe):
     """FusedAdam(master_weights=True) + FSDP2 + bf16 params (no FP8)."""
-    if recipe == "mx_fp8_block_scaling" and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    elif not fp8_available:
-        pytest.skip(reason_for_no_fp8)
-    _run_fused_adam_test("fused_adam_bf16", recipe)
+    _run_fused_adam_test("fused_adam_bf16", fp_recipe)


 @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs")
-@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
-@pytest.mark.parametrize("recipe", ("delayed_scaling", "current_scaling", "mx_fp8_block_scaling"))
-def test_fsdp2_fused_adam_fp8_no_master(recipe):
+def test_fsdp2_fused_adam_fp8_no_master(fp_recipe):
     """FusedAdam(master_weights=False) + FSDP2 + FP8 params."""
-    if recipe == "mx_fp8_block_scaling" and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    _run_fused_adam_test("fused_adam_fp8_no_master", recipe)
+    _run_fused_adam_test("fused_adam_fp8_no_master", fp_recipe)


 @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs")
-@pytest.mark.parametrize("recipe", ("delayed_scaling", "current_scaling", "mx_fp8_block_scaling"))
-def test_fsdp2_fused_adam_bf16_store_param_remainders(recipe):
+def test_fsdp2_fused_adam_bf16_store_param_remainders(fp_recipe):
     """FusedAdam(master_weights=True, store_param_remainders=True) + FSDP2 + bf16."""
-    if recipe == "mx_fp8_block_scaling" and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    elif not fp8_available:
-        pytest.skip(reason_for_no_fp8)
-    _run_fused_adam_test("fused_adam_bf16_store_param_remainders", recipe)
+    _run_fused_adam_test("fused_adam_bf16_store_param_remainders", fp_recipe)


 @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs")
-@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
-@pytest.mark.parametrize("recipe", ("delayed_scaling", "current_scaling", "mx_fp8_block_scaling"))
-def test_fsdp2_dcp_save_load(recipe):
+def test_fsdp2_dcp_save_load(fp_recipe):
     """Distributed checkpoint save/load with FSDP2 + FP8 + FusedAdam."""
-    if recipe == "mx_fp8_block_scaling" and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    _run_fused_adam_test("dcp_save_load", recipe)
+    _run_fused_adam_test("dcp_save_load", fp_recipe)


 @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs")
-@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
-@pytest.mark.parametrize("recipe", ("delayed_scaling", "current_scaling", "mx_fp8_block_scaling"))
-def test_fsdp2_dcp_output_parity(recipe):
+def test_fsdp2_dcp_output_parity(fp_recipe):
     """DCP save/load round-trip into a fresh model produces identical outputs."""
-    if recipe == "mx_fp8_block_scaling" and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    _run_fused_adam_test("dcp_output_parity", recipe)
+    _run_fused_adam_test("dcp_output_parity", fp_recipe)


 @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs")
-@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
-@pytest.mark.parametrize("recipe", ("delayed_scaling", "current_scaling", "mx_fp8_block_scaling"))
-def test_fsdp2_safetensors_fp32_export(recipe):
+def test_fsdp2_safetensors_fp32_export(fp_recipe):
     """Export FP32 model from optimizer master weights to safetensors."""
-    if recipe == "mx_fp8_block_scaling" and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    _run_fused_adam_test("safetensors_fp32_export", recipe)
+    _run_fused_adam_test("safetensors_fp32_export", fp_recipe)


 @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs")
-@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
-@pytest.mark.parametrize("recipe", ("delayed_scaling", "current_scaling", "mx_fp8_block_scaling"))
 @pytest.mark.xfail(
     reason=(
         "fuse_wgrad_accumulation is incompatible with vanilla FSDP2: "
@@ -161,11 +147,9 @@ def test_fsdp2_safetensors_fp32_export(recipe):
     raises=subprocess.CalledProcessError,
     strict=True,
 )
-def test_fsdp2_fuse_wgrad_accumulation(recipe):
+def test_fsdp2_fuse_wgrad_accumulation(fp_recipe):
     """fuse_wgrad_accumulation=True + FSDP2 -- expected to fail."""
-    if recipe == "mx_fp8_block_scaling" and not mxfp8_available:
-        pytest.skip(reason_for_no_mxfp8)
-    _run_fused_adam_test("fuse_wgrad_accumulation", recipe)
+    _run_fused_adam_test("fuse_wgrad_accumulation", fp_recipe)


 def test_dummy() -> None:
@@ -175,3 +159,10 @@ def test_dummy() -> None:

     """
     pass
+
+
+"""
+TODO:
+- async DCP tests
+
+"""

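The test-side change above folds the per-test skip logic into a single parametrized fp_recipe fixture whose entries carry xfail marks when the corresponding hardware check fails. A small, self-contained sketch of that pattern follows; the check functions and feature names are hypothetical stand-ins for calls such as fp8.check_fp8_support(), each assumed to return a (supported, reason) pair:

# Sketch of the capability-gated fixture pattern; run with pytest.
import pytest


def _check_supported():
    # Stand-in for a real capability probe, e.g. fp8.check_fp8_support().
    return True, ""


def _check_unsupported():
    return False, "required hardware feature not available"


# Each entry: (feature_name, check_fn)
_FEATURE_CONFIGS = [
    ("FeatureA", _check_supported),
    ("FeatureB", _check_unsupported),
]


def _parametrize_features():
    """Build pytest.param entries, marking unsupported features as xfail."""
    params = []
    for name, check_fn in _FEATURE_CONFIGS:
        supported, reason = check_fn()
        params.append(
            pytest.param(
                name,
                id=name,
                marks=pytest.mark.xfail(condition=not supported, reason=reason),
            )
        )
    return params


@pytest.fixture(params=_parametrize_features())
def feature(request):
    return request.param


def _use_feature(name):
    # Stand-in for launching the real distributed test; fails when unsupported.
    supported, reason = dict(_FEATURE_CONFIGS)[name]()
    if not supported:
        raise RuntimeError(reason)


def test_feature(feature):
    # "FeatureB" raises here, so it is reported as an expected failure (xfail).
    _use_feature(feature)

Compared with the removed pytest.skip guards, unsupported recipes stay visible in the test report as expected failures, and the duplicated per-test boilerplate disappears.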