diff --git a/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml b/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml index 8f4943d7..f18523ed 100644 --- a/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml +++ b/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml @@ -341,12 +341,90 @@ mig-configs: "1g.24gb": 2 "2g.48gb": 1 + # P6-B300 (Blackwell Ultra, 288GB HBM3e, ~269GB usable) profiles + # Profiles: 1g.34gb (x7), 1g.67gb (x4), 2g.67gb (x3), 3g.135gb (x2), 4g.135gb (x1), 7g.269gb (x1) + # Upstream ref: NVIDIA GPU Operator v25.3.0, device-filter 0x318210DE + + all-1g.34gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 7 + all-1g.67gb: - devices: all mig-enabled: true mig-devices: "1g.67gb": 4 + all-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "2g.67gb": 3 + + all-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "3g.135gb": 2 + + all-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "4g.135gb": 1 + + all-7g.269gb: + - devices: all + mig-enabled: true + mig-devices: + "7g.269gb": 1 + + mixed-1-3g.135gb-1-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "3g.135gb": 1 + "4g.135gb": 1 + + mixed-1-1g.34gb-1-2g.67gb-1-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 1 + "2g.67gb": 1 + "4g.135gb": 1 + + mixed-3-1g.34gb-1-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 3 + "4g.135gb": 1 + + mixed-1-1g.34gb-1-2g.67gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 1 + "2g.67gb": 1 + "3g.135gb": 1 + + mixed-3-1g.34gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 3 + "3g.135gb": 1 + + mixed-2-2g.67gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "2g.67gb": 2 + "3g.135gb": 1 + mixed-2-1g.34gb-1-2g.67gb-1-3g.135gb: - devices: all mig-enabled: true @@ -354,3 +432,31 @@ mig-configs: "1g.34gb": 2 "2g.67gb": 1 "3g.135gb": 1 + + mixed-4-1g.34gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 4 + "3g.135gb": 1 + + mixed-1-1g.34gb-3-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 1 + "2g.67gb": 3 + + mixed-3-1g.34gb-2-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 3 + "2g.67gb": 2 + + mixed-5-1g.34gb-1-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 5 + "2g.67gb": 1 diff --git a/src/sagemaker/hyperpod/training/constants.py b/src/sagemaker/hyperpod/training/constants.py index 29f58fa8..59230365 100644 --- a/src/sagemaker/hyperpod/training/constants.py +++ b/src/sagemaker/hyperpod/training/constants.py @@ -132,6 +132,7 @@ 'ml.p5e.48xlarge': ['mig-1g.18gb', 'mig-1g.35gb', 'mig-2g.35gb', 'mig-3g.71gb', 'mig-4g.71gb', 'mig-7g.141gb'], 'ml.p5en.48xlarge': ['mig-1g.18gb', 'mig-1g.35gb', 'mig-2g.35gb', 'mig-3g.71gb', 'mig-4g.71gb', 'mig-7g.141gb'], 'p6-b200.48xlarge': ['mig-1g.23gb', 'mig-1g.45gb', 'mig-2g.45gb', 'mig-3g.90gb', 'mig-4g.90gb', 'mig-7g.180gb'], + 'ml.p6-b300.48xlarge': ['mig-1g.34gb', 'mig-1g.67gb', 'mig-2g.67gb', 'mig-3g.135gb', 'mig-4g.135gb', 'mig-7g.269gb'], 'ml.p6e-gb200.36xlarge': ['mig-1g.23gb', 'mig-1g.47gb', 'mig-2g.47gb', 'mig-3g.93gb', 'mig-4g.93gb', 'mig-7g.186gb'], 'ml.g7e.2xlarge': ['mig-1g.24gb', 'mig-2g.48gb', 'mig-4g.96gb'], 'ml.g7e.4xlarge': ['mig-1g.24gb', 'mig-2g.48gb', 'mig-4g.96gb'], diff --git a/test/unit_tests/cli/test_accelerator_partition_util.py b/test/unit_tests/cli/test_accelerator_partition_util.py index b43a44ea..296fe65c 100644 --- a/test/unit_tests/cli/test_accelerator_partition_util.py +++ b/test/unit_tests/cli/test_accelerator_partition_util.py @@ -1,9 +1,11 @@ from sagemaker.hyperpod.training.accelerator_partition_util import ( _extract_gpu_slices_from_accelerator_partition_type, _get_accelerator_partition, + _get_accelerator_partition_defaults, _set_default_accelerator_partition_val, _validate_accelerator_partition, ) +from sagemaker.hyperpod.training.constants import INSTANCE_TYPE_MIG_PROFILES import pytest from unittest.mock import patch, MagicMock @@ -73,15 +75,55 @@ def test_set_default_accelerator_partition_values(self, input_count, input_limit ("mig-1g.5gb", None, None, 2, "ml.p4d.24xlarge", False, lambda e: "accelerator_partition_type cannot be used together with node_count." == e), # Invalid instance type combination ("mig-1g.5gb", None, None, None, "ml.c5.large", False, lambda e: "does not support accelerator partitions" in e), + # B200: valid profile accepted (requires #399 for ml. prefix fix) + ("mig-1g.23gb", None, None, None, "ml.p6-b200.48xlarge", True, lambda e: e == ""), + # B200: cross-architecture profile rejected + ("mig-1g.5gb", None, None, None, "ml.p6-b200.48xlarge", False, lambda e: "not supported on instance type" in e), + # B300: valid profile accepted + ("mig-1g.34gb", None, None, None, "ml.p6-b300.48xlarge", True, lambda e: e == ""), + # B300: cross-architecture profile rejected + ("mig-1g.5gb", None, None, None, "ml.p6-b300.48xlarge", False, lambda e: "not supported on instance type" in e), ] ) @patch('sagemaker.hyperpod.training.accelerator_partition_util.KubernetesClient') def test_validate_accelerator_partition_fields(self, mock_k8s_client, partition_type, accelerators, accelerators_limit, node_count, instance_type, expected_valid, error_check): - # Mock cluster to have no MIG resources for most tests mock_node = MagicMock() - mock_node.status.allocatable = {} + allocatable = {f"nvidia.com/{partition_type}": "1"} if expected_valid and partition_type else {} + mock_node.status.allocatable = allocatable mock_k8s_client.return_value.get_core_v1_api.return_value.list_node.return_value.items = [mock_node] valid, error = _validate_accelerator_partition(partition_type, accelerators, accelerators_limit, node_count, instance_type) assert valid is expected_valid assert error_check(error) + + @pytest.mark.parametrize( + "instance_type", + list(INSTANCE_TYPE_MIG_PROFILES.keys()), + ) + def test_instance_type_profiles_not_empty(self, instance_type): + """Every instance type in the MIG mapping must have at least one profile.""" + assert len(INSTANCE_TYPE_MIG_PROFILES[instance_type]) > 0 + + @pytest.mark.parametrize( + "instance_type,partition_type,partition_count,expected_cpu,expected_memory", + [ + # One representative profile per MIG-capable instance type (smallest profile, max count). + # Guards that INSTANCE_RESOURCES has correct cpu/gpu/memory for each instance type. + ("ml.p4d.24xlarge", "mig-1g.5gb", 7, "12.0", "144.0Gi"), + ("ml.p4de.24xlarge", "mig-1g.10gb", 7, "12.0", "144.0Gi"), + ("ml.p5.48xlarge", "mig-1g.10gb", 7, "24.0", "256.0Gi"), + ("ml.p5e.48xlarge", "mig-1g.18gb", 7, "24.0", "256.0Gi"), + ("ml.p5en.48xlarge", "mig-1g.18gb", 7, "24.0", "256.0Gi"), + ("ml.p6-b200.48xlarge", "mig-1g.23gb", 7, "24.0", "256.0Gi"), # requires #399 + ("ml.p6-b300.48xlarge", "mig-1g.34gb", 7, "24.0", "512.0Gi"), + ("ml.p6e-gb200.36xlarge", "mig-1g.23gb", 7, "36.0", "240.0Gi"), + ("ml.g7e.48xlarge", "mig-1g.24gb", 4, "13.0", "146.0Gi"), + ], + ) + def test_accelerator_partition_defaults(self, instance_type, partition_type, partition_count, expected_cpu, expected_memory): + """Verify CPU/memory defaults for one profile per MIG-capable instance type.""" + defaults = _get_accelerator_partition_defaults( + instance_type, partition_type, partition_count + ) + assert defaults["cpu"] == expected_cpu + assert defaults["memory"] == expected_memory