Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -341,16 +341,122 @@ mig-configs:
"1g.24gb": 2
"2g.48gb": 1

# P6-B300 (Blackwell Ultra, 288GB HBM3e, ~269GB usable) profiles
# Profiles: 1g.34gb (x7), 1g.67gb (x4), 2g.67gb (x3), 3g.135gb (x2), 4g.135gb (x1), 7g.269gb (x1)
# Upstream ref: NVIDIA GPU Operator v25.3.0, device-filter 0x318210DE

all-1g.34gb:
- devices: all
mig-enabled: true
mig-devices:
"1g.34gb": 7

all-1g.67gb:
- devices: all
mig-enabled: true
mig-devices:
"1g.67gb": 4

all-2g.67gb:
- devices: all
mig-enabled: true
mig-devices:
"2g.67gb": 3

all-3g.135gb:
- devices: all
mig-enabled: true
mig-devices:
"3g.135gb": 2

all-4g.135gb:
- devices: all
mig-enabled: true
mig-devices:
"4g.135gb": 1

all-7g.269gb:
- devices: all
mig-enabled: true
mig-devices:
"7g.269gb": 1

mixed-1-3g.135gb-1-4g.135gb:
- devices: all
mig-enabled: true
mig-devices:
"3g.135gb": 1
"4g.135gb": 1

mixed-1-1g.34gb-1-2g.67gb-1-4g.135gb:
- devices: all
mig-enabled: true
mig-devices:
"1g.34gb": 1
"2g.67gb": 1
"4g.135gb": 1

mixed-3-1g.34gb-1-4g.135gb:
- devices: all
mig-enabled: true
mig-devices:
"1g.34gb": 3
"4g.135gb": 1

mixed-1-1g.34gb-1-2g.67gb-1-3g.135gb:
- devices: all
mig-enabled: true
mig-devices:
"1g.34gb": 1
"2g.67gb": 1
"3g.135gb": 1

mixed-3-1g.34gb-1-3g.135gb:
- devices: all
mig-enabled: true
mig-devices:
"1g.34gb": 3
"3g.135gb": 1

mixed-2-2g.67gb-1-3g.135gb:
- devices: all
mig-enabled: true
mig-devices:
"2g.67gb": 2
"3g.135gb": 1

mixed-2-1g.34gb-1-2g.67gb-1-3g.135gb:
- devices: all
mig-enabled: true
mig-devices:
"1g.34gb": 2
"2g.67gb": 1
"3g.135gb": 1

mixed-4-1g.34gb-1-3g.135gb:
- devices: all
mig-enabled: true
mig-devices:
"1g.34gb": 4
"3g.135gb": 1

mixed-1-1g.34gb-3-2g.67gb:
- devices: all
mig-enabled: true
mig-devices:
"1g.34gb": 1
"2g.67gb": 3

mixed-3-1g.34gb-2-2g.67gb:
- devices: all
mig-enabled: true
mig-devices:
"1g.34gb": 3
"2g.67gb": 2

mixed-5-1g.34gb-1-2g.67gb:
- devices: all
mig-enabled: true
mig-devices:
"1g.34gb": 5
"2g.67gb": 1
1 change: 1 addition & 0 deletions src/sagemaker/hyperpod/training/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@
'ml.p5e.48xlarge': ['mig-1g.18gb', 'mig-1g.35gb', 'mig-2g.35gb', 'mig-3g.71gb', 'mig-4g.71gb', 'mig-7g.141gb'],
'ml.p5en.48xlarge': ['mig-1g.18gb', 'mig-1g.35gb', 'mig-2g.35gb', 'mig-3g.71gb', 'mig-4g.71gb', 'mig-7g.141gb'],
'p6-b200.48xlarge': ['mig-1g.23gb', 'mig-1g.45gb', 'mig-2g.45gb', 'mig-3g.90gb', 'mig-4g.90gb', 'mig-7g.180gb'],
'ml.p6-b300.48xlarge': ['mig-1g.34gb', 'mig-1g.67gb', 'mig-2g.67gb', 'mig-3g.135gb', 'mig-4g.135gb', 'mig-7g.269gb'],
'ml.p6e-gb200.36xlarge': ['mig-1g.23gb', 'mig-1g.47gb', 'mig-2g.47gb', 'mig-3g.93gb', 'mig-4g.93gb', 'mig-7g.186gb'],
'ml.g7e.2xlarge': ['mig-1g.24gb', 'mig-2g.48gb', 'mig-4g.96gb'],
'ml.g7e.4xlarge': ['mig-1g.24gb', 'mig-2g.48gb', 'mig-4g.96gb'],
Expand Down
46 changes: 44 additions & 2 deletions test/unit_tests/cli/test_accelerator_partition_util.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from sagemaker.hyperpod.training.accelerator_partition_util import (
_extract_gpu_slices_from_accelerator_partition_type,
_get_accelerator_partition,
_get_accelerator_partition_defaults,
_set_default_accelerator_partition_val,
_validate_accelerator_partition,
)
from sagemaker.hyperpod.training.constants import INSTANCE_TYPE_MIG_PROFILES
import pytest
from unittest.mock import patch, MagicMock

Expand Down Expand Up @@ -73,15 +75,55 @@ def test_set_default_accelerator_partition_values(self, input_count, input_limit
("mig-1g.5gb", None, None, 2, "ml.p4d.24xlarge", False, lambda e: "accelerator_partition_type cannot be used together with node_count." == e),
# Invalid instance type combination
("mig-1g.5gb", None, None, None, "ml.c5.large", False, lambda e: "does not support accelerator partitions" in e),
# B200: valid profile accepted (requires #399 for ml. prefix fix)
("mig-1g.23gb", None, None, None, "ml.p6-b200.48xlarge", True, lambda e: e == ""),
# B200: cross-architecture profile rejected
("mig-1g.5gb", None, None, None, "ml.p6-b200.48xlarge", False, lambda e: "not supported on instance type" in e),
# B300: valid profile accepted
("mig-1g.34gb", None, None, None, "ml.p6-b300.48xlarge", True, lambda e: e == ""),
# B300: cross-architecture profile rejected
("mig-1g.5gb", None, None, None, "ml.p6-b300.48xlarge", False, lambda e: "not supported on instance type" in e),
]
)
@patch('sagemaker.hyperpod.training.accelerator_partition_util.KubernetesClient')
def test_validate_accelerator_partition_fields(self, mock_k8s_client, partition_type, accelerators, accelerators_limit, node_count, instance_type, expected_valid, error_check):
    """Check the (valid, error) pair returned by _validate_accelerator_partition.

    Each parametrized case supplies the validator arguments, the expected
    validity flag, and ``error_check``, a predicate applied to the returned
    error message.
    """
    # Only cases that are expected to validate get a mocked node advertising
    # the requested MIG resource; every other case sees a node with no
    # allocatable resources at all.
    mock_node = MagicMock()
    mock_node.status.allocatable = (
        {f"nvidia.com/{partition_type}": "1"} if expected_valid and partition_type else {}
    )
    mock_k8s_client.return_value.get_core_v1_api.return_value.list_node.return_value.items = [mock_node]

    valid, error = _validate_accelerator_partition(partition_type, accelerators, accelerators_limit, node_count, instance_type)
    assert valid is expected_valid
    assert error_check(error)

@pytest.mark.parametrize("instance_type", list(INSTANCE_TYPE_MIG_PROFILES))
def test_instance_type_profiles_not_empty(self, instance_type):
    """Each key of INSTANCE_TYPE_MIG_PROFILES must map to a non-empty profile list."""
    profiles = INSTANCE_TYPE_MIG_PROFILES[instance_type]
    assert len(profiles) > 0

@pytest.mark.parametrize(
    "instance_type,partition_type,partition_count,expected_cpu,expected_memory",
    [
        # Representative rows: a MIG-capable instance type paired with its
        # smallest profile at the maximum partition count. The expected values
        # guard the cpu/gpu/memory figures recorded in INSTANCE_RESOURCES.
        ("ml.p4d.24xlarge", "mig-1g.5gb", 7, "12.0", "144.0Gi"),
        ("ml.p4de.24xlarge", "mig-1g.10gb", 7, "12.0", "144.0Gi"),
        ("ml.p5.48xlarge", "mig-1g.10gb", 7, "24.0", "256.0Gi"),
        ("ml.p5e.48xlarge", "mig-1g.18gb", 7, "24.0", "256.0Gi"),
        ("ml.p5en.48xlarge", "mig-1g.18gb", 7, "24.0", "256.0Gi"),
        ("ml.p6-b200.48xlarge", "mig-1g.23gb", 7, "24.0", "256.0Gi"),  # requires #399
        ("ml.p6-b300.48xlarge", "mig-1g.34gb", 7, "24.0", "512.0Gi"),
        ("ml.p6e-gb200.36xlarge", "mig-1g.23gb", 7, "36.0", "240.0Gi"),
        ("ml.g7e.48xlarge", "mig-1g.24gb", 4, "13.0", "146.0Gi"),
    ],
)
def test_accelerator_partition_defaults(self, instance_type, partition_type, partition_count, expected_cpu, expected_memory):
    """Compare the computed "cpu" and "memory" defaults with the expected strings."""
    resolved = _get_accelerator_partition_defaults(instance_type, partition_type, partition_count)

    actual_cpu = resolved["cpu"]
    assert actual_cpu == expected_cpu

    actual_memory = resolved["memory"]
    assert actual_memory == expected_memory
Loading