From c98fd6e41b2d14e2ccc723f44d47846f25a177b8 Mon Sep 17 00:00:00 2001 From: Keita Watanabe Date: Fri, 27 Mar 2026 12:20:53 +0000 Subject: [PATCH 1/6] Add MIG profile support for ml.p6-b300.48xlarge (Blackwell Ultra) Add ml.p6-b300.48xlarge to INSTANCE_TYPE_MIG_PROFILES in constants.py with the correct B300 MIG profiles derived from the NVIDIA GPU Operator v25.3.0 upstream ConfigMap (device-filter 0x318210DE): - mig-1g.34gb, mig-1g.67gb, mig-2g.67gb - mig-3g.135gb, mig-4g.135gb, mig-7g.269gb Also add the corresponding uniform and mixed MIG partition profiles to the Helm chart default-mig-config.yaml ConfigMap, following the same pattern used for existing GPU types (H100, H200, B200). The B300 GPU (288GB HBM3e, ~269GB usable) was already registered in INSTANCE_RESOURCES but had no MIG profile mapping, causing HyperPod MIG validation to reject accelerator partition requests on this instance type. --- .../config/default-mig-config.yaml | 106 ++++++++++++++++++ src/sagemaker/hyperpod/training/constants.py | 1 + 2 files changed, 107 insertions(+) diff --git a/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml b/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml index 8f4943d7..f18523ed 100644 --- a/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml +++ b/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml @@ -341,12 +341,90 @@ mig-configs: "1g.24gb": 2 "2g.48gb": 1 + # P6-B300 (Blackwell Ultra, 288GB HBM3e, ~269GB usable) profiles + # Profiles: 1g.34gb (x7), 1g.67gb (x4), 2g.67gb (x3), 3g.135gb (x2), 4g.135gb (x1), 7g.269gb (x1) + # Upstream ref: NVIDIA GPU Operator v25.3.0, device-filter 0x318210DE + + all-1g.34gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 7 + all-1g.67gb: - devices: all mig-enabled: true mig-devices: "1g.67gb": 4 + all-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "2g.67gb": 3 + + all-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "3g.135gb": 2 + + all-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "4g.135gb": 1 + + all-7g.269gb: + - devices: all + mig-enabled: true + mig-devices: + "7g.269gb": 1 + + mixed-1-3g.135gb-1-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "3g.135gb": 1 + "4g.135gb": 1 + + mixed-1-1g.34gb-1-2g.67gb-1-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 1 + "2g.67gb": 1 + "4g.135gb": 1 + + mixed-3-1g.34gb-1-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 3 + "4g.135gb": 1 + + mixed-1-1g.34gb-1-2g.67gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 1 + "2g.67gb": 1 + "3g.135gb": 1 + + mixed-3-1g.34gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 3 + "3g.135gb": 1 + + mixed-2-2g.67gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "2g.67gb": 2 + "3g.135gb": 1 + mixed-2-1g.34gb-1-2g.67gb-1-3g.135gb: - devices: all mig-enabled: true @@ -354,3 +432,31 @@ mig-configs: "1g.34gb": 2 "2g.67gb": 1 "3g.135gb": 1 + + mixed-4-1g.34gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 4 + "3g.135gb": 1 + + mixed-1-1g.34gb-3-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 1 + "2g.67gb": 3 + + mixed-3-1g.34gb-2-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 3 + "2g.67gb": 2 + + mixed-5-1g.34gb-1-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 5 + "2g.67gb": 1 diff --git a/src/sagemaker/hyperpod/training/constants.py b/src/sagemaker/hyperpod/training/constants.py index 29f58fa8..59230365 100644 --- a/src/sagemaker/hyperpod/training/constants.py +++ b/src/sagemaker/hyperpod/training/constants.py @@ -132,6 +132,7 @@ 'ml.p5e.48xlarge': ['mig-1g.18gb', 'mig-1g.35gb', 'mig-2g.35gb', 'mig-3g.71gb', 'mig-4g.71gb', 'mig-7g.141gb'], 'ml.p5en.48xlarge': ['mig-1g.18gb', 'mig-1g.35gb', 'mig-2g.35gb', 'mig-3g.71gb', 'mig-4g.71gb', 'mig-7g.141gb'], 'p6-b200.48xlarge': ['mig-1g.23gb', 'mig-1g.45gb', 'mig-2g.45gb', 'mig-3g.90gb', 'mig-4g.90gb', 'mig-7g.180gb'], + 'ml.p6-b300.48xlarge': ['mig-1g.34gb', 'mig-1g.67gb', 'mig-2g.67gb', 'mig-3g.135gb', 'mig-4g.135gb', 'mig-7g.269gb'], 'ml.p6e-gb200.36xlarge': ['mig-1g.23gb', 'mig-1g.47gb', 'mig-2g.47gb', 'mig-3g.93gb', 'mig-4g.93gb', 'mig-7g.186gb'], 'ml.g7e.2xlarge': ['mig-1g.24gb', 'mig-2g.48gb', 'mig-4g.96gb'], 'ml.g7e.4xlarge': ['mig-1g.24gb', 'mig-2g.48gb', 'mig-4g.96gb'], From cd5bc5db0e838cea6e18337d14f49760d68ec0ae Mon Sep 17 00:00:00 2001 From: Keita Watanabe Date: Sat, 28 Mar 2026 00:06:36 +0000 Subject: [PATCH 2/6] Add unit tests for B300 MIG profile validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Covers ml.p6-b300.48xlarge MIG profile support added in PR #398: - Profile presence in INSTANCE_TYPE_MIG_PROFILES - Complete profile list verification (6 profiles) - All profiles in ALLOWED_ACCELERATOR_PARTITION_TYPES - GPU slice extraction for all B300 profiles (1g→1, 2g→2, ..., 7g→7) - CPU/memory default calculation for each profile at max instances - Validation acceptance for valid B300 profiles - Validation rejection for invalid profiles on B300 instance type --- .../cli/test_accelerator_partition_util.py | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/test/unit_tests/cli/test_accelerator_partition_util.py b/test/unit_tests/cli/test_accelerator_partition_util.py index b43a44ea..56b1c4db 100644 --- a/test/unit_tests/cli/test_accelerator_partition_util.py +++ b/test/unit_tests/cli/test_accelerator_partition_util.py @@ -1,9 +1,14 @@ from sagemaker.hyperpod.training.accelerator_partition_util import ( _extract_gpu_slices_from_accelerator_partition_type, _get_accelerator_partition, + _get_accelerator_partition_defaults, _set_default_accelerator_partition_val, _validate_accelerator_partition, ) +from sagemaker.hyperpod.training.constants import ( + ALLOWED_ACCELERATOR_PARTITION_TYPES, + INSTANCE_TYPE_MIG_PROFILES, +) import pytest from unittest.mock import patch, MagicMock @@ -85,3 +90,85 @@ def test_validate_accelerator_partition_fields(self, mock_k8s_client, partition_ valid, error = _validate_accelerator_partition(partition_type, accelerators, accelerators_limit, node_count, instance_type) assert valid is expected_valid assert error_check(error) + + +class TestB300MigProfiles: + """Tests for NVIDIA B300 (Blackwell Ultra) MIG profile support.""" + + def test_b300_in_instance_type_mig_profiles(self): + assert "ml.p6-b300.48xlarge" in INSTANCE_TYPE_MIG_PROFILES + + def test_b300_profiles_complete(self): + profiles = INSTANCE_TYPE_MIG_PROFILES["ml.p6-b300.48xlarge"] + expected = [ + "mig-1g.34gb", + "mig-1g.67gb", + "mig-2g.67gb", + "mig-3g.135gb", + "mig-4g.135gb", + "mig-7g.269gb", + ] + assert profiles == expected + + def test_b300_profiles_in_allowed_set(self): + for profile in INSTANCE_TYPE_MIG_PROFILES["ml.p6-b300.48xlarge"]: + assert profile in ALLOWED_ACCELERATOR_PARTITION_TYPES + + @pytest.mark.parametrize( + "partition_type,expected_slices", + [ + ("mig-1g.34gb", 1), + ("mig-2g.67gb", 2), + ("mig-3g.135gb", 3), + ("mig-4g.135gb", 4), + ("mig-7g.269gb", 7), + ], + ) + def test_extract_gpu_slices_b300(self, partition_type, expected_slices): + assert _extract_gpu_slices_from_accelerator_partition_type(partition_type) == expected_slices + + @pytest.mark.parametrize( + "partition_type,partition_count", + [ + ("mig-1g.34gb", 7), + ("mig-1g.67gb", 4), + ("mig-2g.67gb", 3), + ("mig-3g.135gb", 2), + ("mig-4g.135gb", 1), + ("mig-7g.269gb", 1), + ], + ) + def test_accelerator_partition_defaults_b300(self, partition_type, partition_count): + """Verify CPU/memory defaults are calculated proportionally for B300 MIG profiles.""" + defaults = _get_accelerator_partition_defaults( + "ml.p6-b300.48xlarge", partition_type, partition_count + ) + assert "cpu" in defaults + assert "memory" in defaults + assert float(defaults["cpu"]) > 0 + assert float(defaults["memory"].replace("Gi", "")) > 0 + + @pytest.mark.parametrize( + "partition_type,expected_valid,error_check", + [ + ("mig-1g.34gb", True, lambda e: e == ""), + ("mig-3g.135gb", True, lambda e: e == ""), + ("mig-7g.269gb", True, lambda e: e == ""), + ("mig-1g.5gb", False, lambda e: "not supported on instance type" in e), + ], + ) + @patch("sagemaker.hyperpod.training.accelerator_partition_util.KubernetesClient") + def test_validate_b300_partition( + self, mock_k8s_client, partition_type, expected_valid, error_check + ): + mock_node = MagicMock() + mock_node.status.allocatable = {f"nvidia.com/{partition_type}": "1"} + mock_k8s_client.return_value.get_core_v1_api.return_value.list_node.return_value.items = [ + mock_node + ] + + valid, error = _validate_accelerator_partition( + partition_type, None, None, None, "ml.p6-b300.48xlarge" + ) + assert valid is expected_valid + assert error_check(error) From e27a807754c52a1d23cb70f1646ee65a33473181 Mon Sep 17 00:00:00 2001 From: Keita Watanabe Date: Sat, 28 Mar 2026 00:25:11 +0000 Subject: [PATCH 3/6] Remove redundant tests and strengthen assertions - Delete test_b300_in_instance_type_mig_profiles (subsumed by test_b300_profiles_complete which KeyErrors on missing key) - Delete test_b300_profiles_in_allowed_set (tautological: the allowed set is computed as union of all profile values) - Delete test_extract_gpu_slices_b300 (instance-type-agnostic regex already covered by existing parametrized tests) - Replace > 0 assertions with exact expected values in test_accelerator_partition_defaults_b300 - Fix misleading mock in test_validate_b300_partition: use empty allocatable for the invalid-profile case since validation fails at static parameter check before cluster check - Remove unused ALLOWED_ACCELERATOR_PARTITION_TYPES import --- .../cli/test_accelerator_partition_util.py | 54 ++++++------------- 1 file changed, 15 insertions(+), 39 deletions(-) diff --git a/test/unit_tests/cli/test_accelerator_partition_util.py b/test/unit_tests/cli/test_accelerator_partition_util.py index 56b1c4db..7e8076d7 100644 --- a/test/unit_tests/cli/test_accelerator_partition_util.py +++ b/test/unit_tests/cli/test_accelerator_partition_util.py @@ -5,10 +5,7 @@ _set_default_accelerator_partition_val, _validate_accelerator_partition, ) -from sagemaker.hyperpod.training.constants import ( - ALLOWED_ACCELERATOR_PARTITION_TYPES, - INSTANCE_TYPE_MIG_PROFILES, -) +from sagemaker.hyperpod.training.constants import INSTANCE_TYPE_MIG_PROFILES import pytest from unittest.mock import patch, MagicMock @@ -93,10 +90,7 @@ def test_validate_accelerator_partition_fields(self, mock_k8s_client, partition_ class TestB300MigProfiles: - """Tests for NVIDIA B300 (Blackwell Ultra) MIG profile support.""" - - def test_b300_in_instance_type_mig_profiles(self): - assert "ml.p6-b300.48xlarge" in INSTANCE_TYPE_MIG_PROFILES + """Tests for B300 (Blackwell Ultra) MIG profile constants and defaults.""" def test_b300_profiles_complete(self): profiles = INSTANCE_TYPE_MIG_PROFILES["ml.p6-b300.48xlarge"] @@ -110,43 +104,24 @@ def test_b300_profiles_complete(self): ] assert profiles == expected - def test_b300_profiles_in_allowed_set(self): - for profile in INSTANCE_TYPE_MIG_PROFILES["ml.p6-b300.48xlarge"]: - assert profile in ALLOWED_ACCELERATOR_PARTITION_TYPES - - @pytest.mark.parametrize( - "partition_type,expected_slices", - [ - ("mig-1g.34gb", 1), - ("mig-2g.67gb", 2), - ("mig-3g.135gb", 3), - ("mig-4g.135gb", 4), - ("mig-7g.269gb", 7), - ], - ) - def test_extract_gpu_slices_b300(self, partition_type, expected_slices): - assert _extract_gpu_slices_from_accelerator_partition_type(partition_type) == expected_slices - @pytest.mark.parametrize( - "partition_type,partition_count", + "partition_type,partition_count,expected_cpu,expected_memory", [ - ("mig-1g.34gb", 7), - ("mig-1g.67gb", 4), - ("mig-2g.67gb", 3), - ("mig-3g.135gb", 2), - ("mig-4g.135gb", 1), - ("mig-7g.269gb", 1), + ("mig-1g.34gb", 7, "24.0", "512.0Gi"), + ("mig-1g.67gb", 4, "13.0", "292.0Gi"), + ("mig-2g.67gb", 3, "20.0", "438.0Gi"), + ("mig-3g.135gb", 2, "20.0", "438.0Gi"), + ("mig-4g.135gb", 1, "13.0", "292.0Gi"), + ("mig-7g.269gb", 1, "24.0", "512.0Gi"), ], ) - def test_accelerator_partition_defaults_b300(self, partition_type, partition_count): - """Verify CPU/memory defaults are calculated proportionally for B300 MIG profiles.""" + def test_accelerator_partition_defaults_b300(self, partition_type, partition_count, expected_cpu, expected_memory): + """Verify CPU/memory defaults match the deterministic ratio formula for B300.""" defaults = _get_accelerator_partition_defaults( "ml.p6-b300.48xlarge", partition_type, partition_count ) - assert "cpu" in defaults - assert "memory" in defaults - assert float(defaults["cpu"]) > 0 - assert float(defaults["memory"].replace("Gi", "")) > 0 + assert defaults["cpu"] == expected_cpu + assert defaults["memory"] == expected_memory @pytest.mark.parametrize( "partition_type,expected_valid,error_check", @@ -162,7 +137,8 @@ def test_validate_b300_partition( self, mock_k8s_client, partition_type, expected_valid, error_check ): mock_node = MagicMock() - mock_node.status.allocatable = {f"nvidia.com/{partition_type}": "1"} + allocatable = {f"nvidia.com/{partition_type}": "1"} if expected_valid else {} + mock_node.status.allocatable = allocatable mock_k8s_client.return_value.get_core_v1_api.return_value.list_node.return_value.items = [ mock_node ] From 6a77b966055fc3cb453f0bcb8700796708c3c97c Mon Sep 17 00:00:00 2001 From: Keita Watanabe Date: Sat, 28 Mar 2026 00:28:06 +0000 Subject: [PATCH 4/6] Merge B300 tests into TestAcceleratorPartitionUtil Eliminate the separate TestB300MigProfiles class. B300 tests now extend the existing parametrized cases in TestAcceleratorPartitionUtil: - B300 valid/invalid profile cases added to test_validate_accelerator_partition_fields - B300 defaults with exact values added to test_accelerator_partition_defaults (instance-type-parametrized) - test_instance_type_profiles_not_empty iterates all instance types in INSTANCE_TYPE_MIG_PROFILES as a data-driven guard This pattern scales to future instance types without adding new test classes. --- .../cli/test_accelerator_partition_util.py | 77 ++++++------------- 1 file changed, 24 insertions(+), 53 deletions(-) diff --git a/test/unit_tests/cli/test_accelerator_partition_util.py b/test/unit_tests/cli/test_accelerator_partition_util.py index 7e8076d7..05b41471 100644 --- a/test/unit_tests/cli/test_accelerator_partition_util.py +++ b/test/unit_tests/cli/test_accelerator_partition_util.py @@ -75,76 +75,47 @@ def test_set_default_accelerator_partition_values(self, input_count, input_limit ("mig-1g.5gb", None, None, 2, "ml.p4d.24xlarge", False, lambda e: "accelerator_partition_type cannot be used together with node_count." == e), # Invalid instance type combination ("mig-1g.5gb", None, None, None, "ml.c5.large", False, lambda e: "does not support accelerator partitions" in e), + # B300: valid profile accepted + ("mig-1g.34gb", None, None, None, "ml.p6-b300.48xlarge", True, lambda e: e == ""), + # B300: cross-architecture profile rejected + ("mig-1g.5gb", None, None, None, "ml.p6-b300.48xlarge", False, lambda e: "not supported on instance type" in e), ] ) @patch('sagemaker.hyperpod.training.accelerator_partition_util.KubernetesClient') def test_validate_accelerator_partition_fields(self, mock_k8s_client, partition_type, accelerators, accelerators_limit, node_count, instance_type, expected_valid, error_check): - # Mock cluster to have no MIG resources for most tests mock_node = MagicMock() - mock_node.status.allocatable = {} + allocatable = {f"nvidia.com/{partition_type}": "1"} if expected_valid and partition_type else {} + mock_node.status.allocatable = allocatable mock_k8s_client.return_value.get_core_v1_api.return_value.list_node.return_value.items = [mock_node] valid, error = _validate_accelerator_partition(partition_type, accelerators, accelerators_limit, node_count, instance_type) assert valid is expected_valid assert error_check(error) - -class TestB300MigProfiles: - """Tests for B300 (Blackwell Ultra) MIG profile constants and defaults.""" - - def test_b300_profiles_complete(self): - profiles = INSTANCE_TYPE_MIG_PROFILES["ml.p6-b300.48xlarge"] - expected = [ - "mig-1g.34gb", - "mig-1g.67gb", - "mig-2g.67gb", - "mig-3g.135gb", - "mig-4g.135gb", - "mig-7g.269gb", - ] - assert profiles == expected + @pytest.mark.parametrize( + "instance_type", + list(INSTANCE_TYPE_MIG_PROFILES.keys()), + ) + def test_instance_type_profiles_not_empty(self, instance_type): + """Every instance type in the MIG mapping must have at least one profile.""" + assert len(INSTANCE_TYPE_MIG_PROFILES[instance_type]) > 0 @pytest.mark.parametrize( - "partition_type,partition_count,expected_cpu,expected_memory", + "instance_type,partition_type,partition_count,expected_cpu,expected_memory", [ - ("mig-1g.34gb", 7, "24.0", "512.0Gi"), - ("mig-1g.67gb", 4, "13.0", "292.0Gi"), - ("mig-2g.67gb", 3, "20.0", "438.0Gi"), - ("mig-3g.135gb", 2, "20.0", "438.0Gi"), - ("mig-4g.135gb", 1, "13.0", "292.0Gi"), - ("mig-7g.269gb", 1, "24.0", "512.0Gi"), + # B300 (Blackwell Ultra) — all profiles at max instance count + ("ml.p6-b300.48xlarge", "mig-1g.34gb", 7, "24.0", "512.0Gi"), + ("ml.p6-b300.48xlarge", "mig-1g.67gb", 4, "13.0", "292.0Gi"), + ("ml.p6-b300.48xlarge", "mig-2g.67gb", 3, "20.0", "438.0Gi"), + ("ml.p6-b300.48xlarge", "mig-3g.135gb", 2, "20.0", "438.0Gi"), + ("ml.p6-b300.48xlarge", "mig-4g.135gb", 1, "13.0", "292.0Gi"), + ("ml.p6-b300.48xlarge", "mig-7g.269gb", 1, "24.0", "512.0Gi"), ], ) - def test_accelerator_partition_defaults_b300(self, partition_type, partition_count, expected_cpu, expected_memory): - """Verify CPU/memory defaults match the deterministic ratio formula for B300.""" + def test_accelerator_partition_defaults(self, instance_type, partition_type, partition_count, expected_cpu, expected_memory): + """Verify CPU/memory defaults match the deterministic ratio formula.""" defaults = _get_accelerator_partition_defaults( - "ml.p6-b300.48xlarge", partition_type, partition_count + instance_type, partition_type, partition_count ) assert defaults["cpu"] == expected_cpu assert defaults["memory"] == expected_memory - - @pytest.mark.parametrize( - "partition_type,expected_valid,error_check", - [ - ("mig-1g.34gb", True, lambda e: e == ""), - ("mig-3g.135gb", True, lambda e: e == ""), - ("mig-7g.269gb", True, lambda e: e == ""), - ("mig-1g.5gb", False, lambda e: "not supported on instance type" in e), - ], - ) - @patch("sagemaker.hyperpod.training.accelerator_partition_util.KubernetesClient") - def test_validate_b300_partition( - self, mock_k8s_client, partition_type, expected_valid, error_check - ): - mock_node = MagicMock() - allocatable = {f"nvidia.com/{partition_type}": "1"} if expected_valid else {} - mock_node.status.allocatable = allocatable - mock_k8s_client.return_value.get_core_v1_api.return_value.list_node.return_value.items = [ - mock_node - ] - - valid, error = _validate_accelerator_partition( - partition_type, None, None, None, "ml.p6-b300.48xlarge" - ) - assert valid is expected_valid - assert error_check(error) From 6b1ab017151ff48ff051db61e9c7a604ba485c8d Mon Sep 17 00:00:00 2001 From: Keita Watanabe Date: Sat, 28 Mar 2026 00:32:51 +0000 Subject: [PATCH 5/6] Add B200 MIG test cases (depends on #399 for ml. prefix fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add B200 (Blackwell) test coverage alongside B300: - 2 validation cases: valid profile accepted, cross-arch rejected - 6 defaults cases with exact CPU/memory values B200 validation tests will fail until #399 merges (fixes the p6-b200.48xlarge → ml.p6-b200.48xlarge key). B200 defaults tests pass immediately since INSTANCE_RESOURCES already uses the ml. key. --- .../unit_tests/cli/test_accelerator_partition_util.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/unit_tests/cli/test_accelerator_partition_util.py b/test/unit_tests/cli/test_accelerator_partition_util.py index 05b41471..9cb73a37 100644 --- a/test/unit_tests/cli/test_accelerator_partition_util.py +++ b/test/unit_tests/cli/test_accelerator_partition_util.py @@ -75,6 +75,10 @@ def test_set_default_accelerator_partition_values(self, input_count, input_limit ("mig-1g.5gb", None, None, 2, "ml.p4d.24xlarge", False, lambda e: "accelerator_partition_type cannot be used together with node_count." == e), # Invalid instance type combination ("mig-1g.5gb", None, None, None, "ml.c5.large", False, lambda e: "does not support accelerator partitions" in e), + # B200: valid profile accepted (requires #399 for ml. prefix fix) + ("mig-1g.23gb", None, None, None, "ml.p6-b200.48xlarge", True, lambda e: e == ""), + # B200: cross-architecture profile rejected + ("mig-1g.5gb", None, None, None, "ml.p6-b200.48xlarge", False, lambda e: "not supported on instance type" in e), # B300: valid profile accepted ("mig-1g.34gb", None, None, None, "ml.p6-b300.48xlarge", True, lambda e: e == ""), # B300: cross-architecture profile rejected @@ -103,6 +107,13 @@ def test_instance_type_profiles_not_empty(self, instance_type): @pytest.mark.parametrize( "instance_type,partition_type,partition_count,expected_cpu,expected_memory", [ + # B200 (Blackwell) — all profiles at max instance count (requires #399 for ml. prefix fix) + ("ml.p6-b200.48xlarge", "mig-1g.23gb", 7, "24.0", "256.0Gi"), + ("ml.p6-b200.48xlarge", "mig-1g.45gb", 4, "13.0", "146.0Gi"), + ("ml.p6-b200.48xlarge", "mig-2g.45gb", 3, "20.0", "219.0Gi"), + ("ml.p6-b200.48xlarge", "mig-3g.90gb", 2, "20.0", "219.0Gi"), + ("ml.p6-b200.48xlarge", "mig-4g.90gb", 1, "13.0", "146.0Gi"), + ("ml.p6-b200.48xlarge", "mig-7g.180gb", 1, "24.0", "256.0Gi"), # B300 (Blackwell Ultra) — all profiles at max instance count ("ml.p6-b300.48xlarge", "mig-1g.34gb", 7, "24.0", "512.0Gi"), ("ml.p6-b300.48xlarge", "mig-1g.67gb", 4, "13.0", "292.0Gi"), From 1b00da5b902e4775a00482c644195a64fc3b691f Mon Sep 17 00:00:00 2001 From: Keita Watanabe Date: Sat, 28 Mar 2026 00:39:14 +0000 Subject: [PATCH 6/6] Cover all MIG-capable instance types in defaults test Replace 12 B200/B300-only rows with 1 representative row per MIG-capable instance type (P4d, P4de, P5, P5e, P5en, B200, B300, GB200, G7e). Each row uses the smallest profile at max instance count, verifying that INSTANCE_RESOURCES has correct cpu/gpu/memory values for the ratio calculation. --- .../cli/test_accelerator_partition_util.py | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/test/unit_tests/cli/test_accelerator_partition_util.py b/test/unit_tests/cli/test_accelerator_partition_util.py index 9cb73a37..296fe65c 100644 --- a/test/unit_tests/cli/test_accelerator_partition_util.py +++ b/test/unit_tests/cli/test_accelerator_partition_util.py @@ -107,24 +107,21 @@ def test_instance_type_profiles_not_empty(self, instance_type): @pytest.mark.parametrize( "instance_type,partition_type,partition_count,expected_cpu,expected_memory", [ - # B200 (Blackwell) — all profiles at max instance count (requires #399 for ml. prefix fix) - ("ml.p6-b200.48xlarge", "mig-1g.23gb", 7, "24.0", "256.0Gi"), - ("ml.p6-b200.48xlarge", "mig-1g.45gb", 4, "13.0", "146.0Gi"), - ("ml.p6-b200.48xlarge", "mig-2g.45gb", 3, "20.0", "219.0Gi"), - ("ml.p6-b200.48xlarge", "mig-3g.90gb", 2, "20.0", "219.0Gi"), - ("ml.p6-b200.48xlarge", "mig-4g.90gb", 1, "13.0", "146.0Gi"), - ("ml.p6-b200.48xlarge", "mig-7g.180gb", 1, "24.0", "256.0Gi"), - # B300 (Blackwell Ultra) — all profiles at max instance count + # One representative profile per MIG-capable instance type (smallest profile, max count). + # Guards that INSTANCE_RESOURCES has correct cpu/gpu/memory for each instance type. + ("ml.p4d.24xlarge", "mig-1g.5gb", 7, "12.0", "144.0Gi"), + ("ml.p4de.24xlarge", "mig-1g.10gb", 7, "12.0", "144.0Gi"), + ("ml.p5.48xlarge", "mig-1g.10gb", 7, "24.0", "256.0Gi"), + ("ml.p5e.48xlarge", "mig-1g.18gb", 7, "24.0", "256.0Gi"), + ("ml.p5en.48xlarge", "mig-1g.18gb", 7, "24.0", "256.0Gi"), + ("ml.p6-b200.48xlarge", "mig-1g.23gb", 7, "24.0", "256.0Gi"), # requires #399 ("ml.p6-b300.48xlarge", "mig-1g.34gb", 7, "24.0", "512.0Gi"), - ("ml.p6-b300.48xlarge", "mig-1g.67gb", 4, "13.0", "292.0Gi"), - ("ml.p6-b300.48xlarge", "mig-2g.67gb", 3, "20.0", "438.0Gi"), - ("ml.p6-b300.48xlarge", "mig-3g.135gb", 2, "20.0", "438.0Gi"), - ("ml.p6-b300.48xlarge", "mig-4g.135gb", 1, "13.0", "292.0Gi"), - ("ml.p6-b300.48xlarge", "mig-7g.269gb", 1, "24.0", "512.0Gi"), + ("ml.p6e-gb200.36xlarge", "mig-1g.23gb", 7, "36.0", "240.0Gi"), + ("ml.g7e.48xlarge", "mig-1g.24gb", 4, "13.0", "146.0Gi"), ], ) def test_accelerator_partition_defaults(self, instance_type, partition_type, partition_count, expected_cpu, expected_memory): - """Verify CPU/memory defaults match the deterministic ratio formula.""" + """Verify CPU/memory defaults for one profile per MIG-capable instance type.""" defaults = _get_accelerator_partition_defaults( instance_type, partition_type, partition_count )