From c98fd6e41b2d14e2ccc723f44d47846f25a177b8 Mon Sep 17 00:00:00 2001 From: Keita Watanabe Date: Fri, 27 Mar 2026 12:20:53 +0000 Subject: [PATCH] Add MIG profile support for ml.p6-b300.48xlarge (Blackwell Ultra) Add ml.p6-b300.48xlarge to INSTANCE_TYPE_MIG_PROFILES in constants.py with the correct B300 MIG profiles derived from the NVIDIA GPU Operator v25.3.0 upstream ConfigMap (device-filter 0x318210DE): - mig-1g.34gb, mig-1g.67gb, mig-2g.67gb - mig-3g.135gb, mig-4g.135gb, mig-7g.269gb Also add the corresponding uniform and mixed MIG partition profiles to the Helm chart default-mig-config.yaml ConfigMap, following the same pattern used for existing GPU types (H100, H200, B200). The B300 GPU (288GB HBM3e, ~269GB usable) was already registered in INSTANCE_RESOURCES but had no MIG profile mapping, causing HyperPod MIG validation to reject accelerator partition requests on this instance type. --- .../config/default-mig-config.yaml | 106 ++++++++++++++++++ src/sagemaker/hyperpod/training/constants.py | 1 + 2 files changed, 107 insertions(+) diff --git a/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml b/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml index 8f4943d7..f18523ed 100644 --- a/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml +++ b/helm_chart/HyperPodHelmChart/charts/gpu-operator/config/default-mig-config.yaml @@ -341,12 +341,90 @@ mig-configs: "1g.24gb": 2 "2g.48gb": 1 + # P6-B300 (Blackwell Ultra, 288GB HBM3e, ~269GB usable) profiles + # Profiles: 1g.34gb (x7), 1g.67gb (x4), 2g.67gb (x3), 3g.135gb (x2), 4g.135gb (x1), 7g.269gb (x1) + # Upstream ref: NVIDIA GPU Operator v25.3.0, device-filter 0x318210DE + + all-1g.34gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 7 + all-1g.67gb: - devices: all mig-enabled: true mig-devices: "1g.67gb": 4 + all-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "2g.67gb": 3 + + all-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "3g.135gb": 2 + + all-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "4g.135gb": 1 + + all-7g.269gb: + - devices: all + mig-enabled: true + mig-devices: + "7g.269gb": 1 + + mixed-1-3g.135gb-1-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "3g.135gb": 1 + "4g.135gb": 1 + + mixed-1-1g.34gb-1-2g.67gb-1-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 1 + "2g.67gb": 1 + "4g.135gb": 1 + + mixed-3-1g.34gb-1-4g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 3 + "4g.135gb": 1 + + mixed-1-1g.34gb-1-2g.67gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 1 + "2g.67gb": 1 + "3g.135gb": 1 + + mixed-3-1g.34gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 3 + "3g.135gb": 1 + + mixed-2-2g.67gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "2g.67gb": 2 + "3g.135gb": 1 + mixed-2-1g.34gb-1-2g.67gb-1-3g.135gb: - devices: all mig-enabled: true @@ -354,3 +432,31 @@ mig-configs: "1g.34gb": 2 "2g.67gb": 1 "3g.135gb": 1 + + mixed-4-1g.34gb-1-3g.135gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 4 + "3g.135gb": 1 + + mixed-1-1g.34gb-3-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 1 + "2g.67gb": 3 + + mixed-3-1g.34gb-2-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 3 + "2g.67gb": 2 + + mixed-5-1g.34gb-1-2g.67gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.34gb": 5 + "2g.67gb": 1 diff --git a/src/sagemaker/hyperpod/training/constants.py b/src/sagemaker/hyperpod/training/constants.py index 29f58fa8..59230365 100644 --- a/src/sagemaker/hyperpod/training/constants.py +++ b/src/sagemaker/hyperpod/training/constants.py @@ -132,6 +132,7 @@ 'ml.p5e.48xlarge': ['mig-1g.18gb', 'mig-1g.35gb', 'mig-2g.35gb', 'mig-3g.71gb', 'mig-4g.71gb', 'mig-7g.141gb'], 'ml.p5en.48xlarge': ['mig-1g.18gb', 'mig-1g.35gb', 'mig-2g.35gb', 'mig-3g.71gb', 'mig-4g.71gb', 'mig-7g.141gb'], 'p6-b200.48xlarge': ['mig-1g.23gb', 'mig-1g.45gb', 'mig-2g.45gb', 'mig-3g.90gb', 'mig-4g.90gb', 'mig-7g.180gb'], + 'ml.p6-b300.48xlarge': ['mig-1g.34gb', 'mig-1g.67gb', 'mig-2g.67gb', 'mig-3g.135gb', 'mig-4g.135gb', 'mig-7g.269gb'], 'ml.p6e-gb200.36xlarge': ['mig-1g.23gb', 'mig-1g.47gb', 'mig-2g.47gb', 'mig-3g.93gb', 'mig-4g.93gb', 'mig-7g.186gb'], 'ml.g7e.2xlarge': ['mig-1g.24gb', 'mig-2g.48gb', 'mig-4g.96gb'], 'ml.g7e.4xlarge': ['mig-1g.24gb', 'mig-2g.48gb', 'mig-4g.96gb'],