From 1bcce28760bc7a1df2354e29a89ce6f8a1a97dd3 Mon Sep 17 00:00:00 2001
From: Sophia Huang
Date: Wed, 3 Dec 2025 11:53:26 -0800
Subject: [PATCH 1/6] Update documentation for elastic training arguments

---
 README.md                        | 6 ++++++
 doc/cli/training/cli_training.md | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/README.md b/README.md
index 81c0aabf..368a4bc6 100644
--- a/README.md
+++ b/README.md
@@ -364,6 +364,12 @@ hyp create hyp-pytorch-job \
 | `--accelerator-partition-limit` | INTEGER | No | Limit for the number of accelerator partitions (minimum: 1) |
 | `--preferred-topology` | TEXT | No | Preferred topology annotation for scheduling |
 | `--required-topology` | TEXT | No | Required topology annotation for scheduling |
+| `--max-node-count` | INTEGER | No | Maximum number of nodes |
+| `--elastic-replica-increment-step` | INTEGER | No | Scaling step size for elastic training. Provide either this or `--elastic-replica-discrete-values` |
+| `--elastic-graceful-shutdown-timeout-in-seconds` | INTEGER | No | Graceful shutdown timeout in seconds for elastic scaling operations |
+| `--elastic-scaling-timeout-in-seconds` | INTEGER | No | Scaling timeout in seconds for elastic training |
+| `--elastic-scale-up-snooze-time-in-seconds` | INTEGER | No | Period after a job restart during which no scale-up or workload admission is allowed |
+| `--elastic-replica-discrete-values` | ARRAY | No | Alternative to `--elastic-replica-increment-step`. Provides exact values for the total replica count |
 | `--debug` | FLAG | No | Enable debug mode (default: false) |
 
 #### List Available Accelerator Partition Types
diff --git a/doc/cli/training/cli_training.md b/doc/cli/training/cli_training.md
index 905ec54b..087ebcba 100644
--- a/doc/cli/training/cli_training.md
+++ b/doc/cli/training/cli_training.md
@@ -206,6 +206,12 @@ hyp create hyp-pytorch-job [OPTIONS]
 | `--memory-limit` | FLOAT | No | Limit for the amount of memory in GiB |
 | `--preferred-topology` | TEXT | No | Preferred topology annotation for scheduling |
 | `--required-topology` | TEXT | No | Required topology annotation for scheduling |
+| `--max-node-count` | INTEGER | No | Maximum number of nodes |
+| `--elastic-replica-increment-step` | INTEGER | No | Scaling step size for elastic training. Provide either this or `--elastic-replica-discrete-values` |
+| `--elastic-graceful-shutdown-timeout-in-seconds` | INTEGER | No | Graceful shutdown timeout in seconds for elastic scaling operations |
+| `--elastic-scaling-timeout-in-seconds` | INTEGER | No | Scaling timeout in seconds for elastic training |
+| `--elastic-scale-up-snooze-time-in-seconds` | INTEGER | No | Period after a job restart during which no scale-up or workload admission is allowed |
+| `--elastic-replica-discrete-values` | ARRAY | No | Alternative to `--elastic-replica-increment-step`. Provides exact values for the total replica count |
 | `--debug` | FLAG | No | Enable debug mode (default: false) |
 
 ### Volume Configuration

From 2ef916fa6d82ee655d67e1cb94a9a427c00ac5a4 Mon Sep 17 00:00:00 2001
From: Sophia Huang
Date: Wed, 3 Dec 2025 12:11:04 -0800
Subject: [PATCH 2/6] nit: Add detail descriptions for array type

---
 README.md                        | 2 +-
 doc/cli/training/cli_training.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 368a4bc6..d9dc6ea8 100644
--- a/README.md
+++ b/README.md
@@ -369,7 +369,7 @@ hyp create hyp-pytorch-job \
 | `--elastic-graceful-shutdown-timeout-in-seconds` | INTEGER | No | Graceful shutdown timeout in seconds for elastic scaling operations |
 | `--elastic-scaling-timeout-in-seconds` | INTEGER | No | Scaling timeout in seconds for elastic training |
 | `--elastic-scale-up-snooze-time-in-seconds` | INTEGER | No | Period after a job restart during which no scale-up or workload admission is allowed |
-| `--elastic-replica-discrete-values` | ARRAY | No | Alternative to `--elastic-replica-increment-step`. Provides exact values for the total replica count |
+| `--elastic-replica-discrete-values` | ARRAY | No | Alternative to `--elastic-replica-increment-step`. Provides exact values for the total replica count (array of integers) |
 | `--debug` | FLAG | No | Enable debug mode (default: false) |
 
 #### List Available Accelerator Partition Types
diff --git a/doc/cli/training/cli_training.md b/doc/cli/training/cli_training.md
index 087ebcba..8f5bffe1 100644
--- a/doc/cli/training/cli_training.md
+++ b/doc/cli/training/cli_training.md
@@ -211,7 +211,7 @@ hyp create hyp-pytorch-job [OPTIONS]
 | `--elastic-graceful-shutdown-timeout-in-seconds` | INTEGER | No | Graceful shutdown timeout in seconds for elastic scaling operations |
 | `--elastic-scaling-timeout-in-seconds` | INTEGER | No | Scaling timeout in seconds for elastic training |
 | `--elastic-scale-up-snooze-time-in-seconds` | INTEGER | No | Period after a job restart during which no scale-up or workload admission is allowed |
-| `--elastic-replica-discrete-values` | ARRAY | No | Alternative to `--elastic-replica-increment-step`. Provides exact values for the total replica count |
+| `--elastic-replica-discrete-values` | ARRAY | No | Alternative to `--elastic-replica-increment-step`. Provides exact values for the total replica count (array of integers) |
 | `--debug` | FLAG | No | Enable debug mode (default: false) |
 
 ### Volume Configuration
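For quick reference, the new flags compose with the existing job options. A hypothetical invocation might look like the following (a sketch only: `--job-name`, `--image`, `--instance-type`, and `--node-count` are assumed from the surrounding CLI docs, and all values are illustrative):

    hyp create hyp-pytorch-job \
      --job-name elastic-demo \
      --image pytorch:latest \
      --instance-type ml.p4d.24xlarge \
      --node-count 2 \
      --max-node-count 8 \
      --elastic-replica-increment-step 2 \
      --elastic-graceful-shutdown-timeout-in-seconds 60 \
      --elastic-scaling-timeout-in-seconds 300 \
      --elastic-scale-up-snooze-time-in-seconds 120

A job that must run at specific sizes (say 2, 4, or 8 replicas) would pass `--elastic-replica-discrete-values` in place of `--elastic-replica-increment-step`; per the table above, the two are alternatives, not combinable.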
+ "efa_limit": { + "type": "integer", + "minimum": 0, + "description": "Limit for the number of EFA interfaces" + }, "accelerator_partition_type": { "type": "string", "enum": [ diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py index 1a61f6df..63278d2c 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py @@ -97,14 +97,14 @@ {%- if memory %} memory: {{ memory }}Gi {%- endif %} -{%- if (node_count and node_count > 1) %} - vpc.amazonaws.com/efa: 1 +{%- if efa and efa > 0 %} + vpc.amazonaws.com/efa: {{ efa }} {%- endif %} {%- else %} requests: nvidia.com/gpu: "0" {%- endif %} -{%- if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit %} +{%- if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit or efa_limit%} limits: {%- if accelerator_partition_type and accelerator_partition_limit %} nvidia.com/{{ accelerator_partition_type }}: {{ accelerator_partition_limit }} @@ -117,8 +117,8 @@ {%- if memory_limit %} memory: {{ memory_limit }}Gi {%- endif %} -{%- if (node_count and node_count > 1) %} - vpc.amazonaws.com/efa: 1 +{%- if efa_limit and efa_limit > 0 %} + vpc.amazonaws.com/efa: {{ efa_limit }} {%- endif %} {%- else %} limits: diff --git a/src/sagemaker/hyperpod/training/constants.py b/src/sagemaker/hyperpod/training/constants.py index 32fdc8a2..3fa55217 100644 --- a/src/sagemaker/hyperpod/training/constants.py +++ b/src/sagemaker/hyperpod/training/constants.py @@ -1,119 +1,119 @@ # TODO: currently there is no API for instances and they are hardcoded; post GA work with partner team on adding support for such API INSTANCE_RESOURCES = { - "ml.p4d.24xlarge": {"cpu": 96, "gpu": 8, "trainium": 0, "memory": 1152}, - "ml.p4de.24xlarge": {"cpu": 96, "gpu": 8, "trainium": 0, "memory": 1152}, - "ml.p5.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048}, - "ml.p5.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 256}, - "ml.trn1.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 16, "memory": 512}, - "ml.trn1n.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 16, "memory": 512}, - "ml.g5.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 16}, - "ml.g5.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 32}, - "ml.g5.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 64}, - "ml.g5.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 128}, - "ml.g5.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 192}, - "ml.g5.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 256}, - "ml.g5.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 384}, - "ml.g5.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 768}, - "ml.g6.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 16}, - "ml.g6.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 32}, - "ml.g6.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 64}, - "ml.g6.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 128}, - "ml.g6.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 256}, - "ml.g6.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 192}, - "ml.g6.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 384}, - "ml.g6.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 768}, - "ml.gr6.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 128}, - "ml.gr6.8xlarge": {"cpu": 
32, "gpu": 1, "trainium": 0, "memory": 256}, - "ml.g6e.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 32}, - "ml.g6e.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 64}, - "ml.g6e.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 128}, - "ml.g6e.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 256}, - "ml.g6e.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 512}, - "ml.g6e.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 384}, - "ml.g6e.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 768}, - "ml.g6e.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 1536}, - "ml.p5e.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048}, - "ml.p5en.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048}, - "ml.trn2.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 16, "memory": 2048}, - "ml.p6e-gb200.36xlarge": {"cpu": 144, "gpu": 4, "trainium": 0, "memory": 960}, - "ml.p6-b200.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2024}, - "ml.c5.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4}, - "ml.c5.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 8}, - "ml.c5.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 16}, - "ml.c5.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 32}, - "ml.c5.9xlarge": {"cpu": 36, "gpu": 0, "trainium": 0, "memory": 72}, - "ml.c5.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 96}, - "ml.c5.18xlarge": {"cpu": 72, "gpu": 0, "trainium": 0, "memory": 144}, - "ml.c5.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 192}, - "ml.c5n.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 5}, - "ml.c5n.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 21}, - "ml.c5n.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 42}, - "ml.c5n.9xlarge": {"cpu": 36, "gpu": 0, "trainium": 0, "memory": 96}, - "ml.c5n.18xlarge": {"cpu": 72, "gpu": 0, "trainium": 0, "memory": 192}, - "ml.m5.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8}, - "ml.m5.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16}, - "ml.m5.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32}, - "ml.m5.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64}, - "ml.m5.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128}, - "ml.m5.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192}, - "ml.m5.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256}, - "ml.m5.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384}, - "ml.t3.medium": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4}, - "ml.t3.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8}, - "ml.t3.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16}, - "ml.t3.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32}, - "ml.c6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4}, - "ml.c6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 8}, - "ml.c6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 16}, - "ml.c6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 32}, - "ml.c6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 64}, - "ml.c6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 96}, - "ml.c6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 128}, - "ml.c6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 192}, - "ml.c6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 256}, - "ml.m6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8}, - "ml.m6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 
0, "memory": 16}, - "ml.m6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32}, - "ml.m6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64}, - "ml.m6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128}, - "ml.m6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192}, - "ml.m6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256}, - "ml.m6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384}, - "ml.m6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 512}, - "ml.r6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16}, - "ml.r6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32}, - "ml.r6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64}, - "ml.r6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 128}, - "ml.r6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 256}, - "ml.r6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384}, - "ml.r6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 512}, - "ml.r6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768}, - "ml.r6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 1024}, - "ml.m7i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8}, - "ml.m7i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16}, - "ml.m7i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32}, - "ml.m7i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64}, - "ml.m7i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128}, - "ml.m7i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192}, - "ml.m7i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256}, - "ml.m7i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384}, - "ml.m7i.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 0, "memory": 768}, - "ml.r7i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16}, - "ml.r7i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32}, - "ml.r7i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64}, - "ml.r7i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 128}, - "ml.r7i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 256}, - "ml.r7i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384}, - "ml.r7i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 512}, - "ml.r7i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768}, - "ml.r7i.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 0, "memory": 1536}, - "ml.i3en.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16}, - "ml.i3en.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32}, - "ml.i3en.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64}, - "ml.i3en.3xlarge": {"cpu": 12, "gpu": 0, "trainium": 0, "memory": 96}, - "ml.i3en.6xlarge": {"cpu": 24, "gpu": 0, "trainium": 0, "memory": 192}, - "ml.i3en.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384}, - "ml.i3en.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768} + "ml.p4d.24xlarge": {"cpu": 96, "gpu": 8, "trainium": 0, "memory": 1152, "efa": 4}, + "ml.p4de.24xlarge": {"cpu": 96, "gpu": 8, "trainium": 0, "memory": 1152, "efa": 4}, + "ml.p5.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048, "efa": 32}, + "ml.p5.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 256, "efa": 1}, + "ml.trn1.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 16, "memory": 512, "efa": 8}, + "ml.trn1n.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 16, "memory": 512, "efa": 16}, + "ml.g5.xlarge": {"cpu": 4, "gpu": 1, 
"trainium": 0, "memory": 16, "efa": 0}, + "ml.g5.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 32, "efa": 0}, + "ml.g5.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 64, "efa": 0}, + "ml.g5.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 128, "efa": 1}, + "ml.g5.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 192, "efa": 1}, + "ml.g5.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 256, "efa": 1}, + "ml.g5.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 384, "efa": 1}, + "ml.g5.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 768, "efa": 1}, + "ml.g6.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 16, "efa": 0}, + "ml.g6.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 32, "efa": 0}, + "ml.g6.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 64, "efa": 0}, + "ml.g6.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 128, "efa": 1}, + "ml.g6.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 256, "efa": 1}, + "ml.g6.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 192, "efa": 1}, + "ml.g6.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 384, "efa": 1}, + "ml.g6.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 768, "efa": 1}, + "ml.gr6.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 128, "efa": 0}, + "ml.gr6.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 256, "efa": 1}, + "ml.g6e.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 32, "efa": 0}, + "ml.g6e.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 64, "efa": 0}, + "ml.g6e.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 128, "efa": 0}, + "ml.g6e.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 256, "efa": 1}, + "ml.g6e.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 512, "efa": 1}, + "ml.g6e.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 384, "efa": 1}, + "ml.g6e.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 768, "efa": 2}, + "ml.g6e.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 1536, "efa": 4}, + "ml.p5e.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048, "efa": 32}, + "ml.p5en.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048, "efa": 16}, + "ml.trn2.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 16, "memory": 2048, "efa": 0}, + "ml.p6e-gb200.36xlarge": {"cpu": 144, "gpu": 4, "trainium": 0, "memory": 960, "efa": 0}, + "ml.p6-b200.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2024, "efa": 8}, + "ml.c5.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4, "efa": 0}, + "ml.c5.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0}, + "ml.c5.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0}, + "ml.c5.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0}, + "ml.c5.9xlarge": {"cpu": 36, "gpu": 0, "trainium": 0, "memory": 72, "efa": 0}, + "ml.c5.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 96, "efa": 0}, + "ml.c5.18xlarge": {"cpu": 72, "gpu": 0, "trainium": 0, "memory": 144, "efa": 0}, + "ml.c5.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0}, + "ml.c5n.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 5, "efa": 0}, + "ml.c5n.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 21, "efa": 0}, + "ml.c5n.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 42, "efa": 0}, + "ml.c5n.9xlarge": {"cpu": 36, "gpu": 0, "trainium": 0, "memory": 96, "efa": 1}, + "ml.c5n.18xlarge": {"cpu": 72, "gpu": 
0, "trainium": 0, "memory": 192, "efa": 1}, + "ml.m5.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0}, + "ml.m5.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0}, + "ml.m5.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0}, + "ml.m5.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0}, + "ml.m5.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0}, + "ml.m5.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0}, + "ml.m5.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256, "efa": 0}, + "ml.m5.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384, "efa": 0}, + "ml.t3.medium": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4, "efa": 0}, + "ml.t3.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0}, + "ml.t3.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0}, + "ml.t3.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0}, + "ml.c6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4, "efa": 0}, + "ml.c6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0}, + "ml.c6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0}, + "ml.c6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0}, + "ml.c6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0}, + "ml.c6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 96, "efa": 0}, + "ml.c6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0}, + "ml.c6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0}, + "ml.c6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 256, "efa": 1}, + "ml.m6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0}, + "ml.m6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0}, + "ml.m6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0}, + "ml.m6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0}, + "ml.m6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0}, + "ml.m6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0}, + "ml.m6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256, "efa": 0}, + "ml.m6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384, "efa": 0}, + "ml.m6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 512, "efa": 1}, + "ml.r6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0}, + "ml.r6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0}, + "ml.r6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0}, + "ml.r6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0}, + "ml.r6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 256, "efa": 0}, + "ml.r6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384, "efa": 0}, + "ml.r6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 512, "efa": 0}, + "ml.r6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768, "efa": 0}, + "ml.r6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 1024, "efa": 1}, + "ml.m7i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0}, + "ml.m7i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0}, + "ml.m7i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0}, + "ml.m7i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0}, + 
"ml.m7i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0}, + "ml.m7i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0}, + "ml.m7i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256, "efa": 0}, + "ml.m7i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384, "efa": 0}, + "ml.m7i.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 0, "memory": 768, "efa": 1}, + "ml.r7i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0}, + "ml.r7i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0}, + "ml.r7i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0}, + "ml.r7i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0}, + "ml.r7i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 256, "efa": 0}, + "ml.r7i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384, "efa": 0}, + "ml.r7i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 512, "efa": 0}, + "ml.r7i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768, "efa": 0}, + "ml.r7i.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 0, "memory": 1536, "efa": 1}, + "ml.i3en.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0}, + "ml.i3en.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0}, + "ml.i3en.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0}, + "ml.i3en.3xlarge": {"cpu": 12, "gpu": 0, "trainium": 0, "memory": 96, "efa": 0}, + "ml.i3en.6xlarge": {"cpu": 24, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0}, + "ml.i3en.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384, "efa": 1}, + "ml.i3en.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768, "efa": 1} } # MIG profiles by instance type diff --git a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py index c4d548be..cd9074c2 100644 --- a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py +++ b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py @@ -29,6 +29,7 @@ _resolve_default_memory_values, _set_default_accelerators_val, _validate_accelerators_inputs, + _validate_efa_inputs, _resolve_default_cpu_values, _trim_resource_requests, ) @@ -146,6 +147,16 @@ def _process_replica_resources(cls, data): acc_req, acc_lim = _set_default_accelerators_val(instance_type, accelerators, accelerators_limit) _validate_accelerators_inputs(instance_type, acc_req, acc_lim) + efa = None + if requests.get('vpc.amazonaws.com/efa'): + efa = int(requests.get('vpc.amazonaws.com/efa')) + + efa_limit = None + if limits.get('vpc.amazonaws.com/efa'): + efa_limit = int(limits.get('vpc.amazonaws.com/efa')) + + _validate_efa_inputs(instance_type, efa, efa_limit) + accelerator_partition_type, accelerator_partition_count, accelerator_partition_limit = ( _get_accelerator_partition(requests, limits) ) @@ -158,8 +169,7 @@ def _process_replica_resources(cls, data): acc_partition_req, acc_partition_lim = _set_default_accelerator_partition_val(accelerator_partition_count, accelerator_partition_limit) - # Calculate resource values - requests_values = _get_resources_from_compute_quotas(instance_type, vcpu, memory, acc_req, accelerator_partition_type, acc_partition_req) + requests_values = _get_resources_from_compute_quotas(instance_type, vcpu, memory, acc_req, accelerator_partition_type, acc_partition_req, efa) if requests_values is None: requests_values = _get_resources_from_instance(instance_type, node_count=1) _trim_resource_requests(instance_type, 
requests_values) @@ -168,7 +178,14 @@ def _process_replica_resources(cls, data): elif NEURON_RESOURCE_KEY in requests_values: acc_lim = requests_values[NEURON_RESOURCE_KEY] - limits_values = _get_limits(instance_type, vcpu_limit, memory_limit, acc_lim, accelerator_partition_type, acc_partition_lim) + if efa is not None: + requests_values["vpc.amazonaws.com/efa"] = efa + + efa_lim = requests_values.get("vpc.amazonaws.com/efa") + if efa_lim is not None: + efa_lim = int(efa_lim) + + limits_values = _get_limits(instance_type, vcpu_limit, memory_limit, acc_lim, accelerator_partition_type, acc_partition_lim, efa_lim) _resolve_default_memory_values(instance_type, requests_values, limits_values) _resolve_default_cpu_values(instance_type, requests_values) diff --git a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py index 291bf3c2..ff09c125 100644 --- a/src/sagemaker/hyperpod/training/quota_allocation_util.py +++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py @@ -33,7 +33,8 @@ def _get_resources_from_compute_quotas(instance_type: str, memory_in_gib: Optional[float], accelerators: Optional[int] = 0, accelerator_partition_type: Optional[str] = None, - accelerator_partition_count: Optional[int] = None) -> Optional[dict]: + accelerator_partition_count: Optional[int] = None, + efa: Optional[int] = None) -> Optional[dict]: has_accelerator_partition = accelerator_partition_type is not None and accelerator_partition_count is not None has_compute_resources = _has_compute_resource_quota_allocation_resources(memory_in_gib, vcpu, accelerators) @@ -71,6 +72,13 @@ def _get_resources_from_compute_quotas(instance_type: str, memory_value = memory_in_gib or (gpu_ratio * instance.get("memory", 0)) result["memory"] = memory_value result[type_of_accelerator] = accelerators + + if efa is not None: + result["vpc.amazonaws.com/efa"] = efa + else: + efa_count = instance.get("efa", 0) + if efa_count > 0: + result["vpc.amazonaws.com/efa"] = efa_count else: result["cpu"] = vcpu or 0 @@ -99,6 +107,10 @@ def _get_resources_from_instance(instance_type: str, node_count: int) -> dict: if type_of_accelerator is not None: result[type_of_accelerator] = max_accelerator_per_instance * node_count + efa_count = instance.get("efa", 0) + if efa_count > 0: + result["vpc.amazonaws.com/efa"] = efa_count + result["cpu"] = f"{result['cpu']}" result["memory"] = f"{result['memory']}Gi" return result @@ -126,7 +138,7 @@ def _trim_resource_requests(instance_type: str, requests_values: dict) -> dict: return requests_values -def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_limit: Optional[float], accelerators_limit: Optional[int], accelerator_partition_type: Optional[str], accelerator_partition_limit: Optional[int]) -> dict: +def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_limit: Optional[float], accelerators_limit: Optional[int], accelerator_partition_type: Optional[str], accelerator_partition_limit: Optional[int], efa_limit: Optional[int] = None) -> dict: result = {} type_of_accelerator, _max_accelerator_per_instance = _get_accelerator_type_and_count(instance_type) @@ -145,6 +157,9 @@ def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_l if memory_in_gib_limit is not None: result["memory"] = str(memory_in_gib_limit) + "Gi" + if efa_limit is not None and efa_limit > 0: + result["vpc.amazonaws.com/efa"] = efa_limit + return result @@ -214,6 +229,31 @@ def 
_validate_accelerators_inputs(instance_type: str, accelerators_request: int, raise ValueError('Requested accelerators exceeds capacity') +def _validate_efa_inputs(instance_type: str, efa_request: Optional[int], efa_limit: Optional[int]) -> None: + """Validate EFA inputs similar to accelerator validation.""" + instance = INSTANCE_RESOURCES.get(instance_type, {}) + max_efa_per_instance = instance.get("efa", 0) + + # Check if user provided EFA values but instance doesn't support EFA + if max_efa_per_instance == 0 and (efa_request is not None or efa_limit is not None): + raise ValueError( + f"Instance type {instance_type} does not support EFA, but EFA values were provided.") + + # Validate EFA values if instance supports EFA + if max_efa_per_instance > 0: + if efa_request is not None and efa_limit is not None: + if efa_request != efa_limit: + raise ValueError('EFA request must equal EFA limit') + if efa_limit > max_efa_per_instance: + raise ValueError(f'Requested EFA limit ({efa_limit}) exceeds instance capacity ({max_efa_per_instance})') + if efa_request > max_efa_per_instance: + raise ValueError(f'Requested EFA ({efa_request}) exceeds instance capacity ({max_efa_per_instance})') + elif efa_request is not None and efa_request > max_efa_per_instance: + raise ValueError(f'Requested EFA ({efa_request}) exceeds instance capacity ({max_efa_per_instance})') + elif efa_limit is not None and efa_limit > max_efa_per_instance: + raise ValueError(f'Requested EFA limit ({efa_limit}) exceeds instance capacity ({max_efa_per_instance})') + + def _set_default_accelerators_val(instance_type: Optional[str], accelerators_request: Optional[int], accelerators_limit: Optional[int]) -> Tuple[Optional[int], Optional[int]]: type_of_accelerator, _max_accelerator_per_instance = _get_accelerator_type_and_count(instance_type) if type_of_accelerator is not None: From 98e7703c2f8d06219d804446fadfc226e93f679c Mon Sep 17 00:00:00 2001 From: Sophia Huang Date: Tue, 9 Dec 2025 15:53:44 -0800 Subject: [PATCH 4/6] address comment and add unit test for efa support --- .../v1_1/model.py | 8 +-- .../cli/constants/command_constants.py | 1 + .../hyperpod/training/hyperpod_pytorch_job.py | 22 ++++--- .../training/quota_allocation_util.py | 9 +-- .../test_pytorch_job_template_model.py | 63 ++++++++++++++++--- 5 files changed, 76 insertions(+), 27 deletions(-) diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py index 5fa71dc3..348f3f6d 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py @@ -464,26 +464,26 @@ def build_dict(**kwargs): **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {}, vcpu=str(self.vcpu) if self.vcpu else None, memory=str(self.memory) if self.memory else None, - **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {} + **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}, ) limits_value = build_dict( **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {}, vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, memory=str(self.memory_limit) if self.memory_limit else None, - **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {} + **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {}, ) else: requests_value = build_dict( 
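A minimal sketch of how the new helpers fit together (illustrative only: these are internal underscore-prefixed functions, called positionally to match the signatures in the diff above, and the expected output follows from the GPU-ratio and EFA-defaulting logic):

    from sagemaker.hyperpod.training.quota_allocation_util import (
        _get_resources_from_compute_quotas,
        _validate_efa_inputs,
    )

    # ml.p4d.24xlarge supports at most 4 EFA interfaces per INSTANCE_RESOURCES.
    _validate_efa_inputs("ml.p4d.24xlarge", 2, 2)   # passes: request == limit <= 4
    # _validate_efa_inputs("ml.c5.large", 1, 1)     # would raise: instance has no EFA

    # Requesting 4 of the instance's 8 GPUs allocates half the CPU/memory;
    # the explicit efa=2 overrides the instance default of 4.
    requests = _get_resources_from_compute_quotas(
        "ml.p4d.24xlarge",  # instance_type
        None,               # vcpu
        None,               # memory_in_gib
        4,                  # accelerators
        None, None,         # accelerator partition type / count
        2,                  # efa
    )
    # Expected shape: {"cpu": "48.0", "memory": "576.0Gi",
    #                  "nvidia.com/gpu": 4, "vpc.amazonaws.com/efa": 2}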
From 98e7703c2f8d06219d804446fadfc226e93f679c Mon Sep 17 00:00:00 2001
From: Sophia Huang
Date: Tue, 9 Dec 2025 15:53:44 -0800
Subject: [PATCH 4/6] address comment and add unit test for efa support

---
 .../v1_1/model.py                             |  8 +--
 .../cli/constants/command_constants.py        |  1 +
 .../hyperpod/training/hyperpod_pytorch_job.py | 22 ++++---
 .../training/quota_allocation_util.py         |  9 +--
 .../test_pytorch_job_template_model.py        | 63 ++++++++++++++++---
 5 files changed, 76 insertions(+), 27 deletions(-)

diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
index 5fa71dc3..348f3f6d 100644
--- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
+++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
@@ -464,26 +464,26 @@ def build_dict(**kwargs):
             requests_value = build_dict(
                 **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
                 vcpu=str(self.vcpu) if self.vcpu else None,
                 memory=str(self.memory) if self.memory else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}
+                **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
             )
             limits_value = build_dict(
                 **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
                 vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
                 memory=str(self.memory_limit) if self.memory_limit else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {}
+                **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
             )
         else:
             requests_value = build_dict(
                 accelerators=str(self.accelerators) if self.accelerators else None,
                 vcpu=str(self.vcpu) if self.vcpu else None,
                 memory=str(self.memory) if self.memory else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}
+                **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
            )
             limits_value = build_dict(
                 accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
                 vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
                 memory=str(self.memory_limit) if self.memory_limit else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {}
+                **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
             )
 
     # Build container
diff --git a/src/sagemaker/hyperpod/cli/constants/command_constants.py b/src/sagemaker/hyperpod/cli/constants/command_constants.py
index 3fc96606..a944fb5f 100644
--- a/src/sagemaker/hyperpod/cli/constants/command_constants.py
+++ b/src/sagemaker/hyperpod/cli/constants/command_constants.py
@@ -45,6 +45,7 @@
 SAGEMAKER_TRAINING_LAUNCHER_DIR = str(Path(__file__).parent.parent / "sagemaker_hyperpod_recipes")
 NVIDIA_GPU_RESOURCE_LIMIT_KEY = "nvidia.com/gpu"
 NEURON_RESOURCE_LIMIT_KEY = "aws.amazon.com/neurondevice"
+EFA_RESOURCE_LIMIT_KEY = "vpc.amazonaws.com/efa"
 AVAILABLE_ACCELERATOR_DEVICES_KEY = "AvailableAcceleratorDevices"
 TOTAL_ACCELERATOR_DEVICES_KEY = "TotalAcceleratorDevices"
 USER_NAME_LABEL_KEY = "sagemaker.user/created-by"
diff --git a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py
index cd9074c2..23b827bb 100644
--- a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py
+++ b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py
@@ -1,7 +1,11 @@
 from pydantic import ConfigDict, Field
-from sagemaker.hyperpod.cli.constants.command_constants import INSTANCE_TYPE_LABEL, NEURON_RESOURCE_LIMIT_KEY, \
-    NVIDIA_GPU_RESOURCE_LIMIT_KEY
+from sagemaker.hyperpod.cli.constants.command_constants import (
+    INSTANCE_TYPE_LABEL,
+    NEURON_RESOURCE_LIMIT_KEY,
+    NVIDIA_GPU_RESOURCE_LIMIT_KEY,
+    EFA_RESOURCE_LIMIT_KEY,
+)
 from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config import (
     _HyperPodPytorchJob, HyperPodPytorchJobStatus
 )
@@ -47,6 +51,7 @@
 TRAINING_OPERATOR_LABEL = "hp-training-control-plane"
 NVIDIA_RESOURCE_KEY = NVIDIA_GPU_RESOURCE_LIMIT_KEY
 NEURON_RESOURCE_KEY = NEURON_RESOURCE_LIMIT_KEY
+EFA_RESOURCE_KEY = EFA_RESOURCE_LIMIT_KEY
 
 class HyperPodPytorchJob(_HyperPodPytorchJob):
     """HyperPod PyTorch job for distributed training on Amazon SageMaker HyperPod clusters.
@@ -148,12 +153,12 @@ def _process_replica_resources(cls, data):
         _validate_accelerators_inputs(instance_type, acc_req, acc_lim)
 
         efa = None
-        if requests.get('vpc.amazonaws.com/efa'):
-            efa = int(requests.get('vpc.amazonaws.com/efa'))
+        if requests.get(EFA_RESOURCE_KEY):
+            efa = int(requests.get(EFA_RESOURCE_KEY))
 
         efa_limit = None
-        if limits.get('vpc.amazonaws.com/efa'):
-            efa_limit = int(limits.get('vpc.amazonaws.com/efa'))
+        if limits.get(EFA_RESOURCE_KEY):
+            efa_limit = int(limits.get(EFA_RESOURCE_KEY))
 
         _validate_efa_inputs(instance_type, efa, efa_limit)
 
@@ -178,10 +183,7 @@ def _process_replica_resources(cls, data):
         elif NEURON_RESOURCE_KEY in requests_values:
             acc_lim = requests_values[NEURON_RESOURCE_KEY]
 
-        if efa is not None:
-            requests_values["vpc.amazonaws.com/efa"] = efa
-
-        efa_lim = requests_values.get("vpc.amazonaws.com/efa")
+        efa_lim = requests_values.get(EFA_RESOURCE_KEY)
         if efa_lim is not None:
             efa_lim = int(efa_lim)
 
diff --git a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py
index ff09c125..1d4fe250 100644
--- a/src/sagemaker/hyperpod/training/quota_allocation_util.py
+++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py
@@ -73,12 +73,9 @@ def _get_resources_from_compute_quotas(instance_type: str,
             result["memory"] = memory_value
             result[type_of_accelerator] = accelerators
 
-            if efa is not None:
-                result["vpc.amazonaws.com/efa"] = efa
-            else:
-                efa_count = instance.get("efa", 0)
-                if efa_count > 0:
-                    result["vpc.amazonaws.com/efa"] = efa_count
+            efa_count = efa or instance.get("efa", 0)
+            if efa_count > 0:
+                result["vpc.amazonaws.com/efa"] = efa_count
         else:
             result["cpu"] = vcpu or 0
 
diff --git a/test/unit_tests/training/test_pytorch_job_template_model.py b/test/unit_tests/training/test_pytorch_job_template_model.py
index 043d2024..4895c436 100644
--- a/test/unit_tests/training/test_pytorch_job_template_model.py
+++ b/test/unit_tests/training/test_pytorch_job_template_model.py
@@ -45,22 +45,71 @@ class TestPyTorchJobConfigEFA(unittest.TestCase):
 
     #     # Should also have GPU resources
     #     self.assertEqual(container.resources.requests["nvidia.com/gpu"], "8")
 
-    def test_no_node_count_no_efa(self):
-        """Test that jobs without node_count don't get EFA resources"""
+    def test_instance_without_efa_support_no_efa(self):
+        """Test that instances without EFA support don't get EFA (ml.g5.xlarge doesn't support EFA)"""
+        from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
+
         config = PyTorchJobConfig(
-            job_name="test-no-node-count",
+            job_name="test-no-efa-support",
             image="pytorch:latest",
             accelerators=1,
             instance_type="ml.g5.xlarge"
         )
-
+
         job = config.to_domain()
-        container = job.replicaSpecs[0].template.spec.containers[0]
-
-        # Should not have EFA resources
+        # Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
+        job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
+        container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
+
+        # Should not have EFA resources (instance doesn't support it)
         self.assertNotIn("vpc.amazonaws.com/efa", container.resources.requests)
         self.assertNotIn("vpc.amazonaws.com/efa", container.resources.limits)
+        # Should have GPU resources
+        self.assertIn("nvidia.com/gpu", container.resources.requests)
+
+    def test_accelerators_with_efa_support_gets_default_efa(self):
+        """Test that specifying accelerators on EFA-capable instance gets EFA from constants"""
+        from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
+
+        config = PyTorchJobConfig(
+            job_name="test-accelerators-default-efa",
+            image="pytorch:latest",
+            accelerators=4,
+            instance_type="ml.p4d.24xlarge"
+        )
+
+        job = config.to_domain()
+        # Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
+        job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
+        container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
+
+        # Should have EFA from constants
+        self.assertIn("vpc.amazonaws.com/efa", container.resources.requests)
+        self.assertIn("vpc.amazonaws.com/efa", container.resources.limits)
+        self.assertEqual(int(container.resources.requests["vpc.amazonaws.com/efa"]), 4)
+
+    def test_user_specified_efa_overrides_default(self):
+        """Test that user-specified EFA value overrides the default from constants"""
+        from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
+
+        config = PyTorchJobConfig(
+            job_name="test-custom-efa",
+            image="pytorch:latest",
+            accelerators=4,
+            efa=2,
+            instance_type="ml.p4d.24xlarge"
+        )
+
+        job = config.to_domain()
+        # Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
+        job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
+        container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
+
+        # Should use user-specified EFA value
+        self.assertEqual(int(container.resources.requests["vpc.amazonaws.com/efa"]), 2)
+        self.assertEqual(int(container.resources.limits["vpc.amazonaws.com/efa"]), 2)
 
     # def test_multi_node_with_memory_and_cpu(self):
     #     """Test EFA with other resource types"""
     #     config = PyTorchJobConfig(
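One behavior worth noting in the simplified defaulting above: `efa_count = efa or instance.get("efa", 0)` relies on Python's `or`, so an explicit 0 falls through to the instance default. In practice the caller in hyperpod_pytorch_job.py already treats a 0 request as unset (`if requests.get(EFA_RESOURCE_KEY):`), so the two layers agree. A sketch of the semantics:

    instance = {"efa": 4}                      # e.g. ml.p4d.24xlarge per constants.py

    efa = None                                 # user did not set EFA
    print(efa or instance.get("efa", 0))       # -> 4 (instance default)

    efa = 2                                    # user override
    print(efa or instance.get("efa", 0))       # -> 2

    efa = 0                                    # explicit zero still falls back
    print(efa or instance.get("efa", 0))       # -> 4, not 0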
{"cpu": "64.0", "memory": "256.0Gi", "aws.amazon.com/neurondevice": 8, "vpc.amazonaws.com/efa": 8} def test_get_resources_from_compute_quotas_cpu_only_instance(self): result = _get_resources_from_compute_quotas("ml.c5.large", 1.0, 2.0, 1) @@ -142,14 +142,15 @@ def test_get_resources_from_compute_quotas_accelerators_and_cpu_only(self): @pytest.mark.parametrize( "instance_type,node_count,expected", [ - # GPU instances - ("ml.p4d.24xlarge", 1, {"cpu": "96", "memory": "1152Gi", "nvidia.com/gpu": 8}), - ("ml.p4d.24xlarge", 2, {"cpu": "192", "memory": "2304Gi", "nvidia.com/gpu": 16}), + # GPU instances with EFA support + ("ml.p4d.24xlarge", 1, {"cpu": "96", "memory": "1152Gi", "nvidia.com/gpu": 8, "vpc.amazonaws.com/efa": 4}), + ("ml.p4d.24xlarge", 2, {"cpu": "192", "memory": "2304Gi", "nvidia.com/gpu": 16, "vpc.amazonaws.com/efa": 4}), + # GPU instances without EFA support ("ml.g5.xlarge", 1, {"cpu": "4", "memory": "16Gi", "nvidia.com/gpu": 1}), ("ml.g5.xlarge", 3, {"cpu": "12", "memory": "48Gi", "nvidia.com/gpu": 3}), # Trainium instances - ("ml.trn1.32xlarge", 1, {"cpu": "128", "memory": "512Gi", "aws.amazon.com/neurondevice": 16}), - ("ml.trn1.32xlarge", 2, {"cpu": "256", "memory": "1024Gi", "aws.amazon.com/neurondevice": 32}), + ("ml.trn1.32xlarge", 1, {"cpu": "128", "memory": "512Gi", "aws.amazon.com/neurondevice": 16, "vpc.amazonaws.com/efa": 8}), + ("ml.trn1.32xlarge", 2, {"cpu": "256", "memory": "1024Gi", "aws.amazon.com/neurondevice": 32, "vpc.amazonaws.com/efa": 8}), # CPU-only instances ("ml.c5.large", 1, {"cpu": "2", "memory": "4Gi"}), ("ml.c5.large", 5, {"cpu": "10", "memory": "20Gi"}), From 8d60407ce75d27fa0bb24fd9a80e6747457d0e33 Mon Sep 17 00:00:00 2001 From: Sophia Huang Date: Mon, 15 Dec 2025 11:26:57 -0800 Subject: [PATCH 6/6] Modify efa arg name and fix gpu integ test --- .../v1_1/model.py | 12 +++---- .../v1_1/schema.json | 4 +-- .../hyperpod/training/hyperpod_pytorch_job.py | 12 +++---- .../training/quota_allocation_util.py | 34 +++++++++---------- .../training/cli/test_gpu_quota_allocation.py | 16 ++++----- .../test_pytorch_job_template_model.py | 2 +- 6 files changed, 40 insertions(+), 40 deletions(-) diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py index 348f3f6d..d1770c6e 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py @@ -195,12 +195,12 @@ class PyTorchJobConfig(BaseModel): default=None, description="Limit for the amount of memory in GiB", ) - efa: Optional[int] = Field( + efa_interfaces: Optional[int] = Field( default=None, description="Number of EFA interfaces for the instance", ge=0 ) - efa_limit: Optional[int] = Field( + efa_interfaces_limit: Optional[int] = Field( default=None, description="Limit for the number of EFA interfaces", ge=0 @@ -464,26 +464,26 @@ def build_dict(**kwargs): **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {}, vcpu=str(self.vcpu) if self.vcpu else None, memory=str(self.memory) if self.memory else None, - **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}, + **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {}, ) limits_value = build_dict( **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {}, vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, 
From 8d60407ce75d27fa0bb24fd9a80e6747457d0e33 Mon Sep 17 00:00:00 2001
From: Sophia Huang
Date: Mon, 15 Dec 2025 11:26:57 -0800
Subject: [PATCH 6/6] Modify efa arg name and fix gpu integ test

---
 .../v1_1/model.py                             | 12 +++----
 .../v1_1/schema.json                          |  4 +--
 .../hyperpod/training/hyperpod_pytorch_job.py | 12 +++----
 .../training/quota_allocation_util.py         | 34 +++++++++----------
 .../training/cli/test_gpu_quota_allocation.py | 16 ++++-----
 .../test_pytorch_job_template_model.py        |  2 +-
 6 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
index 348f3f6d..d1770c6e 100644
--- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
+++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
@@ -195,12 +195,12 @@ class PyTorchJobConfig(BaseModel):
         default=None,
         description="Limit for the amount of memory in GiB",
     )
-    efa: Optional[int] = Field(
+    efa_interfaces: Optional[int] = Field(
         default=None,
         description="Number of EFA interfaces for the instance",
         ge=0
     )
-    efa_limit: Optional[int] = Field(
+    efa_interfaces_limit: Optional[int] = Field(
         default=None,
         description="Limit for the number of EFA interfaces",
         ge=0
@@ -464,26 +464,26 @@ def build_dict(**kwargs):
             requests_value = build_dict(
                 **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
                 vcpu=str(self.vcpu) if self.vcpu else None,
                 memory=str(self.memory) if self.memory else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
             )
             limits_value = build_dict(
                 **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
                 vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
                 memory=str(self.memory_limit) if self.memory_limit else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
             )
         else:
             requests_value = build_dict(
                 accelerators=str(self.accelerators) if self.accelerators else None,
                 vcpu=str(self.vcpu) if self.vcpu else None,
                 memory=str(self.memory) if self.memory else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
             )
             limits_value = build_dict(
                 accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
                 vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
                 memory=str(self.memory_limit) if self.memory_limit else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
             )
 
     # Build container
diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json
index f5f87b24..d19ec4de 100644
--- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json
+++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json
@@ -305,12 +305,12 @@
       "minimum": 0,
       "description": "Limit for the amount of memory in GiB"
     },
-    "efa": {
+    "efa_interfaces": {
       "type": "integer",
       "minimum": 0,
       "description": "Number of EFA interfaces for the instance"
     },
-    "efa_limit": {
+    "efa_interfaces_limit": {
       "type": "integer",
       "minimum": 0,
       "description": "Limit for the number of EFA interfaces"
diff --git a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py
index 23b827bb..dbd82528 100644
--- a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py
+++ b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py
@@ -152,15 +152,15 @@ def _process_replica_resources(cls, data):
         acc_req, acc_lim = _set_default_accelerators_val(instance_type, accelerators, accelerators_limit)
         _validate_accelerators_inputs(instance_type, acc_req, acc_lim)
 
-        efa = None
+        efa_interfaces = None
         if requests.get(EFA_RESOURCE_KEY):
-            efa = int(requests.get(EFA_RESOURCE_KEY))
+            efa_interfaces = int(requests.get(EFA_RESOURCE_KEY))
 
-        efa_limit = None
+        efa_interfaces_limit = None
         if limits.get(EFA_RESOURCE_KEY):
-            efa_limit = int(limits.get(EFA_RESOURCE_KEY))
+            efa_interfaces_limit = int(limits.get(EFA_RESOURCE_KEY))
 
-        _validate_efa_inputs(instance_type, efa, efa_limit)
+        _validate_efa_inputs(instance_type, efa_interfaces, efa_interfaces_limit)
 
         accelerator_partition_type, accelerator_partition_count, accelerator_partition_limit = (
             _get_accelerator_partition(requests, limits)
@@ -174,7 +174,7 @@ def _process_replica_resources(cls, data):
         acc_partition_req, acc_partition_lim = _set_default_accelerator_partition_val(accelerator_partition_count, accelerator_partition_limit)
 
-        requests_values = _get_resources_from_compute_quotas(instance_type, vcpu, memory, acc_req, accelerator_partition_type, acc_partition_req, efa)
+        requests_values = _get_resources_from_compute_quotas(instance_type, vcpu, memory, acc_req, accelerator_partition_type, acc_partition_req, efa_interfaces)
         if requests_values is None:
             requests_values = _get_resources_from_instance(instance_type, node_count=1)
             _trim_resource_requests(instance_type, requests_values)
diff --git a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py
index 1d4fe250..93c3258a 100644
--- a/src/sagemaker/hyperpod/training/quota_allocation_util.py
+++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py
@@ -34,7 +34,7 @@ def _get_resources_from_compute_quotas(instance_type: str,
                                        accelerators: Optional[int] = 0,
                                        accelerator_partition_type: Optional[str] = None,
                                        accelerator_partition_count: Optional[int] = None,
-                                       efa: Optional[int] = None) -> Optional[dict]:
+                                       efa_interfaces: Optional[int] = None) -> Optional[dict]:
     has_accelerator_partition = accelerator_partition_type is not None and accelerator_partition_count is not None
     has_compute_resources = _has_compute_resource_quota_allocation_resources(memory_in_gib, vcpu, accelerators)
 
@@ -73,7 +73,7 @@ def _get_resources_from_compute_quotas(instance_type: str,
             result["memory"] = memory_value
             result[type_of_accelerator] = accelerators
 
-            efa_count = efa or instance.get("efa", 0)
+            efa_count = efa_interfaces or instance.get("efa", 0)
             if efa_count > 0:
                 result["vpc.amazonaws.com/efa"] = efa_count
 
@@ -135,7 +135,7 @@ def _trim_resource_requests(instance_type: str, requests_values: dict) -> dict:
     return requests_values
 
 
-def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_limit: Optional[float], accelerators_limit: Optional[int], accelerator_partition_type: Optional[str], accelerator_partition_limit: Optional[int], efa_limit: Optional[int] = None) -> dict:
+def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_limit: Optional[float], accelerators_limit: Optional[int], accelerator_partition_type: Optional[str], accelerator_partition_limit: Optional[int], efa_interfaces_limit: Optional[int] = None) -> dict:
     result = {}
 
     type_of_accelerator, _max_accelerator_per_instance = _get_accelerator_type_and_count(instance_type)
@@ -154,8 +154,8 @@
     if memory_in_gib_limit is not None:
         result["memory"] = str(memory_in_gib_limit) + "Gi"
 
-    if efa_limit is not None and efa_limit > 0:
-        result["vpc.amazonaws.com/efa"] = efa_limit
+    if efa_interfaces_limit is not None and efa_interfaces_limit > 0:
+        result["vpc.amazonaws.com/efa"] = efa_interfaces_limit
 
     return result
@@ -226,29 +226,29 @@ def _validate_accelerators_inputs(instance_type: str, accelerators_request: int,
         raise ValueError('Requested accelerators exceeds capacity')
 
 
-def _validate_efa_inputs(instance_type: str, efa_request: Optional[int], efa_limit: Optional[int]) -> None:
+def _validate_efa_inputs(instance_type: str, efa_interfaces: Optional[int], efa_interfaces_limit: Optional[int]) -> None:
     """Validate EFA inputs similar to accelerator validation."""
     instance = INSTANCE_RESOURCES.get(instance_type, {})
     max_efa_per_instance = instance.get("efa", 0)
 
     # Check if user provided EFA values but instance doesn't support EFA
-    if max_efa_per_instance == 0 and (efa_request is not None or efa_limit is not None):
+    if max_efa_per_instance == 0 and (efa_interfaces is not None or efa_interfaces_limit is not None):
         raise ValueError(
             f"Instance type {instance_type} does not support EFA, but EFA values were provided.")
 
     # Validate EFA values if instance supports EFA
     if max_efa_per_instance > 0:
-        if efa_request is not None and efa_limit is not None:
-            if efa_request != efa_limit:
+        if efa_interfaces is not None and efa_interfaces_limit is not None:
+            if efa_interfaces != efa_interfaces_limit:
                 raise ValueError('EFA request must equal EFA limit')
-            if efa_limit > max_efa_per_instance:
-                raise ValueError(f'Requested EFA limit ({efa_limit}) exceeds instance capacity ({max_efa_per_instance})')
-            if efa_request > max_efa_per_instance:
-                raise ValueError(f'Requested EFA ({efa_request}) exceeds instance capacity ({max_efa_per_instance})')
-        elif efa_request is not None and efa_request > max_efa_per_instance:
-            raise ValueError(f'Requested EFA ({efa_request}) exceeds instance capacity ({max_efa_per_instance})')
-        elif efa_limit is not None and efa_limit > max_efa_per_instance:
-            raise ValueError(f'Requested EFA limit ({efa_limit}) exceeds instance capacity ({max_efa_per_instance})')
+            if efa_interfaces_limit > max_efa_per_instance:
+                raise ValueError(f'Requested EFA limit ({efa_interfaces_limit}) exceeds instance capacity ({max_efa_per_instance})')
+            if efa_interfaces > max_efa_per_instance:
+                raise ValueError(f'Requested EFA ({efa_interfaces}) exceeds instance capacity ({max_efa_per_instance})')
+        elif efa_interfaces is not None and efa_interfaces > max_efa_per_instance:
+            raise ValueError(f'Requested EFA ({efa_interfaces}) exceeds instance capacity ({max_efa_per_instance})')
+        elif efa_interfaces_limit is not None and efa_interfaces_limit > max_efa_per_instance:
+            raise ValueError(f'Requested EFA limit ({efa_interfaces_limit}) exceeds instance capacity ({max_efa_per_instance})')
diff --git a/test/integration_tests/training/cli/test_gpu_quota_allocation.py b/test/integration_tests/training/cli/test_gpu_quota_allocation.py
index 506c387b..a2a5e912 100644
--- a/test/integration_tests/training/cli/test_gpu_quota_allocation.py
+++ b/test/integration_tests/training/cli/test_gpu_quota_allocation.py
@@ -53,8 +53,8 @@ def test_create_job_with_integer_quota_parameters(self, test_job_name):
         result = execute_command(describe_cmd)
         logger.info(f"describe result: {result}")
         assert result.returncode == 0
-        assert " Limits: {'cpu': '4', 'memory': '2Gi', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '3', 'memory': '1Gi', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'cpu': '4', 'memory': '2Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '3', 'memory': '1Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
 
         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",
@@ -103,8 +103,8 @@ def test_create_job_with_float_quota_parameters(self, test_job_name):
         ]
         result = execute_command(describe_cmd)
         assert result.returncode == 0
-        assert " Limits: {'cpu': '4800m', 'memory': '2899102924800m', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '3600m', 'memory': '1Gi', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'cpu': '4800m', 'memory': '2899102924800m', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '3600m', 'memory': '1Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
 
         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",
@@ -149,8 +149,8 @@ def test_create_job_with_only_accelerators_parameter(self, test_job_name):
         ]
         result = execute_command(describe_cmd)
         assert result.returncode == 0
-        assert " Limits: {'memory': '104Gi', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '29', 'memory': '104Gi', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'memory': '104Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '29', 'memory': '104Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
 
         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",
@@ -196,8 +196,8 @@ def test_create_job_with_accelerators_memory_parameters(self, test_job_name):
         time.sleep(5)
 
         assert result.returncode == 0
-        assert " Limits: {'memory': '2899102924800m', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '29', 'memory': '2040109465600m', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'memory': '2899102924800m', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '29', 'memory': '2040109465600m', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
 
         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",
diff --git a/test/unit_tests/training/test_pytorch_job_template_model.py b/test/unit_tests/training/test_pytorch_job_template_model.py
index 4895c436..4ee462bb 100644
--- a/test/unit_tests/training/test_pytorch_job_template_model.py
+++ b/test/unit_tests/training/test_pytorch_job_template_model.py
@@ -97,7 +97,7 @@ def test_user_specified_efa_overrides_default(self):
             job_name="test-custom-efa",
             image="pytorch:latest",
             accelerators=4,
-            efa=2,
+            efa_interfaces=2,
             instance_type="ml.p4d.24xlarge"
         )
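After patch 6, end-to-end usage of the renamed fields might look like this (a sketch only: the import path is assumed from the template package layout, and to_domain()/allocate_quotas_if_applicable() mirror the unit tests above; values are illustrative):

    from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig
    from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob

    config = PyTorchJobConfig(
        job_name="efa-demo",
        image="pytorch:latest",
        instance_type="ml.p4d.24xlarge",
        accelerators=8,
        efa_interfaces=4,        # request; must equal the limit when both are set
        efa_interfaces_limit=4,  # capped at 4 for ml.p4d.24xlarge per INSTANCE_RESOURCES
    )
    job = config.to_domain()
    job = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
    # The container spec should now carry vpc.amazonaws.com/efa: "4" in both
    # requests and limits.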