diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
index 085f07a3..d1770c6e 100644
--- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
+++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
@@ -195,6 +195,16 @@ class PyTorchJobConfig(BaseModel):
         default=None,
         description="Limit for the amount of memory in GiB",
     )
+    efa_interfaces: Optional[int] = Field(
+        default=None,
+        description="Number of EFA interfaces for the instance",
+        ge=0,
+    )
+    efa_interfaces_limit: Optional[int] = Field(
+        default=None,
+        description="Limit for the number of EFA interfaces",
+        ge=0,
+    )
     accelerator_partition_type: Optional[str] = Field(
         default=None,
         description="Type of accelerator partition"
@@ -453,23 +463,27 @@ def build_dict(**kwargs):
             requests_value = build_dict(
                 **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
                 vcpu=str(self.vcpu) if self.vcpu else None,
-                memory=str(self.memory) if self.memory else None
+                memory=str(self.memory) if self.memory else None,
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
             )
             limits_value = build_dict(
                 **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
                 vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
-                memory=str(self.memory_limit) if self.memory_limit else None
+                memory=str(self.memory_limit) if self.memory_limit else None,
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
             )
         else:
             requests_value = build_dict(
                 accelerators=str(self.accelerators) if self.accelerators else None,
                 vcpu=str(self.vcpu) if self.vcpu else None,
-                memory=str(self.memory) if self.memory else None
+                memory=str(self.memory) if self.memory else None,
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
             )
             limits_value = build_dict(
                 accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
                 vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
-                memory=str(self.memory_limit) if self.memory_limit else None
+                memory=str(self.memory_limit) if self.memory_limit else None,
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
             )
 
         # Build container
diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json
index 83bd6120..d19ec4de 100644
--- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json
+++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json
@@ -305,6 +305,16 @@
       "minimum": 0,
       "description": "Limit for the amount of memory in GiB"
     },
+    "efa_interfaces": {
+      "type": "integer",
+      "minimum": 0,
+      "description": "Number of EFA interfaces for the instance"
+    },
+    "efa_interfaces_limit": {
+      "type": "integer",
+      "minimum": 0,
+      "description": "Limit for the number of EFA interfaces"
+    },
     "accelerator_partition_type": {
       "type": "string",
       "enum": [
diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py
index 1a61f6df..63278d2c 100644
--- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py
+++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py
@@ -97,14 +97,14 @@
 {%- if memory %}
             memory: {{ memory }}Gi
 {%- endif %}
-{%- if (node_count and node_count > 1) %}
-            vpc.amazonaws.com/efa: 1
+{%- if efa and efa > 0 %}
+            vpc.amazonaws.com/efa: {{ efa }}
 {%- endif %}
 {%- else %}
           requests:
             nvidia.com/gpu: "0"
 {%- endif %}
-{%- if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit %}
+{%- if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit or efa_limit %}
           limits:
 {%- if accelerator_partition_type and accelerator_partition_limit %}
             nvidia.com/{{ accelerator_partition_type }}: {{ accelerator_partition_limit }}
@@ -117,8 +117,8 @@
 {%- if memory_limit %}
             memory: {{ memory_limit }}Gi
 {%- endif %}
-{%- if (node_count and node_count > 1) %}
-            vpc.amazonaws.com/efa: 1
+{%- if efa_limit and efa_limit > 0 %}
+            vpc.amazonaws.com/efa: {{ efa_limit }}
 {%- endif %}
 {%- else %}
           limits:
diff --git a/src/sagemaker/hyperpod/cli/constants/command_constants.py b/src/sagemaker/hyperpod/cli/constants/command_constants.py
index 3fc96606..a944fb5f 100644
--- a/src/sagemaker/hyperpod/cli/constants/command_constants.py
+++ b/src/sagemaker/hyperpod/cli/constants/command_constants.py
@@ -45,6 +45,7 @@
 SAGEMAKER_TRAINING_LAUNCHER_DIR = str(Path(__file__).parent.parent / "sagemaker_hyperpod_recipes")
 NVIDIA_GPU_RESOURCE_LIMIT_KEY = "nvidia.com/gpu"
 NEURON_RESOURCE_LIMIT_KEY = "aws.amazon.com/neurondevice"
+EFA_RESOURCE_LIMIT_KEY = "vpc.amazonaws.com/efa"
 AVAILABLE_ACCELERATOR_DEVICES_KEY = "AvailableAcceleratorDevices"
 TOTAL_ACCELERATOR_DEVICES_KEY = "TotalAcceleratorDevices"
 USER_NAME_LABEL_KEY = "sagemaker.user/created-by"
diff --git a/src/sagemaker/hyperpod/training/constants.py b/src/sagemaker/hyperpod/training/constants.py
index 32fdc8a2..3fa55217 100644
--- a/src/sagemaker/hyperpod/training/constants.py
+++ b/src/sagemaker/hyperpod/training/constants.py
@@ -1,119 +1,119 @@
 # TODO: currently there is no API for instances and they are hardcoded; post GA work with partner team on adding support for such API
 INSTANCE_RESOURCES = {
-    "ml.p4d.24xlarge": {"cpu": 96, "gpu": 8, "trainium": 0, "memory": 1152},
-    "ml.p4de.24xlarge": {"cpu": 96, "gpu": 8, "trainium": 0, "memory": 1152},
-    "ml.p5.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048},
-    "ml.p5.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 256},
-    "ml.trn1.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 16, "memory": 512},
-    "ml.trn1n.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 16, "memory": 512},
-    "ml.g5.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 16},
-    "ml.g5.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 32},
-    "ml.g5.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 64},
-    "ml.g5.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 128},
-    "ml.g5.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 192},
-    "ml.g5.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 256},
-    "ml.g5.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 384},
-    "ml.g5.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 768},
-    "ml.g6.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 16},
-    "ml.g6.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 32},
-    "ml.g6.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 64},
-    "ml.g6.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 128},
-    "ml.g6.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 256},
-    "ml.g6.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 192},
-    "ml.g6.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 384},
-    "ml.g6.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 768},
-    "ml.gr6.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 128},
-    "ml.gr6.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 256},
-    "ml.g6e.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 32},
-    "ml.g6e.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 64},
-    "ml.g6e.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 128},
-    "ml.g6e.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 256},
-    "ml.g6e.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 512},
-    "ml.g6e.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 384},
-    "ml.g6e.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 768},
-    "ml.g6e.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 1536},
-    "ml.p5e.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048},
-    "ml.p5en.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048},
-    "ml.trn2.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 16, "memory": 2048},
-    "ml.p6e-gb200.36xlarge": {"cpu": 144, "gpu": 4, "trainium": 0, "memory": 960},
-    "ml.p6-b200.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2024},
-    "ml.c5.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4},
-    "ml.c5.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 8},
-    "ml.c5.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 16},
-    "ml.c5.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 32},
-    "ml.c5.9xlarge": {"cpu": 36, "gpu": 0, "trainium": 0, "memory": 72},
-    "ml.c5.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 96},
-    "ml.c5.18xlarge": {"cpu": 72, "gpu": 0, "trainium": 0, "memory": 144},
-    "ml.c5.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 192},
-    "ml.c5n.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 5},
-    "ml.c5n.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 21},
-    "ml.c5n.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 42},
-    "ml.c5n.9xlarge": {"cpu": 36, "gpu": 0, "trainium": 0, "memory": 96},
-    "ml.c5n.18xlarge": {"cpu": 72, "gpu": 0, "trainium": 0, "memory": 192},
-    "ml.m5.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8},
-    "ml.m5.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16},
-    "ml.m5.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32},
-    "ml.m5.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64},
-    "ml.m5.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128},
-    "ml.m5.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192},
-    "ml.m5.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256},
-    "ml.m5.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384},
-    "ml.t3.medium": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4},
-    "ml.t3.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8},
-    "ml.t3.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16},
-    "ml.t3.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32},
-    "ml.c6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4},
-    "ml.c6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 8},
-    "ml.c6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 16},
-    "ml.c6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 32},
-    "ml.c6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 64},
-    "ml.c6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 96},
-    "ml.c6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 128},
-    "ml.c6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 192},
-    "ml.c6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 256},
-    "ml.m6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8},
-    "ml.m6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16},
-    "ml.m6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32},
-    "ml.m6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64},
-    "ml.m6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128},
-    "ml.m6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192},
-    "ml.m6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256},
-    "ml.m6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384},
-    "ml.m6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 512},
-    "ml.r6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16},
-    "ml.r6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32},
-    "ml.r6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64},
-    "ml.r6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 128},
-    "ml.r6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 256},
-    "ml.r6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384},
-    "ml.r6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 512},
-    "ml.r6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768},
-    "ml.r6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 1024},
-    "ml.m7i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8},
-    "ml.m7i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16},
-    "ml.m7i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32},
-    "ml.m7i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64},
-    "ml.m7i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128},
-    "ml.m7i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192},
-    "ml.m7i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256},
-    "ml.m7i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384},
-    "ml.m7i.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 0, "memory": 768},
-    "ml.r7i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16},
-    "ml.r7i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32},
-    "ml.r7i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64},
-    "ml.r7i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 128},
-    "ml.r7i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 256},
-    "ml.r7i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384},
-    "ml.r7i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 512},
-    "ml.r7i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768},
-    "ml.r7i.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 0, "memory": 1536},
-    "ml.i3en.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16},
-    "ml.i3en.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32},
-    "ml.i3en.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64},
-    "ml.i3en.3xlarge": {"cpu": 12, "gpu": 0, "trainium": 0, "memory": 96},
-    "ml.i3en.6xlarge": {"cpu": 24, "gpu": 0, "trainium": 0, "memory": 192},
-    "ml.i3en.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384},
-    "ml.i3en.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768}
+    "ml.p4d.24xlarge": {"cpu": 96, "gpu": 8, "trainium": 0, "memory": 1152, "efa": 4},
+    "ml.p4de.24xlarge": {"cpu": 96, "gpu": 8, "trainium": 0, "memory": 1152, "efa": 4},
+    "ml.p5.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048, "efa": 32},
+    "ml.p5.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 256, "efa": 1},
+    "ml.trn1.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 16, "memory": 512, "efa": 8},
+    "ml.trn1n.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 16, "memory": 512, "efa": 16},
+    "ml.g5.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 16, "efa": 0},
+    "ml.g5.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.g5.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 64, "efa": 0},
+    "ml.g5.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 128, "efa": 1},
+    "ml.g5.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 192, "efa": 1},
+    "ml.g5.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 256, "efa": 1},
+    "ml.g5.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 384, "efa": 1},
+    "ml.g5.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 768, "efa": 1},
+    "ml.g6.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 16, "efa": 0},
+    "ml.g6.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.g6.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 64, "efa": 0},
+    "ml.g6.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 128, "efa": 1},
+    "ml.g6.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 256, "efa": 1},
+    "ml.g6.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 192, "efa": 1},
+    "ml.g6.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 384, "efa": 1},
+    "ml.g6.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 768, "efa": 1},
+    "ml.gr6.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 128, "efa": 0},
+    "ml.gr6.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 256, "efa": 1},
+    "ml.g6e.xlarge": {"cpu": 4, "gpu": 1, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.g6e.2xlarge": {"cpu": 8, "gpu": 1, "trainium": 0, "memory": 64, "efa": 0},
+    "ml.g6e.4xlarge": {"cpu": 16, "gpu": 1, "trainium": 0, "memory": 128, "efa": 0},
+    "ml.g6e.8xlarge": {"cpu": 32, "gpu": 1, "trainium": 0, "memory": 256, "efa": 1},
+    "ml.g6e.16xlarge": {"cpu": 64, "gpu": 1, "trainium": 0, "memory": 512, "efa": 1},
+    "ml.g6e.12xlarge": {"cpu": 48, "gpu": 4, "trainium": 0, "memory": 384, "efa": 1},
+    "ml.g6e.24xlarge": {"cpu": 96, "gpu": 4, "trainium": 0, "memory": 768, "efa": 2},
+    "ml.g6e.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 1536, "efa": 4},
+    "ml.p5e.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048, "efa": 32},
+    "ml.p5en.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2048, "efa": 16},
+    "ml.trn2.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 16, "memory": 2048, "efa": 0},
+    "ml.p6e-gb200.36xlarge": {"cpu": 144, "gpu": 4, "trainium": 0, "memory": 960, "efa": 0},
+    "ml.p6-b200.48xlarge": {"cpu": 192, "gpu": 8, "trainium": 0, "memory": 2024, "efa": 8},
+    "ml.c5.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4, "efa": 0},
+    "ml.c5.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0},
+    "ml.c5.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0},
+    "ml.c5.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.c5.9xlarge": {"cpu": 36, "gpu": 0, "trainium": 0, "memory": 72, "efa": 0},
+    "ml.c5.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 96, "efa": 0},
+    "ml.c5.18xlarge": {"cpu": 72, "gpu": 0, "trainium": 0, "memory": 144, "efa": 0},
+    "ml.c5.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0},
+    "ml.c5n.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 5, "efa": 0},
+    "ml.c5n.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 21, "efa": 0},
+    "ml.c5n.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 42, "efa": 0},
+    "ml.c5n.9xlarge": {"cpu": 36, "gpu": 0, "trainium": 0, "memory": 96, "efa": 1},
+    "ml.c5n.18xlarge": {"cpu": 72, "gpu": 0, "trainium": 0, "memory": 192, "efa": 1},
+    "ml.m5.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0},
+    "ml.m5.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0},
+    "ml.m5.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.m5.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0},
+    "ml.m5.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0},
+    "ml.m5.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0},
+    "ml.m5.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256, "efa": 0},
+    "ml.m5.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384, "efa": 0},
+    "ml.t3.medium": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4, "efa": 0},
+    "ml.t3.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0},
+    "ml.t3.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0},
+    "ml.t3.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.c6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 4, "efa": 0},
+    "ml.c6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0},
+    "ml.c6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0},
+    "ml.c6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.c6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0},
+    "ml.c6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 96, "efa": 0},
+    "ml.c6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0},
+    "ml.c6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0},
+    "ml.c6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 256, "efa": 1},
+    "ml.m6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0},
+    "ml.m6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0},
+    "ml.m6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.m6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0},
+    "ml.m6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0},
+    "ml.m6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0},
+    "ml.m6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256, "efa": 0},
+    "ml.m6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384, "efa": 0},
+    "ml.m6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 512, "efa": 1},
+    "ml.r6i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0},
+    "ml.r6i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.r6i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0},
+    "ml.r6i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0},
+    "ml.r6i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 256, "efa": 0},
+    "ml.r6i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384, "efa": 0},
+    "ml.r6i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 512, "efa": 0},
+    "ml.r6i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768, "efa": 0},
+    "ml.r6i.32xlarge": {"cpu": 128, "gpu": 0, "trainium": 0, "memory": 1024, "efa": 1},
+    "ml.m7i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 8, "efa": 0},
+    "ml.m7i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0},
+    "ml.m7i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.m7i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0},
+    "ml.m7i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0},
+    "ml.m7i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0},
+    "ml.m7i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 256, "efa": 0},
+    "ml.m7i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 384, "efa": 0},
+    "ml.m7i.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 0, "memory": 768, "efa": 1},
+    "ml.r7i.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0},
+    "ml.r7i.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.r7i.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0},
+    "ml.r7i.4xlarge": {"cpu": 16, "gpu": 0, "trainium": 0, "memory": 128, "efa": 0},
+    "ml.r7i.8xlarge": {"cpu": 32, "gpu": 0, "trainium": 0, "memory": 256, "efa": 0},
+    "ml.r7i.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384, "efa": 0},
+    "ml.r7i.16xlarge": {"cpu": 64, "gpu": 0, "trainium": 0, "memory": 512, "efa": 0},
+    "ml.r7i.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768, "efa": 0},
+    "ml.r7i.48xlarge": {"cpu": 192, "gpu": 0, "trainium": 0, "memory": 1536, "efa": 1},
+    "ml.i3en.large": {"cpu": 2, "gpu": 0, "trainium": 0, "memory": 16, "efa": 0},
+    "ml.i3en.xlarge": {"cpu": 4, "gpu": 0, "trainium": 0, "memory": 32, "efa": 0},
+    "ml.i3en.2xlarge": {"cpu": 8, "gpu": 0, "trainium": 0, "memory": 64, "efa": 0},
+    "ml.i3en.3xlarge": {"cpu": 12, "gpu": 0, "trainium": 0, "memory": 96, "efa": 0},
+    "ml.i3en.6xlarge": {"cpu": 24, "gpu": 0, "trainium": 0, "memory": 192, "efa": 0},
+    "ml.i3en.12xlarge": {"cpu": 48, "gpu": 0, "trainium": 0, "memory": 384, "efa": 1},
+    "ml.i3en.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768, "efa": 1}
 }
 
 # MIG profiles by instance type
diff --git a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py
index c4d548be..dbd82528 100644
--- a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py
+++ b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py
@@ -1,7 +1,11 @@
 from pydantic import ConfigDict, Field
-from sagemaker.hyperpod.cli.constants.command_constants import INSTANCE_TYPE_LABEL, NEURON_RESOURCE_LIMIT_KEY, \
-    NVIDIA_GPU_RESOURCE_LIMIT_KEY
+from sagemaker.hyperpod.cli.constants.command_constants import (
+    INSTANCE_TYPE_LABEL,
+    NEURON_RESOURCE_LIMIT_KEY,
+    NVIDIA_GPU_RESOURCE_LIMIT_KEY,
+    EFA_RESOURCE_LIMIT_KEY,
+)
 from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config import (
     _HyperPodPytorchJob, HyperPodPytorchJobStatus
 )
@@ -29,6 +33,7 @@
     _resolve_default_memory_values,
     _set_default_accelerators_val,
     _validate_accelerators_inputs,
+    _validate_efa_inputs,
     _resolve_default_cpu_values,
     _trim_resource_requests,
 )
@@ -46,6 +51,7 @@
 TRAINING_OPERATOR_LABEL = "hp-training-control-plane"
 NVIDIA_RESOURCE_KEY = NVIDIA_GPU_RESOURCE_LIMIT_KEY
 NEURON_RESOURCE_KEY = NEURON_RESOURCE_LIMIT_KEY
+EFA_RESOURCE_KEY = EFA_RESOURCE_LIMIT_KEY
 
 class HyperPodPytorchJob(_HyperPodPytorchJob):
     """HyperPod PyTorch job for distributed training on Amazon SageMaker HyperPod clusters.
@@ -146,6 +152,16 @@ def _process_replica_resources(cls, data):
         acc_req, acc_lim = _set_default_accelerators_val(instance_type, accelerators, accelerators_limit)
         _validate_accelerators_inputs(instance_type, acc_req, acc_lim)
 
+        efa_interfaces = None
+        if requests.get(EFA_RESOURCE_KEY):
+            efa_interfaces = int(requests.get(EFA_RESOURCE_KEY))
+
+        efa_interfaces_limit = None
+        if limits.get(EFA_RESOURCE_KEY):
+            efa_interfaces_limit = int(limits.get(EFA_RESOURCE_KEY))
+
+        _validate_efa_inputs(instance_type, efa_interfaces, efa_interfaces_limit)
+
         accelerator_partition_type, accelerator_partition_count, accelerator_partition_limit = (
             _get_accelerator_partition(requests, limits)
         )
@@ -158,8 +174,7 @@ def _process_replica_resources(cls, data):
         acc_partition_req, acc_partition_lim = _set_default_accelerator_partition_val(accelerator_partition_count,
                                                                                       accelerator_partition_limit)
 
-        # Calculate resource values
-        requests_values = _get_resources_from_compute_quotas(instance_type, vcpu, memory, acc_req, accelerator_partition_type, acc_partition_req)
+        requests_values = _get_resources_from_compute_quotas(instance_type, vcpu, memory, acc_req, accelerator_partition_type, acc_partition_req, efa_interfaces)
         if requests_values is None:
             requests_values = _get_resources_from_instance(instance_type, node_count=1)
         _trim_resource_requests(instance_type, requests_values)
@@ -168,7 +183,11 @@ def _process_replica_resources(cls, data):
         elif NEURON_RESOURCE_KEY in requests_values:
             acc_lim = requests_values[NEURON_RESOURCE_KEY]
 
-        limits_values = _get_limits(instance_type, vcpu_limit, memory_limit, acc_lim, accelerator_partition_type, acc_partition_lim)
+        efa_lim = requests_values.get(EFA_RESOURCE_KEY)
+        if efa_lim is not None:
+            efa_lim = int(efa_lim)
+
+        limits_values = _get_limits(instance_type, vcpu_limit, memory_limit, acc_lim, accelerator_partition_type, acc_partition_lim, efa_lim)
 
         _resolve_default_memory_values(instance_type, requests_values, limits_values)
         _resolve_default_cpu_values(instance_type, requests_values)
diff --git a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py
index 291bf3c2..93c3258a 100644
--- a/src/sagemaker/hyperpod/training/quota_allocation_util.py
+++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py
@@ -33,7 +33,8 @@ def _get_resources_from_compute_quotas(instance_type: str,
                                        memory_in_gib: Optional[float],
                                        accelerators: Optional[int] = 0,
                                        accelerator_partition_type: Optional[str] = None,
-                                       accelerator_partition_count: Optional[int] = None) -> Optional[dict]:
+                                       accelerator_partition_count: Optional[int] = None,
+                                       efa_interfaces: Optional[int] = None) -> Optional[dict]:
     has_accelerator_partition = accelerator_partition_type is not None and accelerator_partition_count is not None
     has_compute_resources = _has_compute_resource_quota_allocation_resources(memory_in_gib, vcpu, accelerators)
 
@@ -71,6 +72,10 @@ def _get_resources_from_compute_quotas(instance_type: str,
             memory_value = memory_in_gib or (gpu_ratio * instance.get("memory", 0))
             result["memory"] = memory_value
             result[type_of_accelerator] = accelerators
+
+            efa_count = efa_interfaces or instance.get("efa", 0)
+            if efa_count > 0:
+                result["vpc.amazonaws.com/efa"] = efa_count
         else:
             result["cpu"] = vcpu or 0
@@ -99,6 +104,10 @@ def _get_resources_from_instance(instance_type: str, node_count: int) -> dict:
     if type_of_accelerator is not None:
         result[type_of_accelerator] = max_accelerator_per_instance * node_count
 
+    efa_count = instance.get("efa", 0)
+    if efa_count > 0:
+        result["vpc.amazonaws.com/efa"] = efa_count
+
     result["cpu"] = f"{result['cpu']}"
     result["memory"] = f"{result['memory']}Gi"
     return result
@@ -126,7 +135,7 @@ def _trim_resource_requests(instance_type: str, requests_values: dict) -> dict:
 
     return requests_values
 
-def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_limit: Optional[float], accelerators_limit: Optional[int], accelerator_partition_type: Optional[str], accelerator_partition_limit: Optional[int]) -> dict:
+def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_limit: Optional[float], accelerators_limit: Optional[int], accelerator_partition_type: Optional[str], accelerator_partition_limit: Optional[int], efa_interfaces_limit: Optional[int] = None) -> dict:
     result = {}
     type_of_accelerator, _max_accelerator_per_instance = _get_accelerator_type_and_count(instance_type)
@@ -145,6 +154,9 @@ def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_l
     if memory_in_gib_limit is not None:
         result["memory"] = str(memory_in_gib_limit) + "Gi"
 
+    if efa_interfaces_limit is not None and efa_interfaces_limit > 0:
+        result["vpc.amazonaws.com/efa"] = efa_interfaces_limit
+
     return result
 
@@ -214,6 +226,31 @@ def _validate_accelerators_inputs(instance_type: str, accelerators_request: int,
             raise ValueError('Requested accelerators exceeds capacity')
 
 
+def _validate_efa_inputs(instance_type: str, efa_interfaces: Optional[int], efa_interfaces_limit: Optional[int]) -> None:
+    """Validate EFA inputs, mirroring accelerator validation."""
+    instance = INSTANCE_RESOURCES.get(instance_type, {})
+    max_efa_per_instance = instance.get("efa", 0)
+
+    # Check if user provided EFA values but instance doesn't support EFA
+    if max_efa_per_instance == 0 and (efa_interfaces is not None or efa_interfaces_limit is not None):
+        raise ValueError(
+            f"Instance type {instance_type} does not support EFA, but EFA values were provided.")
+
+    # Validate EFA values if instance supports EFA
+    if max_efa_per_instance > 0:
+        if efa_interfaces is not None and efa_interfaces_limit is not None:
+            if efa_interfaces != efa_interfaces_limit:
+                raise ValueError('EFA request must equal EFA limit')
+            if efa_interfaces_limit > max_efa_per_instance:
+                raise ValueError(f'Requested EFA limit ({efa_interfaces_limit}) exceeds instance capacity ({max_efa_per_instance})')
+            if efa_interfaces > max_efa_per_instance:
+                raise ValueError(f'Requested EFA ({efa_interfaces}) exceeds instance capacity ({max_efa_per_instance})')
+        elif efa_interfaces is not None and efa_interfaces > max_efa_per_instance:
+            raise ValueError(f'Requested EFA ({efa_interfaces}) exceeds instance capacity ({max_efa_per_instance})')
+        elif efa_interfaces_limit is not None and efa_interfaces_limit > max_efa_per_instance:
+            raise ValueError(f'Requested EFA limit ({efa_interfaces_limit}) exceeds instance capacity ({max_efa_per_instance})')
+
+
 def _set_default_accelerators_val(instance_type: Optional[str], accelerators_request: Optional[int],
                                   accelerators_limit: Optional[int]) -> Tuple[Optional[int], Optional[int]]:
     type_of_accelerator, _max_accelerator_per_instance = _get_accelerator_type_and_count(instance_type)
     if type_of_accelerator is not None:
diff --git a/test/integration_tests/training/cli/test_gpu_quota_allocation.py b/test/integration_tests/training/cli/test_gpu_quota_allocation.py
index 506c387b..a2a5e912 100644
--- a/test/integration_tests/training/cli/test_gpu_quota_allocation.py
+++ b/test/integration_tests/training/cli/test_gpu_quota_allocation.py
@@ -53,8 +53,8 @@ def test_create_job_with_integer_quota_parameters(self, test_job_name):
         result = execute_command(describe_cmd)
         logger.info(f"describe result: {result}")
         assert result.returncode == 0
-        assert " Limits: {'cpu': '4', 'memory': '2Gi', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '3', 'memory': '1Gi', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'cpu': '4', 'memory': '2Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '3', 'memory': '1Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",
@@ -103,8 +103,8 @@ def test_create_job_with_float_quota_parameters(self, test_job_name):
         ]
         result = execute_command(describe_cmd)
         assert result.returncode == 0
-        assert " Limits: {'cpu': '4800m', 'memory': '2899102924800m', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '3600m', 'memory': '1Gi', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'cpu': '4800m', 'memory': '2899102924800m', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '3600m', 'memory': '1Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",
@@ -149,8 +149,8 @@ def test_create_job_with_only_accelerators_parameter(self, test_job_name):
         ]
         result = execute_command(describe_cmd)
         assert result.returncode == 0
-        assert " Limits: {'memory': '104Gi', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '29', 'memory': '104Gi', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'memory': '104Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '29', 'memory': '104Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",
@@ -196,8 +196,8 @@ def test_create_job_with_accelerators_memory_parameters(self, test_job_name):
         time.sleep(5)
 
         assert result.returncode == 0
-        assert " Limits: {'memory': '2899102924800m', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '29', 'memory': '2040109465600m', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'memory': '2899102924800m', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '29', 'memory': '2040109465600m', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",
diff --git a/test/unit_tests/cli/test_quota_allocation_util.py b/test/unit_tests/cli/test_quota_allocation_util.py
index 94245604..4a2cb79b 100644
--- a/test/unit_tests/cli/test_quota_allocation_util.py
+++ b/test/unit_tests/cli/test_quota_allocation_util.py
@@ -110,8 +110,8 @@ def test_get_resources_from_compute_quotas_gpu_instance_with_accelerators_ratio_
 
     def test_get_resources_from_compute_quotas_gpu_instance_with_accelerators_ratio_half(self):
         result = _get_resources_from_compute_quotas("ml.g6e.48xlarge", None, None, 4)
-        # ml.g5.xlarge has 8 GPU, 192 CPUs, 1536GiB memory
-        assert result == {"cpu": "96.0", "memory": "768.0Gi", "nvidia.com/gpu": 4}
+        # ml.g6e.48xlarge has 8 GPU, 192 CPUs, 1536GiB memory, 4 EFA
+        assert result == {"cpu": "96.0", "memory": "768.0Gi", "nvidia.com/gpu": 4, "vpc.amazonaws.com/efa": 4}
 
     def test_get_resources_from_compute_quotas_gpu_instance_all_params(self):
         result = _get_resources_from_compute_quotas("ml.g5.xlarge", 2.0, 8.0, 1)
@@ -119,9 +119,9 @@ def test_get_resources_from_compute_quotas_gpu_instance_all_params(self):
 
     def test_get_resources_from_compute_quotas_trainium_instance(self):
         result = _get_resources_from_compute_quotas("ml.trn1.32xlarge", None, None, 8)
-        # ml.trn1.32xlarge has 16 trainium, 128 CPUs, 512GB memory
+        # ml.trn1.32xlarge has 16 trainium, 128 CPUs, 512GiB memory, 8 EFA
         # 8 trainium is half, so we should get half of CPU and memory
-        assert result == {"cpu": "64.0", "memory": "256.0Gi", "aws.amazon.com/neurondevice": 8}
+        assert result == {"cpu": "64.0", "memory": "256.0Gi", "aws.amazon.com/neurondevice": 8, "vpc.amazonaws.com/efa": 8}
 
     def test_get_resources_from_compute_quotas_cpu_only_instance(self):
         result = _get_resources_from_compute_quotas("ml.c5.large", 1.0, 2.0, 1)
@@ -142,14 +142,15 @@ def test_get_resources_from_compute_quotas_accelerators_and_cpu_only(self):
     @pytest.mark.parametrize(
         "instance_type,node_count,expected",
         [
-            # GPU instances
-            ("ml.p4d.24xlarge", 1, {"cpu": "96", "memory": "1152Gi", "nvidia.com/gpu": 8}),
-            ("ml.p4d.24xlarge", 2, {"cpu": "192", "memory": "2304Gi", "nvidia.com/gpu": 16}),
+            # GPU instances with EFA support
+            ("ml.p4d.24xlarge", 1, {"cpu": "96", "memory": "1152Gi", "nvidia.com/gpu": 8, "vpc.amazonaws.com/efa": 4}),
+            ("ml.p4d.24xlarge", 2, {"cpu": "192", "memory": "2304Gi", "nvidia.com/gpu": 16, "vpc.amazonaws.com/efa": 4}),
+            # GPU instances without EFA support
             ("ml.g5.xlarge", 1, {"cpu": "4", "memory": "16Gi", "nvidia.com/gpu": 1}),
             ("ml.g5.xlarge", 3, {"cpu": "12", "memory": "48Gi", "nvidia.com/gpu": 3}),
             # Trainium instances
-            ("ml.trn1.32xlarge", 1, {"cpu": "128", "memory": "512Gi", "aws.amazon.com/neurondevice": 16}),
-            ("ml.trn1.32xlarge", 2, {"cpu": "256", "memory": "1024Gi", "aws.amazon.com/neurondevice": 32}),
+            ("ml.trn1.32xlarge", 1, {"cpu": "128", "memory": "512Gi", "aws.amazon.com/neurondevice": 16, "vpc.amazonaws.com/efa": 8}),
+            ("ml.trn1.32xlarge", 2, {"cpu": "256", "memory": "1024Gi", "aws.amazon.com/neurondevice": 32, "vpc.amazonaws.com/efa": 8}),
             # CPU-only instances
             ("ml.c5.large", 1, {"cpu": "2", "memory": "4Gi"}),
             ("ml.c5.large", 5, {"cpu": "10", "memory": "20Gi"}),
diff --git a/test/unit_tests/training/test_pytorch_job_template_model.py b/test/unit_tests/training/test_pytorch_job_template_model.py
index 043d2024..4ee462bb 100644
--- a/test/unit_tests/training/test_pytorch_job_template_model.py
+++ b/test/unit_tests/training/test_pytorch_job_template_model.py
@@ -45,22 +45,71 @@ class TestPyTorchJobConfigEFA(unittest.TestCase):
     #     # Should also have GPU resources
     #     self.assertEqual(container.resources.requests["nvidia.com/gpu"], "8")
 
-    def test_no_node_count_no_efa(self):
-        """Test that jobs without node_count don't get EFA resources"""
+    def test_instance_without_efa_support_no_efa(self):
+        """Test that instances without EFA support don't get EFA (ml.g5.xlarge doesn't support EFA)"""
+        from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
+
         config = PyTorchJobConfig(
-            job_name="test-no-node-count",
+            job_name="test-no-efa-support",
             image="pytorch:latest",
             accelerators=1,
             instance_type="ml.g5.xlarge"
         )
-
+
         job = config.to_domain()
-        container = job.replicaSpecs[0].template.spec.containers[0]
-
-        # Should not have EFA resources
+        # Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
+        job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
+        container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
+
+        # Should not have EFA resources (instance doesn't support it)
         self.assertNotIn("vpc.amazonaws.com/efa", container.resources.requests)
         self.assertNotIn("vpc.amazonaws.com/efa", container.resources.limits)
+        # Should have GPU resources
+        self.assertIn("nvidia.com/gpu", container.resources.requests)
+
+    def test_accelerators_with_efa_support_gets_default_efa(self):
+        """Test that specifying accelerators on EFA-capable instance gets EFA from constants"""
+        from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
+
+        config = PyTorchJobConfig(
+            job_name="test-accelerators-default-efa",
+            image="pytorch:latest",
+            accelerators=4,
+            instance_type="ml.p4d.24xlarge"
+        )
+
+        job = config.to_domain()
+        # Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
+        job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
+        container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
+
+        # Should have EFA from constants
+        self.assertIn("vpc.amazonaws.com/efa", container.resources.requests)
+        self.assertIn("vpc.amazonaws.com/efa", container.resources.limits)
+        self.assertEqual(int(container.resources.requests["vpc.amazonaws.com/efa"]), 4)
+
+    def test_user_specified_efa_overrides_default(self):
+        """Test that user-specified EFA value overrides the default from constants"""
+        from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
+
+        config = PyTorchJobConfig(
+            job_name="test-custom-efa",
+            image="pytorch:latest",
+            accelerators=4,
+            efa_interfaces=2,
+            instance_type="ml.p4d.24xlarge"
+        )
+
+        job = config.to_domain()
+        # Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
+        job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
+        container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
+
+        # Should use user-specified EFA value
+        self.assertEqual(int(container.resources.requests["vpc.amazonaws.com/efa"]), 2)
+        self.assertEqual(int(container.resources.limits["vpc.amazonaws.com/efa"]), 2)
+
     # def test_multi_node_with_memory_and_cpu(self):
     #     """Test EFA with other resource types"""
    #     config = PyTorchJobConfig(
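
Taken together, the changes flow like this: `efa_interfaces` on `PyTorchJobConfig` becomes a `vpc.amazonaws.com/efa` entry in the pod's resource requests and limits, with per-instance defaults and caps coming from `INSTANCE_RESOURCES`. A minimal sketch of the end-to-end usage, mirroring the new unit tests above (the `PyTorchJobConfig` import path is assumed from the template package layout; values are illustrative):

```python
# Sketch only: the PyTorchJobConfig import path is assumed from the
# hyperpod-pytorch-job-template package layout shown in this diff.
from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig
from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob

# ml.p4d.24xlarge supports up to 4 EFA interfaces (see INSTANCE_RESOURCES);
# explicitly request 2 of them instead of taking the default.
config = PyTorchJobConfig(
    job_name="efa-demo",
    image="pytorch:latest",
    accelerators=4,
    efa_interfaces=2,
    instance_type="ml.p4d.24xlarge",
)

job = config.to_domain()
# Resolves generic resource keys into concrete ones such as
# "vpc.amazonaws.com/efa", applying defaults and _validate_efa_inputs.
job = HyperPodPytorchJob.allocate_quotas_if_applicable(job)

resources = job.replicaSpecs[0].template.spec.containers[0].resources
assert int(resources.requests["vpc.amazonaws.com/efa"]) == 2
assert int(resources.limits["vpc.amazonaws.com/efa"]) == 2
```

Per `_validate_efa_inputs`, requesting more interfaces than the instance supports (e.g. `efa_interfaces=5` here), or any EFA value on a non-EFA instance such as ml.g5.xlarge, raises a `ValueError`.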