Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,16 @@ class PyTorchJobConfig(BaseModel):
default=None,
description="Limit for the amount of memory in GiB",
)
efa_interfaces: Optional[int] = Field(
default=None,
description="Number of EFA interfaces for the instance",
ge=0
)
efa_interfaces_limit: Optional[int] = Field(
default=None,
description="Limit for the number of EFA interfaces",
ge=0
)
accelerator_partition_type: Optional[str] = Field(
default=None,
description="Type of accelerator partition"
Expand Down Expand Up @@ -453,23 +463,27 @@ def build_dict(**kwargs):
requests_value = build_dict(
**{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
vcpu=str(self.vcpu) if self.vcpu else None,
memory=str(self.memory) if self.memory else None
memory=str(self.memory) if self.memory else None,
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
)
limits_value = build_dict(
**{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
memory=str(self.memory_limit) if self.memory_limit else None
memory=str(self.memory_limit) if self.memory_limit else None,
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
)
else:
requests_value = build_dict(
accelerators=str(self.accelerators) if self.accelerators else None,
vcpu=str(self.vcpu) if self.vcpu else None,
memory=str(self.memory) if self.memory else None
memory=str(self.memory) if self.memory else None,
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
)
limits_value = build_dict(
accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
memory=str(self.memory_limit) if self.memory_limit else None
memory=str(self.memory_limit) if self.memory_limit else None,
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
)

# Build container
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,16 @@
"minimum": 0,
"description": "Limit for the amount of memory in GiB"
},
"efa_interfaces": {
"type": "integer",
"minimum": 0,
"description": "Number of EFA interfaces for the instance"
},
"efa_interfaces_limit": {
"type": "integer",
"minimum": 0,
"description": "Limit for the number of EFA interfaces"
},
"accelerator_partition_type": {
"type": "string",
"enum": [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,14 @@
{%- if memory %}
memory: {{ memory }}Gi
{%- endif %}
{%- if (node_count and node_count > 1) %}
vpc.amazonaws.com/efa: 1
{%- if efa and efa > 0 %}
vpc.amazonaws.com/efa: {{ efa }}
{%- endif %}
{%- else %}
requests:
nvidia.com/gpu: "0"
{%- endif %}
{%- if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit %}
{%- if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit or efa_limit%}
limits:
{%- if accelerator_partition_type and accelerator_partition_limit %}
nvidia.com/{{ accelerator_partition_type }}: {{ accelerator_partition_limit }}
Expand All @@ -117,8 +117,8 @@
{%- if memory_limit %}
memory: {{ memory_limit }}Gi
{%- endif %}
{%- if (node_count and node_count > 1) %}
vpc.amazonaws.com/efa: 1
{%- if efa_limit and efa_limit > 0 %}
vpc.amazonaws.com/efa: {{ efa_limit }}
{%- endif %}
{%- else %}
limits:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
SAGEMAKER_TRAINING_LAUNCHER_DIR = str(Path(__file__).parent.parent / "sagemaker_hyperpod_recipes")
NVIDIA_GPU_RESOURCE_LIMIT_KEY = "nvidia.com/gpu"
NEURON_RESOURCE_LIMIT_KEY = "aws.amazon.com/neurondevice"
EFA_RESOURCE_LIMIT_KEY = "vpc.amazonaws.com/efa"
AVAILABLE_ACCELERATOR_DEVICES_KEY = "AvailableAcceleratorDevices"
TOTAL_ACCELERATOR_DEVICES_KEY = "TotalAcceleratorDevices"
USER_NAME_LABEL_KEY = "sagemaker.user/created-by"
Expand Down
Loading
Loading