aws · pintaoz-aws · Dec 18, 2025 · Dec 3, 2025 · Dec 3, 2025 · Dec 3, 2025
@@ -195,6 +195,16 @@ class PyTorchJobConfig(BaseModel):
         default=None,
         description="Limit for the amount of memory in GiB",
     )
+    efa_interfaces: Optional[int] = Field(
+        default=None,
+        description="Number of EFA interfaces for the instance",
+        ge=0
+    )
+    efa_interfaces_limit: Optional[int] = Field(
+        default=None,
+        description="Limit for the number of EFA interfaces",
+        ge=0
+    )
     accelerator_partition_type: Optional[str] = Field(
         default=None,
         description="Type of accelerator partition"
@@ -453,23 +463,27 @@ def build_dict(**kwargs):
                 requests_value = build_dict(
                     **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
                     vcpu=str(self.vcpu) if self.vcpu else None,
-                    memory=str(self.memory) if self.memory else None
+                    memory=str(self.memory) if self.memory else None,
+                    **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
                 )
                 limits_value = build_dict(
                     **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
                     vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
-                    memory=str(self.memory_limit) if self.memory_limit else None
+                    memory=str(self.memory_limit) if self.memory_limit else None,
+                    **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
                 )
             else:
                 requests_value = build_dict(
                     accelerators=str(self.accelerators) if self.accelerators else None,
                     vcpu=str(self.vcpu) if self.vcpu else None,
-                    memory=str(self.memory) if self.memory else None
+                    memory=str(self.memory) if self.memory else None,
+                    **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
                 )
                 limits_value = build_dict(
                     accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
                     vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
-                    memory=str(self.memory_limit) if self.memory_limit else None
+                    memory=str(self.memory_limit) if self.memory_limit else None,
+                    **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
                 )
 
         # Build container

@@ -305,6 +305,16 @@
       "minimum": 0,
       "description": "Limit for the amount of memory in GiB"
     },
+    "efa_interfaces": {
+      "type": "integer",
+      "minimum": 0,
+      "description": "Number of EFA interfaces for the instance"
+    },
+    "efa_interfaces_limit": {
+      "type": "integer",
+      "minimum": 0,
+      "description": "Limit for the number of EFA interfaces"
+    },
     "accelerator_partition_type": {
       "type": "string",
       "enum": [

@@ -97,14 +97,14 @@
 {%-             if memory %}
                   memory: {{ memory }}Gi
 {%-             endif %}
-{%-             if (node_count and node_count > 1) %}
-                  vpc.amazonaws.com/efa: 1
+{%-             if efa and efa > 0 %}
+                  vpc.amazonaws.com/efa: {{ efa }}
 {%-             endif %}
 {%-           else %}
                 requests:
                   nvidia.com/gpu: "0"
 {%-           endif %}
-{%-           if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit %}
+{%-           if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit or efa_limit %}
                 limits:
 {%-             if accelerator_partition_type and accelerator_partition_limit %}
                   nvidia.com/{{ accelerator_partition_type }}: {{ accelerator_partition_limit }}
@@ -117,8 +117,8 @@
 {%-             if memory_limit %}
                   memory: {{ memory_limit }}Gi
 {%-             endif %}
-{%-             if (node_count and node_count > 1) %}
-                  vpc.amazonaws.com/efa: 1
+{%-             if efa_limit and efa_limit > 0 %}
+                  vpc.amazonaws.com/efa: {{ efa_limit }}
 {%-             endif %}
 {%-           else %}
                 limits:

@@ -45,6 +45,7 @@
 SAGEMAKER_TRAINING_LAUNCHER_DIR = str(Path(__file__).parent.parent / "sagemaker_hyperpod_recipes")
 NVIDIA_GPU_RESOURCE_LIMIT_KEY = "nvidia.com/gpu"
 NEURON_RESOURCE_LIMIT_KEY = "aws.amazon.com/neurondevice"
+EFA_RESOURCE_LIMIT_KEY = "vpc.amazonaws.com/efa"
 AVAILABLE_ACCELERATOR_DEVICES_KEY = "AvailableAcceleratorDevices"
 TOTAL_ACCELERATOR_DEVICES_KEY = "TotalAcceleratorDevices"
 USER_NAME_LABEL_KEY = "sagemaker.user/created-by"