Require --instance-type when specifying accelerator resources (#317)

FarhanTejani · FarhanTejani · commit 21b5e32da3af · 2026-03-18T17:10:45.000-07:00
diff --git a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py
@@ -149,6 +149,9 @@ def _process_replica_resources(cls, data):
             elif limits.get(NEURON_RESOURCE_KEY):
                 accelerators_limit = int(limits.get(NEURON_RESOURCE_KEY))
 
+            if instance_type is None and (accelerators is not None or accelerators_limit is not None):
+                raise ValueError("--instance-type is required when specifying accelerator resources")
+
             acc_req, acc_lim = _set_default_accelerators_val(instance_type, accelerators, accelerators_limit)
             _validate_accelerators_inputs(instance_type, acc_req, acc_lim)
 
diff --git a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py
@@ -146,9 +146,6 @@ def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_l
     if accelerators_limit is not None:
         if type_of_accelerator is not None:
             result[type_of_accelerator] = accelerators_limit
-        else: 
-            # user specified accelerator limit but the instance type wasn't found, set limit to 0 as a precaution 
-            result["nvidia.com/gpu"] = 0
     if accelerator_partition_limit is not None:
         result[f"nvidia.com/{accelerator_partition_type}"] = accelerator_partition_limit
     if memory_in_gib_limit is not None:
diff --git a/test/unit_tests/cli/test_quota_allocation_util.py b/test/unit_tests/cli/test_quota_allocation_util.py
@@ -193,18 +193,18 @@ def test_get_limits_trainium_instance(self):
 
     def test_get_limits_cpu_only_instance(self):
         result = _get_limits("ml.c5.large", 2.0, 8.0, 1, None, None)
-        # CPU-only instance should set accelerator limit to 0 as precaution
-        assert result == {"cpu": "2.0", "memory": "8.0Gi", "nvidia.com/gpu": 0}
+        # CPU-only instance has no accelerator type, so accelerator limit is dropped
+        assert result == {"cpu": "2.0", "memory": "8.0Gi"}
 
     def test_get_limits_invalid_instance_type(self):
         result = _get_limits("invalid-instance", 4.0, 16.0, 2, None, None)
-        # Invalid instance type should set accelerator limit to 0 as precaution
-        assert result == {"cpu": "4.0", "memory": "16.0Gi", "nvidia.com/gpu": 0}
+        # Invalid instance type has no accelerator type, so accelerator limit is dropped
+        assert result == {"cpu": "4.0", "memory": "16.0Gi"}
 
     def test_get_limits_cpu_instance_r7i(self):
         result = _get_limits("ml.r7i.48xlarge", 16.0, 64.0, 2, None, None)
-        # CPU-only instance (ml.r7i.48xlarge) should set accelerator limit to 0 as precaution
-        assert result == {"cpu": "16.0", "memory": "64.0Gi", "nvidia.com/gpu": 0}
+        # CPU-only instance has no accelerator type, so accelerator limit is dropped
+        assert result == {"cpu": "16.0", "memory": "64.0Gi"}
 
     def test_is_valid_no_instance_type_with_resources(self):
         valid, message = _is_valid(4.0, 16.0, None, None, None, None)