Skip to content

Commit 21b5e32

Browse files
committed
Require --instance-type when specifying accelerator resources (#317)
1 parent 4406d08 commit 21b5e32

3 files changed

Lines changed: 9 additions & 9 deletions

File tree

src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -149,6 +149,9 @@ def _process_replica_resources(cls, data):
149149
elif limits.get(NEURON_RESOURCE_KEY):
150150
accelerators_limit = int(limits.get(NEURON_RESOURCE_KEY))
151151

152+
if instance_type is None and (accelerators is not None or accelerators_limit is not None):
153+
raise ValueError("--instance-type is required when specifying accelerator resources")
154+
152155
acc_req, acc_lim = _set_default_accelerators_val(instance_type, accelerators, accelerators_limit)
153156
_validate_accelerators_inputs(instance_type, acc_req, acc_lim)
154157

src/sagemaker/hyperpod/training/quota_allocation_util.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -146,9 +146,6 @@ def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_l
146146
if accelerators_limit is not None:
147147
if type_of_accelerator is not None:
148148
result[type_of_accelerator] = accelerators_limit
149-
else:
150-
# user specified accelerator limit but the instance type wasn't found, set limit to 0 as a precaution
151-
result["nvidia.com/gpu"] = 0
152149
if accelerator_partition_limit is not None:
153150
result[f"nvidia.com/{accelerator_partition_type}"] = accelerator_partition_limit
154151
if memory_in_gib_limit is not None:

test/unit_tests/cli/test_quota_allocation_util.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -193,18 +193,18 @@ def test_get_limits_trainium_instance(self):
193193

194194
def test_get_limits_cpu_only_instance(self):
195195
result = _get_limits("ml.c5.large", 2.0, 8.0, 1, None, None)
196-
# CPU-only instance should set accelerator limit to 0 as precaution
197-
assert result == {"cpu": "2.0", "memory": "8.0Gi", "nvidia.com/gpu": 0}
196+
# CPU-only instance has no accelerator type, so accelerator limit is dropped
197+
assert result == {"cpu": "2.0", "memory": "8.0Gi"}
198198

199199
def test_get_limits_invalid_instance_type(self):
200200
result = _get_limits("invalid-instance", 4.0, 16.0, 2, None, None)
201-
# Invalid instance type should set accelerator limit to 0 as precaution
202-
assert result == {"cpu": "4.0", "memory": "16.0Gi", "nvidia.com/gpu": 0}
201+
# Invalid instance type has no accelerator type, so accelerator limit is dropped
202+
assert result == {"cpu": "4.0", "memory": "16.0Gi"}
203203

204204
def test_get_limits_cpu_instance_r7i(self):
205205
result = _get_limits("ml.r7i.48xlarge", 16.0, 64.0, 2, None, None)
206-
# CPU-only instance (ml.r7i.48xlarge) should set accelerator limit to 0 as precaution
207-
assert result == {"cpu": "16.0", "memory": "64.0Gi", "nvidia.com/gpu": 0}
206+
# CPU-only instance has no accelerator type, so accelerator limit is dropped
207+
assert result == {"cpu": "16.0", "memory": "64.0Gi"}
208208

209209
def test_is_valid_no_instance_type_with_resources(self):
210210
valid, message = _is_valid(4.0, 16.0, None, None, None, None)

0 commit comments

Comments
 (0)