Skip to content

Commit 4a3a679

Browse files
committed
Make --num-slices optional when using reservation
1 parent 78ae6c9 commit 4a3a679

10 files changed

Lines changed: 239 additions & 21 deletions

recipes/Cluster_create_RayCluster.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ $ xpk cluster create-ray --project=golden-project --zone=us-central1-a --cluster
1212
[XPK] Working on golden-project and us-central1-a
1313
[XPK] Task: `Get reservation golden-reservation` is implemented by the following command not running since it is a dry run.
1414
gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="json(specificReservation,aggregateReservation,status,deploymentType,resourcePolicies)"
15+
[XPK] Assessing reservation capacity to determine number of slices...
1516
[XPK] Task: `Determine server supported GKE versions for default gke version` is implemented by the following command not running since it is a dry run.
1617
gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
1718
[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.

recipes/Cluster_create_private.md

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,10 +10,11 @@ $ xpk cluster create-pathways --project=golden-project --zone=us-central1-a --cl
1010
[XPK] Starting xpk v0.0.0
1111
[XPK] Starting cluster create for cluster golden-cluster-private:
1212
[XPK] Working on golden-project and us-central1-a
13-
[XPK] Task: `Retrieve available pathways machine types` is implemented by the following command not running since it is a dry run.
14-
gcloud compute machine-types list --filter "guestCpus >= 49 AND memoryMb >= 238592 AND zone = 'us-central1-a'" --format="value(name)" --project=golden-project
1513
[XPK] Task: `Get reservation golden-reservation` is implemented by the following command not running since it is a dry run.
1614
gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="json(specificReservation,aggregateReservation,status,deploymentType,resourcePolicies)"
15+
[XPK] Assessing reservation capacity to determine number of slices...
16+
[XPK] Task: `Retrieve available pathways machine types` is implemented by the following command not running since it is a dry run.
17+
gcloud compute machine-types list --filter "guestCpus >= 49 AND memoryMb >= 238592 AND zone = 'us-central1-a'" --format="value(name)" --project=golden-project
1718
[XPK] Task: `Determine server supported GKE versions for default gke version` is implemented by the following command not running since it is a dry run.
1819
gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
1920
[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.

recipes/Cluster_create_sub-slicing.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ $ SUB_SLICING_ENABLED=true xpk cluster create --project=golden-project --zone=us
1212
[XPK] Working on golden-project and us-central1-a
1313
[XPK] Task: `Get reservation golden-reservation` is implemented by the following command not running since it is a dry run.
1414
gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="json(specificReservation,aggregateReservation,status,deploymentType,resourcePolicies)"
15+
[XPK] Assessing reservation capacity to determine number of slices...
1516
[XPK] Task: `Determine server supported GKE versions for default gke version` is implemented by the following command not running since it is a dry run.
1617
gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
1718
[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.

recipes/Cluster_create_super-slicing.md

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,9 @@ $ DRY_RUN_RESERVATION_SUB_BLOCKS='[{"name": "sub0", "count": 16, "inUseCount": 0
1212
[XPK] Working on golden-project and us-central1-a
1313
[XPK] Task: `Get reservation golden-reservation` is implemented by the following command not running since it is a dry run.
1414
gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="json(specificReservation,aggregateReservation,status,deploymentType,resourcePolicies)"
15+
[XPK] Assessing reservation capacity to determine number of slices...
16+
[XPK] Task: `Count healthy fitting sub-blocks in block` is implemented by the following command not running since it is a dry run.
17+
gcloud beta compute reservations sub-blocks list golden-reservation --block-name=block --project=golden-project --zone=us-central1-a --filter="healthInfo.healthStatus=HEALTHY" --format="json(name,count,inUseCount)"
1518
[XPK] Task: `Determine server supported GKE versions for default gke version` is implemented by the following command not running since it is a dry run.
1619
gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
1720
[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.
@@ -59,8 +62,6 @@ kubectl get configmap golden-cluster-resources-configmap -o=custom-columns="Conf
5962
[XPK] Existing node pool names ['0']
6063
[XPK] Task: `Retrieve resource policy` is implemented by the following command not running since it is a dry run.
6164
gcloud beta compute resource-policies describe tpu7x-128-4x4x4-ss-placement-policy --project=golden-project --region=us-central1
62-
[XPK] Task: `Count healthy fitting sub-blocks in block` is implemented by the following command not running since it is a dry run.
63-
gcloud beta compute reservations sub-blocks list golden-reservation --block-name=block --project=golden-project --zone=us-central1-a --filter="healthInfo.healthStatus=HEALTHY" --format="json(name,count,inUseCount)"
6465
[XPK] To complete NodepoolCreate-golden-cluster-np-0 we are executing gcloud beta container node-pools create golden-cluster-np-0 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation/reservationBlocks/block/reservationSubBlocks/sub0 --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
6566
[XPK] To complete NodepoolCreate-golden-cluster-np-1 we are executing gcloud beta container node-pools create golden-cluster-np-1 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation/reservationBlocks/block/reservationSubBlocks/sub1 --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15
6667
[XPK] To complete NodepoolCreate-golden-cluster-np-2 we are executing gcloud beta container node-pools create golden-cluster-np-2 --location=us-central1 --cluster=golden-cluster --project=golden-project --node-locations=us-central1-a --machine-type=tpu7x-standard-4t --host-maintenance-interval=AS_NEEDED --reservation-affinity=specific --reservation=golden-reservation/reservationBlocks/block/reservationSubBlocks/sub3 --placement-policy=tpu7x-128-4x4x4-ss-placement-policy --enable-gvnic --node-version=0 --num-nodes=16 --scopes=storage-full,gke-default,"https://www.googleapis.com/auth/cloud-platform" --max-pods-per-node 15

recipes/Cluster_create_with_gb200-4.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ $ xpk cluster create --project=golden-project --zone=us-central1-a --cluster=gol
1212
[XPK] Working on golden-project and us-central1-a
1313
[XPK] Task: `Get reservation golden-reservation` is implemented by the following command not running since it is a dry run.
1414
gcloud beta compute reservations describe golden-reservation --project=golden-project --zone=us-central1-a --format="json(specificReservation,aggregateReservation,status,deploymentType,resourcePolicies)"
15+
[XPK] Assessing reservation capacity to determine number of slices...
1516
[XPK] Task: `Determine server supported GKE versions for default gke version` is implemented by the following command not running since it is a dry run.
1617
gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
1718
[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.

recipes/Cluster_create_with_shared_reservation.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ $ xpk cluster create --project=golden-project --zone=us-central1-a --cluster=gol
1212
[XPK] Working on golden-project and us-central1-a
1313
[XPK] Task: `Get reservation golden-reservation` is implemented by the following command not running since it is a dry run.
1414
gcloud beta compute reservations describe golden-reservation --project=reservation-project --zone=us-central1-a --format="json(specificReservation,aggregateReservation,status,deploymentType,resourcePolicies)"
15+
[XPK] Assessing reservation capacity to determine number of slices...
1516
[XPK] Task: `Determine server supported GKE versions for default gke version` is implemented by the following command not running since it is a dry run.
1617
gcloud container get-server-config --project=golden-project --region=us-central1 --flatten="channels" --filter="channels.channel=RAPID" --format="value(channels.defaultVersion)"
1718
[XPK] Task: `Determine server supported GKE versions for valid versions` is implemented by the following command not running since it is a dry run.

src/xpk/commands/cluster.py

Lines changed: 44 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -317,6 +317,7 @@ def _validate_num_slices_and_set_default(
317317
args.num_cubes = total_available
318318

319319
args.num_slices = args.num_slices or args.num_cubes or 1
320+
args.num_nodes = args.num_nodes if args.num_nodes is not None else 2
320321

321322

322323
def cluster_create(args) -> None:
@@ -345,22 +346,49 @@ def cluster_create(args) -> None:
345346

346347
available_capacity = None
347348
if capacity_type == CapacityType.RESERVATION and args.reservation:
348-
xpk_print('Assessing reservation capacity to determine number of slices...')
349-
reservations = get_reservations_list(args)
350-
vms_per_pool = (
351-
args.num_nodes
352-
if system.accelerator_type == AcceleratorType.GPU
353-
else system.vms_per_slice
354-
)
355-
available_capacity, return_code = assess_available_slices(
356-
reservations,
357-
force_sub_block_targeting=args.super_slicing,
358-
system=system,
359-
vms_per_slice=vms_per_pool,
360-
)
361-
if return_code != 0:
362-
xpk_print('Error assessing available slices.')
363-
xpk_exit(return_code)
349+
if FeatureFlags.RESERVATIONS_VALIDATION_ENABLED or (
350+
FeatureFlags.OPTIONAL_NUM_SLICES
351+
and args.num_slices is None
352+
and args.num_cubes is None
353+
):
354+
xpk_print(
355+
'Assessing reservation capacity to determine number of slices...'
356+
)
357+
reservations = get_reservations_list(args)
358+
359+
if (
360+
system.accelerator_type == AcceleratorType.GPU
361+
and args.num_nodes is None
362+
):
363+
temp_capacity, return_code = assess_available_slices(
364+
reservations,
365+
force_sub_block_targeting=args.super_slicing,
366+
system=system,
367+
vms_per_slice=1,
368+
)
369+
if return_code != 0:
370+
xpk_print('Error assessing available VMs for GPU reservation.')
371+
xpk_exit(return_code)
372+
373+
total_vms = sum(cap.available_slices for cap in temp_capacity)
374+
if total_vms > 0:
375+
xpk_print(f'Automatically setting --num-nodes to {total_vms}')
376+
args.num_nodes = total_vms
377+
378+
vms_per_pool = (
379+
args.num_nodes
380+
if system.accelerator_type == AcceleratorType.GPU
381+
else system.vms_per_slice
382+
)
383+
available_capacity, return_code = assess_available_slices(
384+
reservations,
385+
force_sub_block_targeting=args.super_slicing,
386+
system=system,
387+
vms_per_slice=vms_per_pool,
388+
)
389+
if return_code != 0:
390+
xpk_print('Error assessing available slices.')
391+
xpk_exit(return_code)
364392

365393
_validate_cluster_create_args(args, system, available_capacity)
366394
_log_cluster_create_telemetry(args)

src/xpk/commands/cluster_test.py

Lines changed: 182 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -831,3 +831,185 @@ def test_get_coredns_replica_count_upper_limit_is_15():
831831
default_pool_cpu_num_nodes=20,
832832
)
833833
assert _get_coredns_replica_count(args) == 15
834+
835+
836+
def test_validate_cluster_create_args_optional_num_nodes_gpu(
837+
mocks: _Mocks,
838+
):
839+
FeatureFlags.OPTIONAL_NUM_SLICES = True
840+
available_capacity = [
841+
ReservationCapacity(
842+
reservation=ReservationLink(project='p', zone='z', name='r'),
843+
available_slices=8,
844+
)
845+
]
846+
847+
args = construct_args(
848+
reservation='test-reservation',
849+
num_slices=None,
850+
num_cubes=None,
851+
num_nodes=None,
852+
)
853+
854+
_validate_cluster_create_args(args, GPU_TEST_SYSTEM, available_capacity)
855+
856+
assert args.num_nodes == 2
857+
assert args.num_slices == 8
858+
859+
860+
def test_validate_cluster_create_args_explicit_num_nodes_gpu(
861+
mocks: _Mocks,
862+
):
863+
FeatureFlags.OPTIONAL_NUM_SLICES = True
864+
available_capacity = [
865+
ReservationCapacity(
866+
reservation=ReservationLink(project='p', zone='z', name='r'),
867+
available_slices=4,
868+
)
869+
]
870+
871+
args = construct_args(
872+
reservation='test-reservation',
873+
num_slices=None,
874+
num_cubes=None,
875+
num_nodes=2,
876+
)
877+
878+
_validate_cluster_create_args(args, GPU_TEST_SYSTEM, available_capacity)
879+
880+
assert args.num_nodes == 2
881+
assert args.num_slices == 4
882+
883+
884+
def test_cluster_create_optional_num_nodes_gpu_without_explicit_arg(
885+
mocker,
886+
mocks: _Mocks,
887+
):
888+
FeatureFlags.OPTIONAL_NUM_SLICES = True
889+
args = construct_args(
890+
device_type='a3-highgpu-8g',
891+
reservation='test-reservation',
892+
num_slices=None,
893+
num_cubes=None,
894+
num_nodes=None,
895+
)
896+
mocker.patch(
897+
'xpk.commands.cluster.get_capacity_type',
898+
return_value=(CapacityType.RESERVATION, 0),
899+
)
900+
mocker.patch(
901+
'xpk.commands.cluster.get_reservations_list',
902+
return_value=['test-reservation'],
903+
)
904+
905+
def mock_assess_available_slices(*args, **kwargs):
906+
vms_per_slice = kwargs.get('vms_per_slice', 1)
907+
available_slices = 12 // vms_per_slice
908+
return [
909+
ReservationCapacity(
910+
reservation=ReservationLink(project='p', zone='z', name='r'),
911+
available_slices=available_slices,
912+
)
913+
], 0
914+
915+
mocker.patch(
916+
'xpk.commands.cluster.assess_available_slices',
917+
side_effect=mock_assess_available_slices,
918+
)
919+
mocker.patch('xpk.commands.cluster._validate_cluster_create_args')
920+
mocker.patch('xpk.commands.cluster._log_cluster_create_telemetry')
921+
mocker.patch(
922+
'xpk.commands.cluster.get_gke_server_config',
923+
return_value=(0, 'gke-server-config'),
924+
)
925+
mocker.patch(
926+
'xpk.commands.cluster.get_gke_control_plane_version',
927+
return_value=(0, '1.2.3'),
928+
)
929+
mocker.patch('xpk.commands.cluster._install_kueue', return_value=0)
930+
mocker.patch('xpk.commands.cluster.create_cluster_configmaps', return_value=0)
931+
mocker.patch(
932+
'xpk.commands.cluster.run_gke_cluster_create_command', return_value=0
933+
)
934+
mocker.patch(
935+
'xpk.commands.cluster.get_system_characteristics',
936+
return_value=(GPU_TEST_SYSTEM, 0),
937+
)
938+
mocker.patch(
939+
'xpk.commands.cluster.run_gke_node_pool_create_command', return_value=0
940+
)
941+
mocker.patch('xpk.commands.cluster.xpk_exit')
942+
943+
try:
944+
cluster_create(args)
945+
except SystemExit:
946+
pass
947+
948+
assert args.num_nodes == 12
949+
950+
951+
def test_cluster_create_optional_num_nodes_gpu_with_explicit_arg(
952+
mocker,
953+
mocks: _Mocks,
954+
):
955+
FeatureFlags.OPTIONAL_NUM_SLICES = True
956+
args = construct_args(
957+
device_type='a3-highgpu-8g',
958+
reservation='test-reservation',
959+
num_slices=None,
960+
num_cubes=None,
961+
num_nodes=3,
962+
)
963+
mocker.patch(
964+
'xpk.commands.cluster.get_capacity_type',
965+
return_value=(CapacityType.RESERVATION, 0),
966+
)
967+
mocker.patch(
968+
'xpk.commands.cluster.get_reservations_list',
969+
return_value=['test-reservation'],
970+
)
971+
972+
def mock_assess_available_slices(*args, **kwargs):
973+
vms_per_slice = kwargs.get('vms_per_slice', 1)
974+
available_slices = 12 // vms_per_slice
975+
return [
976+
ReservationCapacity(
977+
reservation=ReservationLink(project='p', zone='z', name='r'),
978+
available_slices=available_slices,
979+
)
980+
], 0
981+
982+
mocker.patch(
983+
'xpk.commands.cluster.assess_available_slices',
984+
side_effect=mock_assess_available_slices,
985+
)
986+
mocker.patch('xpk.commands.cluster._validate_cluster_create_args')
987+
mocker.patch('xpk.commands.cluster._log_cluster_create_telemetry')
988+
mocker.patch(
989+
'xpk.commands.cluster.get_gke_server_config',
990+
return_value=(0, 'gke-server-config'),
991+
)
992+
mocker.patch(
993+
'xpk.commands.cluster.get_gke_control_plane_version',
994+
return_value=(0, '1.2.3'),
995+
)
996+
mocker.patch('xpk.commands.cluster._install_kueue', return_value=0)
997+
mocker.patch('xpk.commands.cluster.create_cluster_configmaps', return_value=0)
998+
mocker.patch(
999+
'xpk.commands.cluster.run_gke_cluster_create_command', return_value=0
1000+
)
1001+
mocker.patch(
1002+
'xpk.commands.cluster.get_system_characteristics',
1003+
return_value=(GPU_TEST_SYSTEM, 0),
1004+
)
1005+
mocker.patch(
1006+
'xpk.commands.cluster.run_gke_node_pool_create_command', return_value=0
1007+
)
1008+
mocker.patch('xpk.commands.cluster.xpk_exit')
1009+
1010+
try:
1011+
cluster_create(args)
1012+
except SystemExit:
1013+
pass
1014+
1015+
assert args.num_nodes == 3

src/xpk/parser/cluster.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -117,7 +117,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
117117
cluster_create_optional_arguments.add_argument(
118118
'--num-nodes',
119119
type=int,
120-
default=2,
120+
default=None,
121121
help='The number of nodes for a cluster, defaults to 2.',
122122
required=False,
123123
)

src/xpk/utils/feature_flags.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,8 @@ def _get_boolean_flag(flag: str, default: bool) -> bool:
3131

3232

3333
class _FeatureFlags:
34+
"""Stores feature flags for the application."""
35+
3436
SUB_SLICING_ENABLED = _get_boolean_flag("SUB_SLICING_ENABLED", default=False)
3537
TELEMETRY_ENABLED = _get_boolean_flag("TELEMETRY_ENABLED", default=True)
3638
CRANE_WORKLOADS_ENABLED = _get_boolean_flag(

0 commit comments

Comments
 (0)