diff --git a/perfkitbenchmarker/container_service/kubernetes_cluster.py b/perfkitbenchmarker/container_service/kubernetes_cluster.py
index d3e3aa0b2..98b190c77 100644
--- a/perfkitbenchmarker/container_service/kubernetes_cluster.py
+++ b/perfkitbenchmarker/container_service/kubernetes_cluster.py
@@ -189,7 +189,7 @@ def _ModifyPodSpecPlacementYaml(
     del name
     node_selectors = self.GetNodeSelectors(machine_type)
     if node_selectors:
-      pod_spec_yaml['nodeSelector'].update(node_selectors)
+      pod_spec_yaml.setdefault('nodeSelector', {}).update(node_selectors)
 
   @property
   def _ingress_manifest_path(self) -> str:
diff --git a/perfkitbenchmarker/data/container/kubernetes_scale/aws-gpu-nodepool.yaml.j2 b/perfkitbenchmarker/data/container/kubernetes_scale/aws-gpu-nodepool.yaml.j2
new file mode 100644
index 000000000..9d6fce3d9
--- /dev/null
+++ b/perfkitbenchmarker/data/container/kubernetes_scale/aws-gpu-nodepool.yaml.j2
@@ -0,0 +1,38 @@
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: {{ gpu_nodepool_name | default('gpu') }}
+spec:
+  disruption:
+    consolidateAfter: {{ gpu_consolidate_after | default('1m') }}
+    consolidationPolicy: {{ gpu_consolidation_policy | default('WhenEmptyOrUnderutilized') }}
+  limits:
+    cpu: {{ gpu_nodepool_cpu_limit | default(1000) }}
+  template:
+    metadata:
+      labels:
+        pkb_nodepool: {{ gpu_nodepool_label | default('gpu') }}
+    spec:
+      nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
+        name: {{ karpenter_ec2nodeclass_name | default('default') }}
+      requirements:
+        - key: kubernetes.io/arch
+          operator: In
+          values: {{ gpu_arch | default(['amd64']) }}
+        - key: kubernetes.io/os
+          operator: In
+          values: {{ gpu_os | default(['linux']) }}
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values: {{ gpu_capacity_types | default(['on-demand']) }}
+        - key: karpenter.k8s.aws/instance-category
+          operator: In
+          values: {{ gpu_instance_categories | default(['g']) }}
+        - key: karpenter.k8s.aws/instance-family
+          operator: In
+          values: {{ gpu_instance_families | default(['g6','g6e']) }}
+      taints:
+        - key: {{ gpu_taint_key | default('nvidia.com/gpu') }}
+          effect: NoSchedule
diff --git a/perfkitbenchmarker/data/container/kubernetes_scale/kubernetes_scale.yaml.j2 b/perfkitbenchmarker/data/container/kubernetes_scale/kubernetes_scale.yaml.j2
index 9b79aa6cd..ef911f16d 100644
--- a/perfkitbenchmarker/data/container/kubernetes_scale/kubernetes_scale.yaml.j2
+++ b/perfkitbenchmarker/data/container/kubernetes_scale/kubernetes_scale.yaml.j2
@@ -20,6 +20,13 @@ spec:
         command: {{ Command }}
       {%- endif %}
         resources:
+          requests:
+            cpu: {{ CpuRequest }}
+            memory: {{ MemoryRequest }}
+            ephemeral-storage: {{ EphemeralStorageRequest }}
+          {%- if NvidiaGpuRequest %}
+            nvidia.com/gpu: {{ NvidiaGpuRequest }}
+          {%- endif %}
           limits:
             cpu: {{ CpuRequest }}
             memory: {{ MemoryRequest }}
@@ -53,3 +60,8 @@ spec:
         operator: "Exists"
         effect: "NoExecute"
         tolerationSeconds: {{ PodTimeout }}
+      {%- if GpuTaintKey %}
+      - key: {{ GpuTaintKey }}
+        operator: Exists
+        effect: NoSchedule
+      {%- endif %}
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes_scale_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes_scale_benchmark.py
index 8e8cb21f0..d0066651f 100644
--- a/perfkitbenchmarker/linux_benchmarks/kubernetes_scale_benchmark.py
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes_scale_benchmark.py
@@ -89,16 +89,48 @@ def GetConfig(user_config):
   return config
 
 
+def _IsEksKarpenterAwsGpu(cluster: container_service.KubernetesCluster) -> bool:
+  return bool(
+      virtual_machine.GPU_COUNT.value
+      and FLAGS.cloud.lower() == 'aws'
+      and getattr(cluster, 'CLUSTER_TYPE', None) == 'Karpenter'
+  )
+
+
+def _EnsureEksKarpenterGpuNodepool(
+    cluster: container_service.KubernetesCluster,
+) -> None:
+  """Ensures a GPU NodePool exists for EKS Karpenter before applying workloads."""
+  if not _IsEksKarpenterAwsGpu(cluster):
+    return
+  cluster.ApplyManifest(
+      'container/kubernetes_scale/aws-gpu-nodepool.yaml.j2',
+      gpu_nodepool_name='gpu',
+      gpu_nodepool_label='gpu',
+      karpenter_ec2nodeclass_name='default',
+      gpu_instance_categories=['g'],
+      gpu_instance_families=['g6', 'g6e'],
+      gpu_capacity_types=['on-demand'],
+      gpu_arch=['amd64'],
+      gpu_os=['linux'],
+      gpu_taint_key='nvidia.com/gpu',
+      gpu_consolidate_after='1m',
+      gpu_consolidation_policy='WhenEmptyOrUnderutilized',
+      gpu_nodepool_cpu_limit=1000,
+  )
+
+
 def Prepare(bm_spec: benchmark_spec.BenchmarkSpec):
   """Sets additional spec attributes."""
   bm_spec.always_call_cleanup = True
+  assert bm_spec.container_cluster
+  _EnsureEksKarpenterGpuNodepool(bm_spec.container_cluster)
 
 
 def _GetRolloutCreationTime(rollout_name: str) -> int:
   """Returns the time when the rollout was created."""
   out, _, _ = container_service.RunRetryableKubectlCommand([
-      'rollout',
-      'history',
+      'get',
       rollout_name,
       '-o',
       'jsonpath={.metadata.creationTimestamp}',
@@ -122,6 +154,7 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
   assert bm_spec.container_cluster
   cluster = bm_spec.container_cluster
   assert isinstance(cluster, container_service.KubernetesCluster)
+  cluster: container_service.KubernetesCluster = cluster
 
   # Warm up the cluster by creating a single pod. This compensates for
   # differences between Standard & Autopilot, where Standard already has 1 node
@@ -180,8 +213,10 @@ def ScaleUpPods(
   max_wait_time = _GetScaleTimeout()
   resource_timeout = max_wait_time + 60 * 5  # 5 minutes after waiting to avoid
   # pod delete events from polluting data collection.
-  yaml_docs = cluster.ConvertManifestToYamlDicts(
-      MANIFEST_TEMPLATE,
+
+  is_eks_karpenter_aws_gpu = _IsEksKarpenterAwsGpu(cluster)
+
+  manifest_kwargs = dict(
       Name='kubernetes-scaleup',
       Replicas=num_new_pods,
       CpuRequest=CPUS_PER_POD.value,
@@ -192,12 +227,26 @@
       EphemeralStorageRequest='10Mi',
       RolloutTimeout=max_wait_time,
       PodTimeout=resource_timeout,
+      Cloud=FLAGS.cloud.lower(),
+      GpuTaintKey=None,
+  )
+
+  # GpuTaintKey is still needed for the tolerations in the YAML template.
+  if is_eks_karpenter_aws_gpu:
+    manifest_kwargs['GpuTaintKey'] = 'nvidia.com/gpu'
+
+  yaml_docs = cluster.ConvertManifestToYamlDicts(
+      MANIFEST_TEMPLATE,
+      **manifest_kwargs,
   )
+
+  # Use ModifyPodSpecPlacementYaml to add nodeSelectors via GetNodeSelectors().
   cluster.ModifyPodSpecPlacementYaml(
       yaml_docs,
       'kubernetes-scaleup',
       cluster.default_nodepool.machine_type,
   )
+
   resource_names = cluster.ApplyYaml(yaml_docs)
   assert resource_names
 
@@ -390,7 +439,7 @@ def GetStatusConditionsForResourceType(
 def ConvertToEpochTime(timestamp: str) -> int:
   """Converts a timestamp to epoch time."""
   # Example: 2024-11-08T23:44:36Z
-  return parser.parse(timestamp).timestamp()
+  return int(parser.parse(timestamp).timestamp())
 
 
 def ParseStatusChanges(
diff --git a/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py b/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
index 11d2a5d3b..891d3f70b 100644
--- a/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
+++ b/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
@@ -313,9 +313,7 @@ def _ingress_manifest_path(self) -> str:
     """The path to the ingress manifest template file."""
     return 'container/ingress.yaml.j2'
 
-  def _WaitForIngress(
-      self, name: str, namespace: str, port: int
-  ) -> str:
+  def _WaitForIngress(self, name: str, namespace: str, port: int) -> str:
     """Waits for an Ingress resource to be deployed to the cluster."""
     del port
     self.WaitForResource(
@@ -706,9 +704,7 @@ def _Create(self):
             }],
         },
         'iamIdentityMappings': [{
-            'arn': (
-                f'arn:aws:iam::{self.account}:role/KarpenterNodeRole-{self.name}'
-            ),
+            'arn': f'arn:aws:iam::{self.account}:role/KarpenterNodeRole-{self.name}',
             'username': 'system:node:{{EC2PrivateDNSName}}',
             'groups': ['system:bootstrappers', 'system:nodes'],
         }],
@@ -1283,10 +1279,16 @@ def ResizeNodePool(
 
   def GetNodeSelectors(self, machine_type: str | None = None) -> dict[str, str]:
     """Gets the node selectors section of a yaml for the provider."""
-    machine_family = util.GetMachineFamily(machine_type)
-    if machine_family:
-      return {'karpenter.k8s.aws/instance-family': machine_family}
-    return {}
+    selectors = {}
+    # If a GPU is requested, schedule onto the dedicated GPU nodepool.
+    if virtual_machine.GPU_TYPE.value:
+      selectors['karpenter.sh/nodepool'] = 'gpu'
+    else:
+      # Otherwise, use an instance-family selector when a machine_type is
+      # specified.
+      machine_family = util.GetMachineFamily(machine_type)
+      if machine_family:
+        selectors['karpenter.k8s.aws/instance-family'] = machine_family
+    return selectors
 
   def GetNodePoolNames(self) -> list[str]:
     """Gets node pool names for the cluster.
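
For reviewers (not part of the patch): with the defaults that _EnsureEksKarpenterGpuNodepool passes, the new aws-gpu-nodepool.yaml.j2 template should render to roughly the YAML below. List-valued Jinja variables render as Python reprs (e.g. ['amd64']), which YAML accepts as flow sequences. This rendering is an illustrative sketch, not generated output.

apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: gpu
spec:
  disruption:
    consolidateAfter: 1m
    consolidationPolicy: WhenEmptyOrUnderutilized
  limits:
    cpu: 1000
  template:
    metadata:
      labels:
        pkb_nodepool: gpu
    spec:
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: default
      requirements:
        - key: kubernetes.io/arch
          operator: In
          values: ['amd64']
        - key: kubernetes.io/os
          operator: In
          values: ['linux']
        - key: karpenter.sh/capacity-type
          operator: In
          values: ['on-demand']
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ['g']
        - key: karpenter.k8s.aws/instance-family
          operator: In
          values: ['g6', 'g6e']
      taints:
        - key: nvidia.com/gpu
          effect: NoSchedule

Pods land on these nodes only when they both tolerate the nvidia.com/gpu NoSchedule taint (the new toleration block in kubernetes_scale.yaml.j2) and carry the karpenter.sh/nodepool: gpu node selector (added by GetNodeSelectors via ModifyPodSpecPlacementYaml); Karpenter labels each node it provisions with karpenter.sh/nodepool set to the owning NodePool's name.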