Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
faeca24
Add GPU support to kubernetes_scale on EKS Karpenter
kiryl-filatau Dec 18, 2025
84b234b
Merge branch 'GoogleCloudPlatform:master' into master
vofish Jan 5, 2026
2f77176
Merge branch 'GoogleCloudPlatform:master' into master
vofish Jan 6, 2026
633ce80
Move EKS Karpenter GPU NodePool setup to Prepare()
kiryl-filatau Jan 7, 2026
71b9e47
Merge branch 'GoogleCloudPlatform:master' into master
vofish Jan 8, 2026
e6994b0
Merge branch 'GoogleCloudPlatform:master' into master
vofish Jan 9, 2026
b35fce0
Merge branch 'GoogleCloudPlatform:master' into master
vofish Jan 12, 2026
8708dbc
Merge branch 'GoogleCloudPlatform:master' into master
vofish Jan 13, 2026
1ae0f69
Merge branch 'GoogleCloudPlatform:master' into master
vofish Jan 14, 2026
583f0a1
Merge branch 'GoogleCloudPlatform:master' into master
vofish Jan 15, 2026
2b8985d
Refactor AWS EKS Karpenter GPU node selector application
kiryl-filatau Jan 15, 2026
7bed7ad
Merge branch 'GoogleCloudPlatform:master' into master
vofish Jan 16, 2026
561e34a
Merge branch 'GoogleCloudPlatform:master' into master
vofish Jan 20, 2026
ca8e619
Fix GPU toleration to apply only to EKS Karpenter clusters
kiryl-filatau Jan 20, 2026
d2a20b7
resolve conflict
kiryl-filatau Jan 20, 2026
d037155
pyink adjustments
kiryl-filatau Jan 20, 2026
faed023
adjust the style comments
kiryl-filatau Jan 20, 2026
8c4e933
Merge branch 'GoogleCloudPlatform:master' into master
vofish Jan 21, 2026
4213f64
pytype adjustments
kiryl-filatau Jan 21, 2026
13cc20a
Merge branch 'master' into feature/kubernetes-scale-to-1-gpu
kiryl-filatau Jan 21, 2026
326e088
Add missing pytz to requirements.txt
kiryl-filatau Jan 21, 2026
4652938
Merge branch 'GoogleCloudPlatform:master' into master
kiryl-filatau Jan 21, 2026
5597d07
Merge branch 'master' into feature/kubernetes-scale-to-1-gpu
kiryl-filatau Jan 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion perfkitbenchmarker/container_service/kubernetes_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def _ModifyPodSpecPlacementYaml(
del name
node_selectors = self.GetNodeSelectors(machine_type)
if node_selectors:
pod_spec_yaml['nodeSelector'].update(node_selectors)
pod_spec_yaml.setdefault('nodeSelector', {}).update(node_selectors)

@property
def _ingress_manifest_path(self) -> str:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Karpenter NodePool used by the kubernetes_scale benchmark to provision GPU
# nodes on AWS EKS. Every value is Jinja2-templated with a default so the
# manifest can be rendered and applied without overriding any variable.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: {{ gpu_nodepool_name | default('gpu') }}
spec:
  disruption:
    # How long a node stays empty/underutilized before Karpenter may remove it.
    consolidateAfter: {{ gpu_consolidate_after | default('1m') }}
    consolidationPolicy: {{ gpu_consolidation_policy | default('WhenEmptyOrUnderutilized') }}
  limits:
    # Upper bound on total CPU across all nodes launched by this pool.
    cpu: {{ gpu_nodepool_cpu_limit | default(1000) }}
  template:
    metadata:
      labels:
        # Label PKB uses to identify nodes belonging to this pool.
        pkb_nodepool: {{ gpu_nodepool_label | default('gpu') }}
    spec:
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: {{ karpenter_ec2nodeclass_name | default('default') }}
      # Scheduling requirements restricting which instances Karpenter launches.
      requirements:
        - key: kubernetes.io/arch
          operator: In
          values: {{ gpu_arch | default(['amd64']) }}
        - key: kubernetes.io/os
          operator: In
          values: {{ gpu_os | default(['linux']) }}
        - key: karpenter.sh/capacity-type
          operator: In
          values: {{ gpu_capacity_types | default(['on-demand']) }}
        # 'g' category / g6 & g6e families are NVIDIA GPU instance types.
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: {{ gpu_instance_categories | default(['g']) }}
        - key: karpenter.k8s.aws/instance-family
          operator: In
          values: {{ gpu_instance_families | default(['g6','g6e']) }}
      # Taint the GPU nodes so only pods that tolerate the GPU taint land here.
      taints:
        - key: {{ gpu_taint_key | default('nvidia.com/gpu') }}
          effect: NoSchedule
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@ spec:
command: {{ Command }}
{%- endif %}
resources:
requests:
cpu: {{ CpuRequest }}
memory: {{ MemoryRequest }}
ephemeral-storage: {{ EphemeralStorageRequest }}
{%- if NvidiaGpuRequest %}
nvidia.com/gpu: {{ NvidiaGpuRequest }}
{%- endif %}
limits:
cpu: {{ CpuRequest }}
memory: {{ MemoryRequest }}
Expand Down Expand Up @@ -53,3 +60,8 @@ spec:
operator: "Exists"
effect: "NoExecute"
tolerationSeconds: {{ PodTimeout }}
{%- if GpuTaintKey %}
- key: {{ GpuTaintKey }}
operator: Exists
effect: NoSchedule
{%- endif %}
59 changes: 54 additions & 5 deletions perfkitbenchmarker/linux_benchmarks/kubernetes_scale_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,16 +89,48 @@ def GetConfig(user_config):
return config


def _IsEksKarpenterAwsGpu(cluster: container_service.KubernetesCluster) -> bool:
  """Returns True iff GPUs are requested on an AWS Karpenter-type cluster.

  Args:
    cluster: The Kubernetes cluster under test.

  Returns:
    True only when a GPU count is set, the cloud flag is AWS, and the
    cluster declares CLUSTER_TYPE == 'Karpenter'.
  """
  if not virtual_machine.GPU_COUNT.value:
    return False
  if FLAGS.cloud.lower() != 'aws':
    return False
  # CLUSTER_TYPE may be absent on non-Karpenter cluster classes; treat a
  # missing attribute the same as a non-Karpenter cluster.
  return getattr(cluster, 'CLUSTER_TYPE', None) == 'Karpenter'


def _EnsureEksKarpenterGpuNodepool(
    cluster: container_service.KubernetesCluster,
) -> None:
  """Ensures a GPU NodePool exists for EKS Karpenter before applying workloads.

  No-op unless the run targets GPUs on an AWS EKS Karpenter cluster.

  Args:
    cluster: The Kubernetes cluster to apply the GPU NodePool manifest to.
  """
  if not _IsEksKarpenterAwsGpu(cluster):
    return
  # Template variables for aws-gpu-nodepool.yaml.j2; these mirror the
  # manifest's own defaults and are spelled out here for visibility.
  nodepool_params = {
      'gpu_nodepool_name': 'gpu',
      'gpu_nodepool_label': 'gpu',
      'karpenter_ec2nodeclass_name': 'default',
      'gpu_instance_categories': ['g'],
      'gpu_instance_families': ['g6', 'g6e'],
      'gpu_capacity_types': ['on-demand'],
      'gpu_arch': ['amd64'],
      'gpu_os': ['linux'],
      'gpu_taint_key': 'nvidia.com/gpu',
      'gpu_consolidate_after': '1m',
      'gpu_consolidation_policy': 'WhenEmptyOrUnderutilized',
      'gpu_nodepool_cpu_limit': 1000,
  }
  cluster.ApplyManifest(
      'container/kubernetes_scale/aws-gpu-nodepool.yaml.j2',
      **nodepool_params,
  )


def Prepare(bm_spec: benchmark_spec.BenchmarkSpec):
  """Sets additional spec attributes.

  Marks the spec for guaranteed cleanup and, when running a GPU workload on
  an AWS EKS Karpenter cluster, pre-creates the GPU NodePool so it is ready
  before Run() scales up pods.

  Args:
    bm_spec: The benchmark spec; must have a container cluster attached.
  """
  bm_spec.always_call_cleanup = True
  cluster = bm_spec.container_cluster
  assert cluster
  _EnsureEksKarpenterGpuNodepool(cluster)


def _GetRolloutCreationTime(rollout_name: str) -> int:
"""Returns the time when the rollout was created."""
out, _, _ = container_service.RunRetryableKubectlCommand([
'rollout',
'history',
'get',
rollout_name,
'-o',
'jsonpath={.metadata.creationTimestamp}',
Expand All @@ -122,6 +154,7 @@ def Run(bm_spec: benchmark_spec.BenchmarkSpec) -> list[sample.Sample]:
assert bm_spec.container_cluster
cluster = bm_spec.container_cluster
assert isinstance(cluster, container_service.KubernetesCluster)
cluster: container_service.KubernetesCluster = cluster

# Warm up the cluster by creating a single pod. This compensates for
# differences between Standard & Autopilot, where Standard already has 1 node
Expand Down Expand Up @@ -180,8 +213,10 @@ def ScaleUpPods(
max_wait_time = _GetScaleTimeout()
resource_timeout = max_wait_time + 60 * 5 # 5 minutes after waiting to avoid
# pod delete events from polluting data collection.
yaml_docs = cluster.ConvertManifestToYamlDicts(
MANIFEST_TEMPLATE,

is_eks_karpenter_aws_gpu = _IsEksKarpenterAwsGpu(cluster)

manifest_kwargs = dict(
Name='kubernetes-scaleup',
Replicas=num_new_pods,
CpuRequest=CPUS_PER_POD.value,
Expand All @@ -192,12 +227,26 @@ def ScaleUpPods(
EphemeralStorageRequest='10Mi',
RolloutTimeout=max_wait_time,
PodTimeout=resource_timeout,
Cloud=FLAGS.cloud.lower(),
GpuTaintKey=None,
)

# GpuTaintKey is still needed for tolerations in the yaml template
if is_eks_karpenter_aws_gpu:
manifest_kwargs['GpuTaintKey'] = 'nvidia.com/gpu'

yaml_docs = cluster.ConvertManifestToYamlDicts(
MANIFEST_TEMPLATE,
**manifest_kwargs,
)

# Use ModifyPodSpecPlacementYaml to add nodeSelectors via GetNodeSelectors()
cluster.ModifyPodSpecPlacementYaml(
yaml_docs,
'kubernetes-scaleup',
cluster.default_nodepool.machine_type,
)

resource_names = cluster.ApplyYaml(yaml_docs)

assert resource_names
Expand Down Expand Up @@ -390,7 +439,7 @@ def GetStatusConditionsForResourceType(
def ConvertToEpochTime(timestamp: str) -> int:
  """Converts an ISO-8601 timestamp string to integer epoch seconds.

  Example input: 2024-11-08T23:44:36Z

  Args:
    timestamp: The timestamp string to convert.

  Returns:
    Whole seconds since the Unix epoch (fractional seconds truncated).
  """
  parsed = parser.parse(timestamp)
  return int(parsed.timestamp())


def ParseStatusChanges(
Expand Down
22 changes: 12 additions & 10 deletions perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,9 +313,7 @@ def _ingress_manifest_path(self) -> str:
"""The path to the ingress manifest template file."""
return 'container/ingress.yaml.j2'

def _WaitForIngress(
self, name: str, namespace: str, port: int
) -> str:
def _WaitForIngress(self, name: str, namespace: str, port: int) -> str:
"""Waits for an Ingress resource to be deployed to the cluster."""
del port
self.WaitForResource(
Expand Down Expand Up @@ -706,9 +704,7 @@ def _Create(self):
}],
},
'iamIdentityMappings': [{
'arn': (
f'arn:aws:iam::{self.account}:role/KarpenterNodeRole-{self.name}'
),
'arn': f'arn:aws:iam::{self.account}:role/KarpenterNodeRole-{self.name}',
'username': 'system:node:{{EC2PrivateDNSName}}',
'groups': ['system:bootstrappers', 'system:nodes'],
}],
Expand Down Expand Up @@ -1283,10 +1279,16 @@ def ResizeNodePool(

def GetNodeSelectors(self, machine_type: str | None = None) -> dict[str, str]:
  """Gets the node selectors section of a yaml for the provider.

  GPU runs are pinned to the 'gpu' Karpenter NodePool; otherwise pods are
  pinned to the requested instance family when one can be derived from
  machine_type.

  Args:
    machine_type: Optional machine type used to derive an instance family.

  Returns:
    A dict of node-selector labels; empty when nothing can be derived.
  """
  # NOTE(review): this gates on GPU_TYPE while the scale benchmark gates its
  # GPU NodePool creation on GPU_COUNT — confirm the two flags are always set
  # together, or the selector and the NodePool could get out of sync.
  if virtual_machine.GPU_TYPE.value:
    # If GPU is requested, use the GPU nodepool.
    return {'karpenter.sh/nodepool': 'gpu'}
  machine_family = util.GetMachineFamily(machine_type)
  if not machine_family:
    return {}
  # Otherwise, use instance-family selector derived from machine_type.
  return {'karpenter.k8s.aws/instance-family': machine_family}

def GetNodePoolNames(self) -> list[str]:
"""Gets node pool names for the cluster.
Expand Down