Skip to content

Commit 3c6f4e8

Browse files
committed
improvement(helm): update GPU device plugin and add cert-manager issuers
1 parent 7640fdf commit 3c6f4e8

File tree

3 files changed

+182
-32
lines changed

3 files changed

+182
-32
lines changed
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
{{- if .Values.certManager.enabled }}
2+
{{- /*
3+
cert-manager Issuer Bootstrap Pattern
4+
5+
This implements the recommended pattern from cert-manager documentation:
6+
1. A self-signed ClusterIssuer (for bootstrapping the root CA only)
7+
2. A root CA Certificate (self-signed, used to sign other certificates)
8+
3. A CA ClusterIssuer (uses the root CA to sign certificates)
9+
10+
Reference: https://cert-manager.io/docs/configuration/selfsigned/
11+
*/ -}}
12+
13+
---
14+
# 1. Self-signed ClusterIssuer (bootstrap only).
# Its only consumer in this template is the root CA Certificate's issuerRef;
# application certificates should NOT reference this issuer directly.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  # Quoted so a number- or boolean-looking value still renders as a YAML string.
  name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  selfSigned: {}
26+
27+
---
28+
# 2. Root CA Certificate.
# Signed by the self-signed bootstrap issuer (see issuerRef below) and marked isCA.
# The secret named by spec.secretName is the same one the CA ClusterIssuer in this
# template reads (both use .Values.certManager.rootCA.secretName).
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: {{ .Values.certManager.rootCA.certificateName | quote }}
  # Falls back to "cert-manager" when no namespace is configured.
  namespace: {{ .Values.certManager.rootCA.namespace | default "cert-manager" | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  isCA: true
  # String scalars are quoted so user-supplied values cannot be re-typed by the YAML parser.
  commonName: {{ .Values.certManager.rootCA.commonName | quote }}
  secretName: {{ .Values.certManager.rootCA.secretName | quote }}
  duration: {{ .Values.certManager.rootCA.duration | default "87600h" | quote }}
  renewBefore: {{ .Values.certManager.rootCA.renewBefore | default "2160h" | quote }}
  privateKey:
    algorithm: {{ .Values.certManager.rootCA.privateKey.algorithm | default "RSA" | quote }}
    # Left unquoted on purpose: the key size is rendered as a number.
    size: {{ .Values.certManager.rootCA.privateKey.size | default 4096 }}
  subject:
    # An empty or unset organizations list falls back to a single entry: the release name.
    organizations:
      {{- toYaml (.Values.certManager.rootCA.subject.organizations | default (list .Release.Name)) | nindent 6 }}
  issuerRef:
    name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
    kind: ClusterIssuer
    group: cert-manager.io
59+
60+
---
61+
# 3. CA ClusterIssuer.
# This is the issuer applications should reference to obtain certificates.
# It signs with the key pair stored in the root CA secret
# (.Values.certManager.rootCA.secretName).
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  # Quoted so a number- or boolean-looking value still renders as a YAML string.
  name: {{ .Values.certManager.caIssuer.name | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  ca:
    secretName: {{ .Values.certManager.rootCA.secretName | quote }}
74+
{{- end }}

helm/sim/templates/gpu-device-plugin.yaml

Lines changed: 46 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,37 @@
11
{{- if and .Values.ollama.enabled .Values.ollama.gpu.enabled }}
22
---
3-
# NVIDIA Device Plugin DaemonSet for GPU support
3+
# 1. ConfigMap for NVIDIA Device Plugin Configuration
# Renders config.yaml from .Values.ollama.gpu.strategy:
#   "mig"          -> migStrategy "single", no sharing section
#   "time-slicing" -> migStrategy "none", plus time-sliced nvidia.com/gpu replicas
#   anything else  -> migStrategy "none", no sharing section
# NOTE(review): nvidiaDriverRoot is set to a /proc/driver/nvidia mount path. NVIDIA's
# documentation appears to describe this flag as the driver installation root
# (typically "/" or "/run/nvidia/driver") - confirm this value is intended.
{{- /* Normalise an unset strategy to "" so the eq comparisons below always compare strings. */}}
{{- $gpuStrategy := .Values.ollama.gpu.strategy | default "" }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: nvidia-device-plugin
data:
  config.yaml: |
    version: v1
    flags:
      {{- if eq $gpuStrategy "mig" }}
      migStrategy: "single"
      {{- else }}
      migStrategy: "none"
      {{- end }}
      failOnInitError: false
      nvidiaDriverRoot: /host-proc/driver/nvidia
      plugin:
        passDeviceSpecs: true
        deviceListStrategy: envvar
    {{- if eq $gpuStrategy "time-slicing" }}
    sharing:
      timeSlicing:
        resources:
          - name: nvidia.com/gpu
            {{- /* Fallback matches the chart default for ollama.gpu.timeSlicingReplicas in values.yaml. */}}
            replicas: {{ .Values.ollama.gpu.timeSlicingReplicas | default 5 }}
    {{- end }}
33+
---
34+
# 2. NVIDIA Device Plugin DaemonSet for GPU support
435
apiVersion: apps/v1
536
kind: DaemonSet
637
metadata:
@@ -35,9 +66,6 @@ spec:
3566
# Only schedule on nodes with NVIDIA GPUs
3667
accelerator: nvidia
3768
priorityClassName: system-node-critical
38-
runtimeClassName: nvidia
39-
hostNetwork: true
40-
hostPID: true
4169
volumes:
4270
- name: device-plugin
4371
hostPath:
@@ -51,19 +79,16 @@ spec:
5179
- name: proc-driver-nvidia
5280
hostPath:
5381
path: /proc/driver/nvidia
82+
# Volume to mount the ConfigMap
83+
- name: nvidia-device-plugin-config
84+
configMap:
85+
name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config
5486
containers:
5587
- name: nvidia-device-plugin
56-
image: nvcr.io/nvidia/k8s-device-plugin:v0.14.5
88+
image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2
5789
imagePullPolicy: Always
5890
args:
59-
- --mig-strategy=single
60-
- --pass-device-specs=true
61-
- --fail-on-init-error=false
62-
- --device-list-strategy=envvar
63-
- --nvidia-driver-root=/host-sys/fs/cgroup
64-
env:
65-
- name: NVIDIA_MIG_MONITOR_DEVICES
66-
value: all
91+
- "--config-file=/etc/device-plugin/config.yaml"
6792
securityContext:
6893
allowPrivilegeEscalation: false
6994
capabilities:
@@ -74,29 +99,23 @@ spec:
7499
- name: dev
75100
mountPath: /dev
76101
- name: sys
77-
mountPath: /host-sys
102+
mountPath: /sys
78103
readOnly: true
79104
- name: proc-driver-nvidia
80-
mountPath: /proc/driver/nvidia
105+
mountPath: /host-proc/driver/nvidia
106+
readOnly: true
107+
- name: nvidia-device-plugin-config
108+
mountPath: /etc/device-plugin/
81109
readOnly: true
82110
resources:
83111
requests:
84112
cpu: 50m
85-
memory: 10Mi
113+
memory: 20Mi
86114
limits:
87115
cpu: 50m
88-
memory: 20Mi
116+
memory: 50Mi
89117
{{- if .Values.nodeSelector }}
90118
nodeSelector:
91119
{{- toYaml .Values.nodeSelector | nindent 8 }}
92120
{{- end }}
93-
---
94-
# RuntimeClass for NVIDIA Container Runtime
95-
apiVersion: node.k8s.io/v1
96-
kind: RuntimeClass
97-
metadata:
98-
name: {{ include "sim.fullname" . }}-nvidia
99-
labels:
100-
{{- include "sim.labels" . | nindent 4 }}
101-
handler: nvidia
102-
{{- end }}
121+
{{- end }}

helm/sim/values.yaml

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -400,8 +400,10 @@ postgresql:
400400
algorithm: RSA # RSA or ECDSA
401401
size: 4096 # Key size in bits
402402
# Issuer reference (REQUIRED if tls.enabled is true)
403+
# By default, references the CA issuer created by certManager.caIssuer
404+
# Set certManager.enabled to true so that this issuer is created, or point issuerRef at your own Issuer/ClusterIssuer
403405
issuerRef:
404-
name: selfsigned-cluster-issuer # Name of your cert-manager Issuer/ClusterIssuer
406+
name: sim-ca-issuer # Name of your cert-manager Issuer/ClusterIssuer
405407
kind: ClusterIssuer # ClusterIssuer or Issuer
406408
group: "" # Optional: cert-manager.io (leave empty for default)
407409
# Additional DNS names (optional)
@@ -463,20 +465,26 @@ externalDatabase:
463465
ollama:
464466
# Enable/disable Ollama deployment
465467
enabled: false
466-
468+
467469
# Image configuration
468470
image:
469471
repository: ollama/ollama
470472
tag: latest
471473
pullPolicy: Always
472-
474+
473475
# Number of replicas
474476
replicaCount: 1
475-
477+
476478
# GPU configuration
477479
gpu:
478480
enabled: false
479481
count: 1
482+
# GPU sharing strategy: "mig" (Multi-Instance GPU) or "time-slicing"; any other value disables both
483+
# - mig: Hardware-level GPU partitioning (requires supported GPUs like A100)
484+
# - time-slicing: Software-level GPU sharing (works with most NVIDIA GPUs)
485+
strategy: "time-slicing"
486+
# Number of time-slicing replicas (only used when strategy is "time-slicing")
487+
timeSlicingReplicas: 5
480488

481489
# Node selector for GPU workloads (adjust labels based on your cluster configuration)
482490
nodeSelector:
@@ -1185,4 +1193,53 @@ externalSecrets:
11851193
# External database password (when using managed database services)
11861194
externalDatabase:
11871195
# Path to external database password in external store
1188-
password: ""
1196+
password: ""
1197+
1198+
# cert-manager issuer resources
# Prerequisite: cert-manager must already be installed in the cluster.
# See: https://cert-manager.io/docs/installation/
#
# The chart follows cert-manager's CA bootstrap pattern:
#   1. Self-signed ClusterIssuer - bootstrap only, signs the root CA
#   2. Root CA Certificate       - self-signed, becomes the trust anchor
#   3. CA ClusterIssuer          - signs application certificates with the root CA
#
# Reference: https://cert-manager.io/docs/configuration/selfsigned/
certManager:
  # Toggle creation of the issuer resources described above
  enabled: false

  # Bootstrap issuer: exists only to sign the root CA.
  # Application certificates should not reference it.
  selfSignedIssuer:
    name: 'sim-selfsigned-bootstrap-issuer'

  # Root CA Certificate, signed by the bootstrap issuer and used as the trust anchor
  rootCA:
    # Certificate resource name
    certificateName: 'sim-root-ca'
    # Namespace for the root CA Certificate and its secret.
    # Must match cert-manager's cluster-resource-namespace (default: cert-manager)
    namespace: 'cert-manager'
    # Common name placed on the root CA certificate
    commonName: 'sim-root-ca'
    # Secret that will hold the root CA certificate and key
    secretName: 'sim-root-ca-secret'
    # Validity period (87600h = 10 years)
    duration: '87600h'
    # How long before expiry to renew (2160h = 90 days)
    renewBefore: '2160h'
    # Private key parameters
    privateKey:
      algorithm: RSA
      size: 4096
    # Certificate subject
    subject:
      # Leave empty to default to the release name
      organizations: []

  # CA ClusterIssuer: the issuer applications should reference for their certificates
  caIssuer:
    name: 'sim-ca-issuer'

0 commit comments

Comments
 (0)