From 3c6f4e8184d696ca5c4291bc1f54957a132bf0cd Mon Sep 17 00:00:00 2001 From: waleed Date: Tue, 27 Jan 2026 17:43:39 -0800 Subject: [PATCH 1/4] improvement(helm): update GPU device plugin and add cert-manager issuers --- helm/sim/templates/cert-manager-issuers.yaml | 74 ++++++++++++++++++++ helm/sim/templates/gpu-device-plugin.yaml | 73 ++++++++++++------- helm/sim/values.yaml | 67 ++++++++++++++++-- 3 files changed, 182 insertions(+), 32 deletions(-) create mode 100644 helm/sim/templates/cert-manager-issuers.yaml diff --git a/helm/sim/templates/cert-manager-issuers.yaml b/helm/sim/templates/cert-manager-issuers.yaml new file mode 100644 index 0000000000..856a466028 --- /dev/null +++ b/helm/sim/templates/cert-manager-issuers.yaml @@ -0,0 +1,74 @@ +{{- if .Values.certManager.enabled }} +{{- /* + cert-manager Issuer Bootstrap Pattern + + This implements the recommended pattern from cert-manager documentation: + 1. A self-signed ClusterIssuer (for bootstrapping the root CA only) + 2. A root CA Certificate (self-signed, used to sign other certificates) + 3. A CA ClusterIssuer (uses the root CA to sign certificates) + + Reference: https://cert-manager.io/docs/configuration/selfsigned/ +*/ -}} + +--- +# 1. Self-Signed ClusterIssuer (Bootstrap Only) +# This issuer is used ONLY to create the root CA certificate. +# It should NOT be used directly for application certificates. +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: {{ .Values.certManager.selfSignedIssuer.name }} + labels: + {{- include "sim.labels" . | nindent 4 }} + app.kubernetes.io/component: cert-manager +spec: + selfSigned: {} + +--- +# 2. Root CA Certificate +# This certificate is signed by the self-signed issuer and becomes the root of trust. +# The secret created here will be used by the CA issuer to sign certificates. +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ .Values.certManager.rootCA.certificateName }} + namespace: {{ .Values.certManager.rootCA.namespace | default "cert-manager" }} + labels: + {{- include "sim.labels" . | nindent 4 }} + app.kubernetes.io/component: cert-manager +spec: + isCA: true + commonName: {{ .Values.certManager.rootCA.commonName }} + secretName: {{ .Values.certManager.rootCA.secretName }} + duration: {{ .Values.certManager.rootCA.duration | default "87600h" }} + renewBefore: {{ .Values.certManager.rootCA.renewBefore | default "2160h" }} + privateKey: + algorithm: {{ .Values.certManager.rootCA.privateKey.algorithm | default "RSA" }} + size: {{ .Values.certManager.rootCA.privateKey.size | default 4096 }} + subject: + organizations: + {{- if .Values.certManager.rootCA.subject.organizations }} + {{- toYaml .Values.certManager.rootCA.subject.organizations | nindent 6 }} + {{- else }} + - {{ .Release.Name }} + {{- end }} + issuerRef: + name: {{ .Values.certManager.selfSignedIssuer.name }} + kind: ClusterIssuer + group: cert-manager.io + +--- +# 3. CA ClusterIssuer +# This is the issuer that should be used by applications to obtain certificates. +# It signs certificates using the root CA created above. +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: {{ .Values.certManager.caIssuer.name }} + labels: + {{- include "sim.labels" . | nindent 4 }} + app.kubernetes.io/component: cert-manager +spec: + ca: + secretName: {{ .Values.certManager.rootCA.secretName }} +{{- end }} diff --git a/helm/sim/templates/gpu-device-plugin.yaml b/helm/sim/templates/gpu-device-plugin.yaml index df9a30b3d2..3b2f0b59dc 100644 --- a/helm/sim/templates/gpu-device-plugin.yaml +++ b/helm/sim/templates/gpu-device-plugin.yaml @@ -1,6 +1,37 @@ {{- if and .Values.ollama.enabled .Values.ollama.gpu.enabled }} --- -# NVIDIA Device Plugin DaemonSet for GPU support +# 1. ConfigMap for NVIDIA Device Plugin Configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config + namespace: {{ .Release.Namespace }} + labels: + {{- include "sim.labels" . | nindent 4 }} + app.kubernetes.io/component: nvidia-device-plugin +data: + config.yaml: | + version: v1 + flags: + {{- if eq .Values.ollama.gpu.strategy "mig" }} + migStrategy: "single" + {{- else }} + migStrategy: "none" + {{- end }} + failOnInitError: false + nvidiaDriverRoot: /host-proc/driver/nvidia + plugin: + passDeviceSpecs: true + deviceListStrategy: envvar + {{- if eq .Values.ollama.gpu.strategy "time-slicing" }} + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: {{ .Values.ollama.gpu.timeSlicingReplicas | default 10 }} + {{- end }} +--- +# 2. NVIDIA Device Plugin DaemonSet for GPU support apiVersion: apps/v1 kind: DaemonSet metadata: @@ -35,9 +66,6 @@ spec: # Only schedule on nodes with NVIDIA GPUs accelerator: nvidia priorityClassName: system-node-critical - runtimeClassName: nvidia - hostNetwork: true - hostPID: true volumes: - name: device-plugin hostPath: @@ -51,19 +79,16 @@ spec: - name: proc-driver-nvidia hostPath: path: /proc/driver/nvidia + # Volume to mount the ConfigMap + - name: nvidia-device-plugin-config + configMap: + name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config containers: - name: nvidia-device-plugin - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.5 + image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2 imagePullPolicy: Always args: - - --mig-strategy=single - - --pass-device-specs=true - - --fail-on-init-error=false - - --device-list-strategy=envvar - - --nvidia-driver-root=/host-sys/fs/cgroup - env: - - name: NVIDIA_MIG_MONITOR_DEVICES - value: all + - "--config-file=/etc/device-plugin/config.yaml" securityContext: allowPrivilegeEscalation: false capabilities: @@ -74,29 +99,23 @@ spec: - name: dev mountPath: /dev - name: sys - mountPath: /host-sys + mountPath: /sys readOnly: true - name: proc-driver-nvidia - mountPath: /proc/driver/nvidia + mountPath: /host-proc/driver/nvidia + readOnly: true + - name: nvidia-device-plugin-config + mountPath: /etc/device-plugin/ readOnly: true resources: requests: cpu: 50m - memory: 10Mi + memory: 20Mi limits: cpu: 50m - memory: 20Mi + memory: 50Mi {{- if .Values.nodeSelector }} nodeSelector: {{- toYaml .Values.nodeSelector | nindent 8 }} {{- end }} ---- -# RuntimeClass for NVIDIA Container Runtime -apiVersion: node.k8s.io/v1 -kind: RuntimeClass -metadata: - name: {{ include "sim.fullname" . }}-nvidia - labels: - {{- include "sim.labels" . | nindent 4 }} -handler: nvidia -{{- end }} \ No newline at end of file +{{- end }} diff --git a/helm/sim/values.yaml b/helm/sim/values.yaml index dc09a9ce26..e78e0f9172 100644 --- a/helm/sim/values.yaml +++ b/helm/sim/values.yaml @@ -400,8 +400,10 @@ postgresql: algorithm: RSA # RSA or ECDSA size: 4096 # Key size in bits # Issuer reference (REQUIRED if tls.enabled is true) + # By default, references the CA issuer created by certManager.caIssuer + # Make sure certManager.enabled is true, or provide your own issuer issuerRef: - name: selfsigned-cluster-issuer # Name of your cert-manager Issuer/ClusterIssuer + name: sim-ca-issuer # Name of your cert-manager Issuer/ClusterIssuer kind: ClusterIssuer # ClusterIssuer or Issuer group: "" # Optional: cert-manager.io (leave empty for default) # Additional DNS names (optional) @@ -463,20 +465,26 @@ externalDatabase: ollama: # Enable/disable Ollama deployment enabled: false - + # Image configuration image: repository: ollama/ollama tag: latest pullPolicy: Always - + # Number of replicas replicaCount: 1 - + # GPU configuration gpu: enabled: false count: 1 + # GPU sharing strategy: "mig" (Multi-Instance GPU) or "time-slicing" + # - mig: Hardware-level GPU partitioning (requires supported GPUs like A100) + # - time-slicing: Software-level GPU sharing (works with most NVIDIA GPUs) + strategy: "time-slicing" + # Number of time-slicing replicas (only used when strategy is "time-slicing") + timeSlicingReplicas: 5 # Node selector for GPU workloads (adjust labels based on your cluster configuration) nodeSelector: @@ -1185,4 +1193,53 @@ externalSecrets: # External database password (when using managed database services) externalDatabase: # Path to external database password in external store - password: "" \ No newline at end of file + password: "" + +# cert-manager configuration +# Prerequisites: Install cert-manager in your cluster first +# See: https://cert-manager.io/docs/installation/ +# +# This implements the recommended CA bootstrap pattern from cert-manager: +# 1. Self-signed ClusterIssuer (bootstrap only - creates root CA) +# 2. Root CA Certificate (self-signed, becomes the trust anchor) +# 3. CA ClusterIssuer (signs application certificates using root CA) +# +# Reference: https://cert-manager.io/docs/configuration/selfsigned/ +certManager: + # Enable/disable cert-manager issuer resources + enabled: false + + # Self-signed ClusterIssuer (used ONLY to bootstrap the root CA) + # Do not reference this issuer directly for application certificates + selfSignedIssuer: + name: "sim-selfsigned-bootstrap-issuer" + + # Root CA Certificate configuration + # This certificate is signed by the self-signed issuer and used as the trust anchor + rootCA: + # Name of the Certificate resource + certificateName: "sim-root-ca" + # Namespace where the root CA certificate and secret will be created + # Must match cert-manager's cluster-resource-namespace (default: cert-manager) + namespace: "cert-manager" + # Common name for the root CA certificate + commonName: "sim-root-ca" + # Secret name where the root CA certificate and key will be stored + secretName: "sim-root-ca-secret" + # Certificate validity duration (default: 10 years) + duration: "87600h" + # Renew before expiry (default: 90 days) + renewBefore: "2160h" + # Private key configuration + privateKey: + algorithm: RSA + size: 4096 + # Subject configuration + subject: + organizations: [] + # If empty, defaults to the release name + + # CA ClusterIssuer configuration + # This is the issuer that applications should reference for obtaining certificates + caIssuer: + name: "sim-ca-issuer" \ No newline at end of file From 549edba905db1abda7d97cc6f07e24d77cfb0fbb Mon Sep 17 00:00:00 2001 From: waleed Date: Tue, 27 Jan 2026 18:04:42 -0800 Subject: [PATCH 2/4] fix(helm): address code review feedback for GPU plugin and cert-manager --- helm/sim/templates/cert-manager-issuers.yaml | 10 +++++++++- helm/sim/templates/gpu-device-plugin.yaml | 14 ++++++-------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/helm/sim/templates/cert-manager-issuers.yaml b/helm/sim/templates/cert-manager-issuers.yaml index 856a466028..2e486a357e 100644 --- a/helm/sim/templates/cert-manager-issuers.yaml +++ b/helm/sim/templates/cert-manager-issuers.yaml @@ -2,6 +2,12 @@ {{- /* cert-manager Issuer Bootstrap Pattern + PREREQUISITE: cert-manager must be installed in your cluster before enabling this. + The root CA Certificate is created in the namespace specified by certManager.rootCA.namespace + (defaults to "cert-manager"). Ensure this namespace exists and cert-manager is running there. + + Install cert-manager: https://cert-manager.io/docs/installation/ + This implements the recommended pattern from cert-manager documentation: 1. A self-signed ClusterIssuer (for bootstrapping the root CA only) 2. A root CA Certificate (self-signed, used to sign other certificates) @@ -28,11 +34,13 @@ spec: # 2. Root CA Certificate # This certificate is signed by the self-signed issuer and becomes the root of trust. # The secret created here will be used by the CA issuer to sign certificates. +# NOTE: This must be created in the cert-manager namespace (or the namespace specified +# in certManager.rootCA.namespace). Ensure cert-manager is installed there first. apiVersion: cert-manager.io/v1 kind: Certificate metadata: name: {{ .Values.certManager.rootCA.certificateName }} - namespace: {{ .Values.certManager.rootCA.namespace | default "cert-manager" }} + namespace: {{ .Values.certManager.rootCA.namespace | default "cert-manager" }} # Must match cert-manager's cluster-resource-namespace labels: {{- include "sim.labels" . | nindent 4 }} app.kubernetes.io/component: cert-manager diff --git a/helm/sim/templates/gpu-device-plugin.yaml b/helm/sim/templates/gpu-device-plugin.yaml index 3b2f0b59dc..78e43edcd1 100644 --- a/helm/sim/templates/gpu-device-plugin.yaml +++ b/helm/sim/templates/gpu-device-plugin.yaml @@ -19,7 +19,6 @@ data: migStrategy: "none" {{- end }} failOnInitError: false - nvidiaDriverRoot: /host-proc/driver/nvidia plugin: passDeviceSpecs: true deviceListStrategy: envvar @@ -28,7 +27,7 @@ data: timeSlicing: resources: - name: nvidia.com/gpu - replicas: {{ .Values.ollama.gpu.timeSlicingReplicas | default 10 }} + replicas: {{ .Values.ollama.gpu.timeSlicingReplicas | default 5 }} {{- end }} --- # 2. NVIDIA Device Plugin DaemonSet for GPU support @@ -76,9 +75,6 @@ spec: - name: sys hostPath: path: /sys - - name: proc-driver-nvidia - hostPath: - path: /proc/driver/nvidia # Volume to mount the ConfigMap - name: nvidia-device-plugin-config configMap: @@ -89,6 +85,11 @@ spec: imagePullPolicy: Always args: - "--config-file=/etc/device-plugin/config.yaml" + {{- if eq .Values.ollama.gpu.strategy "mig" }} + env: + - name: NVIDIA_MIG_MONITOR_DEVICES + value: all + {{- end }} securityContext: allowPrivilegeEscalation: false capabilities: @@ -101,9 +102,6 @@ spec: - name: sys mountPath: /sys readOnly: true - - name: proc-driver-nvidia - mountPath: /host-proc/driver/nvidia - readOnly: true - name: nvidia-device-plugin-config mountPath: /etc/device-plugin/ readOnly: true From 43a87850664e3794828c8b9655bb05679c6da6fe Mon Sep 17 00:00:00 2001 From: waleed Date: Tue, 27 Jan 2026 18:10:01 -0800 Subject: [PATCH 3/4] fix(helm): remove duplicate nodeSelector, add hook for CA issuer ordering --- helm/sim/templates/cert-manager-issuers.yaml | 4 ++++ helm/sim/templates/gpu-device-plugin.yaml | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/helm/sim/templates/cert-manager-issuers.yaml b/helm/sim/templates/cert-manager-issuers.yaml index 2e486a357e..883582a5fd 100644 --- a/helm/sim/templates/cert-manager-issuers.yaml +++ b/helm/sim/templates/cert-manager-issuers.yaml @@ -69,6 +69,7 @@ spec: # 3. CA ClusterIssuer # This is the issuer that should be used by applications to obtain certificates. # It signs certificates using the root CA created above. +# Uses post-install hook to ensure Certificate is ready before this issuer is created. apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: @@ -76,6 +77,9 @@ metadata: labels: {{- include "sim.labels" . | nindent 4 }} app.kubernetes.io/component: cert-manager + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "1" spec: ca: secretName: {{ .Values.certManager.rootCA.secretName }} diff --git a/helm/sim/templates/gpu-device-plugin.yaml b/helm/sim/templates/gpu-device-plugin.yaml index 78e43edcd1..b7bb9a628a 100644 --- a/helm/sim/templates/gpu-device-plugin.yaml +++ b/helm/sim/templates/gpu-device-plugin.yaml @@ -112,8 +112,4 @@ spec: limits: cpu: 50m memory: 50Mi - {{- if .Values.nodeSelector }} - nodeSelector: - {{- toYaml .Values.nodeSelector | nindent 8 }} - {{- end }} {{- end }} From be8abee9b5d01c9a8daacc6f9e85955e14910c7e Mon Sep 17 00:00:00 2001 From: waleed Date: Tue, 27 Jan 2026 18:12:19 -0800 Subject: [PATCH 4/4] fix(helm): remove incorrect hook, CA issuer auto-reconciles --- helm/sim/templates/cert-manager-issuers.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/helm/sim/templates/cert-manager-issuers.yaml b/helm/sim/templates/cert-manager-issuers.yaml index 883582a5fd..aef2a61a0c 100644 --- a/helm/sim/templates/cert-manager-issuers.yaml +++ b/helm/sim/templates/cert-manager-issuers.yaml @@ -69,7 +69,8 @@ spec: # 3. CA ClusterIssuer # This is the issuer that should be used by applications to obtain certificates. # It signs certificates using the root CA created above. -# Uses post-install hook to ensure Certificate is ready before this issuer is created. +# NOTE: This issuer may briefly show "not ready" on first install while cert-manager +# processes the Certificate above and creates the secret. It will auto-reconcile. apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: @@ -77,9 +78,6 @@ metadata: labels: {{- include "sim.labels" . | nindent 4 }} app.kubernetes.io/component: cert-manager - annotations: - "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-weight": "1" spec: ca: secretName: {{ .Values.certManager.rootCA.secretName }}