Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions helm/sim/templates/cert-manager-issuers.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
{{- if .Values.certManager.enabled }}
{{- /*
cert-manager Issuer Bootstrap Pattern

PREREQUISITE: cert-manager must be installed in your cluster before enabling this.
The root CA Certificate is created in the namespace specified by certManager.rootCA.namespace
(defaults to "cert-manager"). Ensure this namespace exists and cert-manager is running there.

Install cert-manager: https://cert-manager.io/docs/installation/

This implements the recommended pattern from cert-manager documentation:
1. A self-signed ClusterIssuer (for bootstrapping the root CA only)
2. A root CA Certificate (self-signed, used to sign other certificates)
3. A CA ClusterIssuer (uses the root CA to sign certificates)

Reference: https://cert-manager.io/docs/configuration/selfsigned/
*/ -}}

---
# 1. Self-signed ClusterIssuer (bootstrap only)
# Exists solely to issue the root CA Certificate rendered by this template.
# Application certificates should NOT reference this issuer directly.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  # Quoted so a numeric- or boolean-looking name still renders as a YAML string.
  name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  selfSigned: {}

---
# 2. Root CA Certificate
# Issued by the self-signed bootstrap issuer; it becomes the root of trust.
# The secret it produces is the one the CA ClusterIssuer signs with.
# NOTE: this must live in cert-manager's cluster-resource-namespace
# (certManager.rootCA.namespace, falling back to "cert-manager").
# cert-manager has to be installed there before this chart is applied.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: {{ .Values.certManager.rootCA.certificateName | quote }}
  # Must match cert-manager's cluster-resource-namespace
  namespace: {{ .Values.certManager.rootCA.namespace | default "cert-manager" | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  isCA: true
  # String fields are quoted so values that look like numbers or booleans
  # are not retyped by the YAML parser after rendering.
  commonName: {{ .Values.certManager.rootCA.commonName | quote }}
  secretName: {{ .Values.certManager.rootCA.secretName | quote }}
  duration: {{ .Values.certManager.rootCA.duration | default "87600h" | quote }}
  renewBefore: {{ .Values.certManager.rootCA.renewBefore | default "2160h" | quote }}
  privateKey:
    algorithm: {{ .Values.certManager.rootCA.privateKey.algorithm | default "RSA" | quote }}
    # Key size stays an unquoted integer.
    size: {{ .Values.certManager.rootCA.privateKey.size | default 4096 }}
  subject:
    organizations:
      {{- if .Values.certManager.rootCA.subject.organizations }}
      {{- toYaml .Values.certManager.rootCA.subject.organizations | nindent 6 }}
      {{- else }}
      # No organizations configured: fall back to the release name, quoted so
      # release names such as "123" or "true" remain strings.
      - {{ .Release.Name | quote }}
      {{- end }}
  issuerRef:
    name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
    kind: ClusterIssuer
    group: cert-manager.io

---
# 3. CA ClusterIssuer
# The issuer applications should reference when requesting certificates.
# It signs with the key pair stored in the root CA secret.
# NOTE: on first install this issuer may briefly report "not ready" until
# cert-manager has processed the root CA Certificate and created the secret.
# It reconciles automatically once the secret exists.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  # Quoted so a numeric- or boolean-looking name still renders as a YAML string.
  name: {{ .Values.certManager.caIssuer.name | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  ca:
    # Same secret name the root CA Certificate writes to.
    secretName: {{ .Values.certManager.rootCA.secretName | quote }}
{{- end }}
77 changes: 45 additions & 32 deletions helm/sim/templates/gpu-device-plugin.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,36 @@
{{- if and .Values.ollama.enabled .Values.ollama.gpu.enabled }}
---
# NVIDIA Device Plugin DaemonSet for GPU support
# 1. ConfigMap for NVIDIA Device Plugin Configuration
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config
namespace: {{ .Release.Namespace }}
labels:
{{- include "sim.labels" . | nindent 4 }}
app.kubernetes.io/component: nvidia-device-plugin
data:
config.yaml: |
version: v1
flags:
{{- if eq .Values.ollama.gpu.strategy "mig" }}
migStrategy: "single"
{{- else }}
migStrategy: "none"
{{- end }}
failOnInitError: false
plugin:
passDeviceSpecs: true
deviceListStrategy: envvar
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Invalid config structure for NVIDIA device plugin settings

Medium Severity

The ConfigMap places passDeviceSpecs and deviceListStrategy under a plugin: section, but the NVIDIA k8s-device-plugin config schema expects these settings under the flags: section. The original code passed these as CLI arguments (--pass-device-specs=true, --device-list-strategy=envvar), which map to the flags: section in config file format. With the current structure, the device plugin may ignore these settings and use default values instead, potentially causing GPU device passthrough and enumeration issues.

Fix in Cursor Fix in Web

{{- if eq .Values.ollama.gpu.strategy "time-slicing" }}
sharing:
timeSlicing:
resources:
- name: nvidia.com/gpu
replicas: {{ .Values.ollama.gpu.timeSlicingReplicas | default 5 }}
{{- end }}
---
# 2. NVIDIA Device Plugin DaemonSet for GPU support
apiVersion: apps/v1
kind: DaemonSet
metadata:
Expand Down Expand Up @@ -35,9 +65,6 @@ spec:
# Only schedule on nodes with NVIDIA GPUs
accelerator: nvidia
priorityClassName: system-node-critical
runtimeClassName: nvidia
hostNetwork: true
hostPID: true
volumes:
- name: device-plugin
hostPath:
Expand All @@ -48,22 +75,21 @@ spec:
- name: sys
hostPath:
path: /sys
- name: proc-driver-nvidia
hostPath:
path: /proc/driver/nvidia
# Volume to mount the ConfigMap
- name: nvidia-device-plugin-config
configMap:
name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config
containers:
- name: nvidia-device-plugin
image: nvcr.io/nvidia/k8s-device-plugin:v0.14.5
image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2
imagePullPolicy: Always
args:
- --mig-strategy=single
- --pass-device-specs=true
- --fail-on-init-error=false
- --device-list-strategy=envvar
- --nvidia-driver-root=/host-sys/fs/cgroup
- "--config-file=/etc/device-plugin/config.yaml"
{{- if eq .Values.ollama.gpu.strategy "mig" }}
env:
- name: NVIDIA_MIG_MONITOR_DEVICES
value: all
{{- end }}
securityContext:
allowPrivilegeEscalation: false
capabilities:
Expand All @@ -74,29 +100,16 @@ spec:
- name: dev
mountPath: /dev
- name: sys
mountPath: /host-sys
mountPath: /sys
readOnly: true
- name: proc-driver-nvidia
mountPath: /proc/driver/nvidia
- name: nvidia-device-plugin-config
mountPath: /etc/device-plugin/
readOnly: true
resources:
requests:
cpu: 50m
memory: 10Mi
memory: 20Mi
limits:
cpu: 50m
memory: 20Mi
{{- if .Values.nodeSelector }}
nodeSelector:
{{- toYaml .Values.nodeSelector | nindent 8 }}
{{- end }}
---
# RuntimeClass for NVIDIA Container Runtime
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
name: {{ include "sim.fullname" . }}-nvidia
labels:
{{- include "sim.labels" . | nindent 4 }}
handler: nvidia
{{- end }}
memory: 50Mi
{{- end }}
67 changes: 62 additions & 5 deletions helm/sim/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -400,8 +400,10 @@ postgresql:
algorithm: RSA # RSA or ECDSA
size: 4096 # Key size in bits
# Issuer reference (REQUIRED if tls.enabled is true)
# By default, references the CA issuer created by certManager.caIssuer
# Make sure certManager.enabled is true, or provide your own issuer
issuerRef:
name: selfsigned-cluster-issuer # Name of your cert-manager Issuer/ClusterIssuer
name: sim-ca-issuer # Name of your cert-manager Issuer/ClusterIssuer
kind: ClusterIssuer # ClusterIssuer or Issuer
group: "" # Optional: cert-manager.io (leave empty for default)
# Additional DNS names (optional)
Expand Down Expand Up @@ -463,20 +465,26 @@ externalDatabase:
ollama:
# Enable/disable Ollama deployment
enabled: false

# Image configuration
image:
repository: ollama/ollama
tag: latest
pullPolicy: Always

# Number of replicas
replicaCount: 1

# GPU configuration
gpu:
enabled: false
count: 1
# GPU sharing strategy: "mig" (Multi-Instance GPU) or "time-slicing"
# - mig: Hardware-level GPU partitioning (requires supported GPUs like A100)
# - time-slicing: Software-level GPU sharing (works with most NVIDIA GPUs)
strategy: "time-slicing"
# Number of time-slicing replicas (only used when strategy is "time-slicing")
timeSlicingReplicas: 5

# Node selector for GPU workloads (adjust labels based on your cluster configuration)
nodeSelector:
Expand Down Expand Up @@ -1185,4 +1193,53 @@ externalSecrets:
# External database password (when using managed database services)
externalDatabase:
# Path to external database password in external store
password: ""
password: ""

# cert-manager issuer bootstrap
# Prerequisite: cert-manager must already be installed in the cluster.
#   https://cert-manager.io/docs/installation/
#
# Follows cert-manager's recommended CA bootstrap pattern:
#   1. Self-signed ClusterIssuer - bootstrap only, creates the root CA
#   2. Root CA Certificate       - self-signed, becomes the trust anchor
#   3. CA ClusterIssuer          - signs application certificates with the root CA
#
# Reference: https://cert-manager.io/docs/configuration/selfsigned/
certManager:
  # Turn the cert-manager issuer resources on or off
  enabled: false

  # Bootstrap issuer: used ONLY to create the root CA.
  # Application certificates must not reference it.
  selfSignedIssuer:
    name: 'sim-selfsigned-bootstrap-issuer'

  # Root CA Certificate: signed by the self-signed issuer, acts as the trust anchor
  rootCA:
    # Certificate resource name
    certificateName: 'sim-root-ca'
    # Namespace for the root CA certificate and its secret.
    # Has to equal cert-manager's cluster-resource-namespace (default: cert-manager)
    namespace: 'cert-manager'
    # Common name written into the root CA certificate
    commonName: 'sim-root-ca'
    # Secret that receives the root CA certificate and private key
    secretName: 'sim-root-ca-secret'
    # Validity period (87600h = 10 years)
    duration: '87600h'
    # How long before expiry to renew (2160h = 90 days)
    renewBefore: '2160h'
    # Private key settings
    privateKey:
      algorithm: RSA
      size: 4096
    # Certificate subject
    subject:
      # Leave empty to default to the release name
      organizations: []

  # CA ClusterIssuer: the issuer applications should reference for their certificates
  caIssuer:
    name: 'sim-ca-issuer'