From 75177b93a9a4885c20924281367a3b73933bffee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ois=C3=ADn=20Kyne?= Date: Tue, 26 May 2026 20:09:22 +0100 Subject: [PATCH] feat(disk): handle disk maxing out better --- CLAUDE.md | 2 + docs/getting-started.md | 19 +++++ .../infrastructure/values/erpc.yaml.gotmpl | 5 +- .../values/monitoring.yaml.gotmpl | 7 +- .../networks/ethereum/helmfile.yaml.gotmpl | 30 ++++++- .../networks/ethereum/templates/pvc.yaml | 32 +++++++- .../networks/ethereum/values.yaml.gotmpl | 5 ++ internal/network/network.go | 14 ++++ internal/network/preflight.go | 78 +++++++++++++++++++ 9 files changed, 186 insertions(+), 6 deletions(-) create mode 100644 internal/network/preflight.go diff --git a/CLAUDE.md b/CLAUDE.md index 3507d402..3f816cda 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -176,6 +176,8 @@ obol sell delete ollama-gated -n llm Two-stage templating: `values.yaml.gotmpl` with `@enum/@default/@description` annotations → CLI flags → rendered `values.yaml` (Stage 1), then `helmfile sync --state-values-file values.yaml --state-values-set id=` (Stage 2). Unique namespaces: `-` where ID is petname or `--id `. Local Ethereum nodes auto-registered as priority upstream in eRPC via `RegisterERPCUpstream()` (write methods blocked on local → routed to remote). +**Ethereum `--mode full|archive`** (default `full`): controls whether reth runs as a pruned full node (~500 GB mainnet / ~100 GB testnet) or an archive node retaining all historical state (~4 TB+ mainnet / ~300 GB testnet). Archive mode is for state replay (block explorers, historical `eth_call`, indexers); full mode is the right default for everything else. The mode flows through to (a) the reth `--full` arg in `internal/embed/networks/ethereum/helmfile.yaml.gotmpl`, (b) PVC sizing in `templates/pvc.yaml`, and (c) the `helmfile` `persistence.size` request. 
`obol network install ethereum` runs a disk-space preflight via `internal/network/preflight.go` — it warns when `cfg.DataDir` has less free disk than `(network, mode)` is expected to need, prompts the user, and auto-continues in non-interactive mode (no TTY / JSON output) so scripted installs don't deadlock. Only reth's client arguments honor the mode for now: with other execution clients (geth, nethermind, besu, erigon) the flag still changes PVC sizing and the preflight threshold but does not alter how the client runs. + ## Stack Lifecycle | Command | Action | diff --git a/docs/getting-started.md b/docs/getting-started.md index 640e7756..ef3fba44 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -216,6 +216,25 @@ obol network sync ethereum/demo This creates the `ethereum-demo` namespace with an execution client (reth) and a consensus client (lighthouse). +### Full vs archive mode + +`obol network install ethereum` defaults to `--mode=full`, which prunes +historical state and needs ~500 GB on mainnet (~100 GB on testnets). Pass +`--mode=archive` if you need to replay state across history (block +explorers, historical `eth_call`, indexers); archive nodes hold the full +state trie and grow to ~4 TB+ on mainnet. + +```bash +# Default: pruned full node +obol network install ethereum --network=mainnet + +# Archive node for state replay (requires ~4-5 TB free) +obol network install ethereum --network=mainnet --mode=archive +``` + +The installer warns when the data directory has less free disk than the +chosen mode is likely to need. + Verify: ```bash diff --git a/internal/embed/infrastructure/values/erpc.yaml.gotmpl b/internal/embed/infrastructure/values/erpc.yaml.gotmpl index 3b45529e..a967a554 100644 --- a/internal/embed/infrastructure/values/erpc.yaml.gotmpl +++ b/internal/embed/infrastructure/values/erpc.yaml.gotmpl @@ -29,7 +29,10 @@ config: |- - id: memory-cache driver: memory memory: - maxItems: 10000 + # LRU cap on cached RPC responses. 
The connector is memory-only + # (no disk), so the upper bound is enforced by the pod memory + # limit below; this cap keeps cache RAM well under that limit. + maxItems: 5000 policies: - network: "*" method: "*" diff --git a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl index e440bd0d..ddda3b00 100644 --- a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl +++ b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl @@ -11,7 +11,12 @@ prometheus: matchLabels: release: monitoring podMonitorNamespaceSelector: {} - retention: 8d + # Time-based retention is still honored, but retentionSize is the hard + # cap that prevents the TSDB from filling the k3d node's writable layer + # overnight and triggering DiskPressure → cascading pod evictions. + # 2GB on emptyDir is plenty for a local single-node stack. + retention: 2d + retentionSize: 2GB resources: requests: cpu: 100m diff --git a/internal/embed/networks/ethereum/helmfile.yaml.gotmpl b/internal/embed/networks/ethereum/helmfile.yaml.gotmpl index 4d1a8e1f..8cc5a455 100644 --- a/internal/embed/networks/ethereum/helmfile.yaml.gotmpl +++ b/internal/embed/networks/ethereum/helmfile.yaml.gotmpl @@ -27,6 +27,15 @@ releases: needs: [ethereum-pvcs] values: # Network and checkpoint sync configuration + # + # global.clientArgs.networks.<network>.execution.reth is the upstream + # umbrella chart's per-network reth args list. We APPEND to it rather + # than overriding reth.extraArgs directly, because reth.extraArgs in + # the umbrella chart is a templated string that looks up this very + # list — overriding it would silently drop the --chain=<network> arg + # that testnets depend on. Mainnet's upstream list is empty; testnets + # are ["--chain=<network>"]. We carry those over verbatim and add + # --full only when mode != archive. 
- global: main: network: '{{ .Values.network }}' @@ -36,8 +45,25 @@ releases: mainnet: https://mainnet-checkpoint-sync.attestant.io sepolia: https://checkpoint-sync.sepolia.ethpandaops.io hoodi: https://checkpoint-sync.hoodi.ethpandaops.io + {{- if eq .Values.executionClient "reth" }} + clientArgs: + networks: + {{ .Values.network }}: + execution: + reth: + {{- if ne .Values.network "mainnet" }} + - --chain={{ .Values.network }} + {{- end }} + {{- if ne .Values.mode "archive" }} + - --full + {{- end }} + {{- end }} # Execution client (pinned versions — update periodically) + # Reth defaults to archive (~4TB+ mainnet). The --mode flag controls + # whether we pass --full to prune historical state down to ~500GB, + # wired through global.clientArgs above (NOT reth.extraArgs — see + # comment above). - {{ .Values.executionClient }}: enabled: true image: @@ -54,7 +80,7 @@ releases: {{- end }} persistence: enabled: true - size: 500Gi + size: {{ if eq .Values.network "mainnet" }}{{ if eq .Values.mode "archive" }}4500Gi{{ else }}500Gi{{ end }}{{ else }}{{ if eq .Values.mode "archive" }}300Gi{{ else }}100Gi{{ end }}{{ end }} existingClaim: execution-{{ .Values.executionClient }}-{{ .Values.network }} # Consensus client (pinned versions — update periodically) @@ -75,7 +101,7 @@ releases: {{- end }} persistence: enabled: true - size: 200Gi + size: {{ if eq .Values.network "mainnet" }}{{ if eq .Values.mode "archive" }}500Gi{{ else }}200Gi{{ end }}{{ else }}{{ if eq .Values.mode "archive" }}100Gi{{ else }}50Gi{{ end }}{{ end }} existingClaim: consensus-{{ .Values.consensusClient }}-{{ .Values.network }} # Metadata ConfigMap for frontend discovery diff --git a/internal/embed/networks/ethereum/templates/pvc.yaml b/internal/embed/networks/ethereum/templates/pvc.yaml index de4e018c..38c41093 100644 --- a/internal/embed/networks/ethereum/templates/pvc.yaml +++ b/internal/embed/networks/ethereum/templates/pvc.yaml @@ -1,4 +1,32 @@ {{- if eq .Release.Name "ethereum-pvcs" }} +{{- /* + 
PVC sizing is a function of (network, mode). Sizes are estimates with + ~30% headroom for chain growth. local-path storage does not pre-allocate, + so these requests primarily document intent and serve as soft caps when + a sized storage class is swapped in later. +*/ -}} +{{- $mode := default "full" .Values.mode -}} +{{- $isArchive := eq $mode "archive" -}} +{{- $execSize := "500Gi" -}} +{{- $consensusSize := "200Gi" -}} +{{- if eq .Values.network "mainnet" -}} + {{- if $isArchive -}} + {{- $execSize = "4500Gi" -}} + {{- $consensusSize = "500Gi" -}} + {{- else -}} + {{- $execSize = "500Gi" -}} + {{- $consensusSize = "200Gi" -}} + {{- end -}} +{{- else -}} + {{- /* sepolia, hoodi and other testnets */ -}} + {{- if $isArchive -}} + {{- $execSize = "300Gi" -}} + {{- $consensusSize = "100Gi" -}} + {{- else -}} + {{- $execSize = "100Gi" -}} + {{- $consensusSize = "50Gi" -}} + {{- end -}} +{{- end -}} --- # Ethereum Execution Client PVC apiVersion: v1 @@ -12,7 +40,7 @@ spec: storageClassName: local-path resources: requests: - storage: 500Gi + storage: {{ $execSize }} --- # Ethereum Consensus Client PVC apiVersion: v1 @@ -26,5 +54,5 @@ spec: storageClassName: local-path resources: requests: - storage: 200Gi + storage: {{ $consensusSize }} {{- end }} diff --git a/internal/embed/networks/ethereum/values.yaml.gotmpl b/internal/embed/networks/ethereum/values.yaml.gotmpl index 874a0e2e..d5068896 100644 --- a/internal/embed/networks/ethereum/values.yaml.gotmpl +++ b/internal/embed/networks/ethereum/values.yaml.gotmpl @@ -15,3 +15,8 @@ executionClient: {{.ExecutionClient}} # @default lighthouse # @description Consensus layer client consensusClient: {{.ConsensusClient}} + +# @enum full,archive +# @default full +# @description Node mode. 
'full' prunes historical state (~500GB mainnet); 'archive' keeps all state for history replay (~4TB+ mainnet) +mode: {{.Mode}} diff --git a/internal/network/network.go b/internal/network/network.go index 20a7ae07..59e9be95 100644 --- a/internal/network/network.go +++ b/internal/network/network.go @@ -125,6 +125,20 @@ func Install(cfg *config.Config, u *ui.UI, network string, overrides map[string] templateData[field.Name] = value } + // Disk-space preflight (currently only meaningful for ethereum). The + // check warns and prompts; in non-interactive mode (no TTY / JSON) it + // auto-continues so scripted installs don't deadlock. + if network == "ethereum" { + netValue := templateData["Network"] + modeValue := templateData["Mode"] + if modeValue == "" { + modeValue = "full" + } + if err := CheckNetworkDiskSpace(u, cfg.DataDir, netValue, modeValue); err != nil { + return err + } + } + // Read the embedded values template valuesContent, err := embed.ReadEmbeddedNetworkFile(network, "values.yaml.gotmpl") if err != nil { diff --git a/internal/network/preflight.go b/internal/network/preflight.go new file mode 100644 index 00000000..e25679ba --- /dev/null +++ b/internal/network/preflight.go @@ -0,0 +1,78 @@ +package network + +import ( + "fmt" + "syscall" + + "github.com/ObolNetwork/obol-stack/internal/ui" +) + +// diskSpaceRequirementGB returns the recommended free-disk minimum for +// (network, mode) in gigabytes. Numbers include ~30% headroom for chain +// growth between releases. Sizes are reth-anchored; other clients are in +// the same ballpark. +func diskSpaceRequirementGB(network, mode string) uint64 { + archive := mode == "archive" + switch network { + case "mainnet": + if archive { + return 5000 + } + return 700 + default: + // sepolia, hoodi, and other testnets + if archive { + return 400 + } + return 150 + } +} + +// freeDiskBytes returns the free disk bytes available at path. 
Used to +// check whether a network install has room to grow before we let helmfile +// schedule a 4TB PVC that will silently fill the host overnight. +func freeDiskBytes(path string) (uint64, error) { + var stat syscall.Statfs_t + if err := syscall.Statfs(path, &stat); err != nil { + return 0, fmt.Errorf("statfs %s: %w", path, err) + } + // Bavail is reserved-block-aware (vs Bfree); use it for "what a regular + // process can actually allocate". + return stat.Bavail * uint64(stat.Bsize), nil +} + +// CheckNetworkDiskSpace warns when the data directory has less free disk +// than the install is expected to need. The default answer is to continue: +// in non-interactive contexts (no TTY, JSON mode) the prompt auto-accepts +// so scripted installs don't deadlock. The user only blocks the install by +// explicitly declining at an interactive prompt. +func CheckNetworkDiskSpace(u *ui.UI, dataDir, network, mode string) error { + requiredGB := diskSpaceRequirementGB(network, mode) + + freeBytes, err := freeDiskBytes(dataDir) + if err != nil { + // Best-effort: a statfs failure shouldn't block install. + u.Warnf("Could not check free disk space at %s: %v", dataDir, err) + return nil + } + + freeGB := freeBytes / (1024 * 1024 * 1024) + + u.Detail("Disk space", fmt.Sprintf("%d GB free at %s (this network needs ~%d GB)", freeGB, dataDir, requiredGB)) + + if freeGB >= requiredGB { + return nil + } + + u.Warnf("Low disk space: %d GB free, ~%d GB recommended for %s/%s", + freeGB, requiredGB, network, mode) + if mode != "archive" { + u.Dim(" (full mode is the lighter option; archive mode would need ~4-5 TB on mainnet)") + } + + if !u.Confirm("Continue with install anyway?", true) { + return fmt.Errorf("install cancelled: insufficient disk space (%d GB free, ~%d GB recommended)", freeGB, requiredGB) + } + + return nil +}