From 1d4dc7d7b7c8cae22b9e2c28887e16d53d9b857c Mon Sep 17 00:00:00 2001 From: bdchatham Date: Sat, 30 May 2026 10:38:55 -0700 Subject: [PATCH 1/7] feat(planner): re-apply config.toml as part of buildNodeUpdatePlan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The image-drift plan previously ran ApplyStatefulSet → ApplyService → ReplacePod → ObserveImage → MarkReady. config.toml was never re-applied, so any change to fields the controller owns via commonOverrides — most notably Spec.ExternalAddress — never reached the running pod. Inbound publishable P2P could work (NLB to pod), but seid never advertised the NLB hostname via PEX/NodeInfo because external-address stayed empty. Add TaskConfigApply + TaskConfigValidate to the progression, between ApplyService and ReplacePod. ConfigApply runs against the OLD pod's sidecar (writes to the PVC); ReplacePod then cycles the pod and the new seid reads the fresh config.toml. external-address has no hot-reload path in seid today (seictl config_reload.go has a TODO for SIGHUP). Two correctness fixes folded in per pre-implementation expert review: 1. buildNodeUpdatePlan now constructs a real ConfigIntent via the mode planner. Previously paramsForTaskType received nil and returned an empty-Mode ConfigIntent — sei-config's ResolveIntent rejects empty Mode, so TaskConfigApply would have failed at runtime on first image bump. New NodePlanner.BuildConfigIntent method extracted from each mode's BuildPlan; same code path on init and day-2, shared source of truth for mode + overrides. 2. Day-2 intent sets Incremental=true. The init applyFull regenerates config.toml from mode defaults + overrides; running that on a Running node would wipe persistent-peers, state-sync trust point, and any operator-managed TOML keys outside the controller's overrides set. applyIncremental reads on-disk config and patches only the keys we own. Tests: - TestBuildRunningPlan_ImageDrift_ReturnsNodeUpdatePlan updated to expect the new 7-task progression (was 5). - TestBuildRunningPlan_ImageDriftWinsOverSidecar updated similarly. - TestBuildNodeUpdatePlan_ConfigApplyHasIntentWithMode — asserts Mode is set and Overrides carries KeyP2PExternalAddress from Spec.ExternalAddress. - TestBuildNodeUpdatePlan_ConfigApplyIsIncremental — pins Incremental=true. - TestBuildNodeUpdatePlan_ConfigApplyIntentMatchesModePlanner — table- driven coverage across full/archive/validator modes. Out of scope (separate follow-up, both experts flagged): - Autonomous ExternalAddress drift detection on Running nodes. With this change ExternalAddress propagates on next image bump (or any other trigger of buildNodeUpdatePlan). Operators currently trigger via an image SHA pin. If autonomous propagation becomes operationally necessary (e.g., NLB hostname rotation, Spec.Overrides drift, opt-out of publishable P2P), file a follow-up to add drift detection in buildRunningPlan that routes to the same buildNodeUpdatePlan. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/planner/archive.go | 12 ++- internal/planner/full.go | 13 ++- internal/planner/node_update_test.go | 116 ++++++++++++++++++++++++++- internal/planner/planner.go | 39 ++++++++- internal/planner/replay.go | 13 ++- internal/planner/validator.go | 13 ++- 6 files changed, 191 insertions(+), 15 deletions(-) diff --git a/internal/planner/archive.go b/internal/planner/archive.go index a940c7c8..5fe1f576 100644 --- a/internal/planner/archive.go +++ b/internal/planner/archive.go @@ -28,10 +28,18 @@ func (p *archiveNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1. if node.Status.Phase == seiv1alpha1.PhaseRunning { return buildRunningPlan(node) } - return buildBasePlan(node, node.Spec.Peers, nil, &seiconfig.ConfigIntent{ + params, err := p.BuildConfigIntent(node) + if err != nil { + return nil, err + } + return buildBasePlan(node, node.Spec.Peers, nil, params) +} + +func (p *archiveNodePlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { + return &seiconfig.ConfigIntent{ Mode: seiconfig.ModeArchive, Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides(node)), node.Spec.Overrides), - }) + }, nil } func (p *archiveNodePlanner) controllerOverrides(node *seiv1alpha1.SeiNode) map[string]string { diff --git a/internal/planner/full.go b/internal/planner/full.go index f04596e1..19c2492b 100644 --- a/internal/planner/full.go +++ b/internal/planner/full.go @@ -33,9 +33,9 @@ func (p *fullNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas return buildRunningPlan(node) } fn := node.Spec.FullNode - params := &seiconfig.ConfigIntent{ - Mode: seiconfig.ModeFull, - Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides(node)), node.Spec.Overrides), + params, err := p.BuildConfigIntent(node) + if err != nil { + return nil, err } if NeedsBootstrap(node) { return buildBootstrapPlan(node, node.Spec.Peers, fn.Snapshot, params) @@ -43,6 +43,13 @@ func (p *fullNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas return buildBasePlan(node, node.Spec.Peers, fn.Snapshot, params) } +func (p *fullNodePlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { + return &seiconfig.ConfigIntent{ + Mode: seiconfig.ModeFull, + Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides(node)), node.Spec.Overrides), + }, nil +} + func (p *fullNodePlanner) controllerOverrides(node *seiv1alpha1.SeiNode) map[string]string { sg := node.Spec.FullNode.SnapshotGeneration if sg == nil || sg.Tendermint == nil { diff --git a/internal/planner/node_update_test.go b/internal/planner/node_update_test.go index 333bc665..7684d84b 100644 --- a/internal/planner/node_update_test.go +++ b/internal/planner/node_update_test.go @@ -1,10 +1,12 @@ package planner import ( + "encoding/json" "fmt" "testing" . "github.com/onsi/gomega" + seiconfig "github.com/sei-protocol/sei-config" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -68,6 +70,8 @@ func TestBuildRunningPlan_ImageDrift_ReturnsNodeUpdatePlan(t *testing.T) { want := []string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, + TaskConfigApply, + TaskConfigValidate, task.TaskTypeReplacePod, task.TaskTypeObserveImage, TaskMarkReady, @@ -82,6 +86,114 @@ func TestBuildRunningPlan_ImageDrift_ReturnsNodeUpdatePlan(t *testing.T) { } } +// configIntentFromPlan finds the config-apply task and unmarshals its +// params back into a ConfigIntent for inspection. +func configIntentFromPlan(t *testing.T, plan *seiv1alpha1.TaskPlan) *seiconfig.ConfigIntent { + t.Helper() + g := NewWithT(t) + for _, pt := range plan.Tasks { + if pt.Type != TaskConfigApply { + continue + } + g.Expect(pt.Params).NotTo(BeNil()) + var intent seiconfig.ConfigIntent + g.Expect(json.Unmarshal(pt.Params.Raw, &intent)).To(Succeed()) + return &intent + } + t.Fatal("plan has no config-apply task") + return nil +} + +// The day-2 config-apply must carry the mode planner's full ConfigIntent — +// non-empty Mode, Overrides containing the controller-managed keys — +// otherwise sei-config's ResolveIntent rejects the empty-Mode and +// TaskConfigApply fails at runtime. +func TestBuildNodeUpdatePlan_ConfigApplyHasIntentWithMode(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + node.Spec.ExternalAddress = "syncer-0-0-p2p.atlantic-2.harbor.platform.sei.io:26656" + node.Spec.Image = testImageV2 + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + intent := configIntentFromPlan(t, plan) + g.Expect(intent.Mode).To(Equal(seiconfig.ModeFull), "Mode must be set so ResolveIntent accepts the intent") + g.Expect(intent.Overrides).To(HaveKeyWithValue( + seiconfig.KeyP2PExternalAddress, + "syncer-0-0-p2p.atlantic-2.harbor.platform.sei.io:26656", + ), "commonOverrides should propagate Spec.ExternalAddress") +} + +// Day-2 must set Incremental=true. The init path uses applyFull which +// regenerates config from mode defaults — running that on a running node +// would wipe persistent-peers, state-sync trust point, and operator-managed +// TOML keys outside the controller's overrides set. Incremental reads +// on-disk config and patches only the keys we own. +func TestBuildNodeUpdatePlan_ConfigApplyIsIncremental(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + node.Spec.Image = testImageV2 + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + intent := configIntentFromPlan(t, plan) + g.Expect(intent.Incremental).To(BeTrue(), + "day-2 ConfigIntent must be incremental to avoid clobbering on-disk state") +} + +// Per-mode coverage: the day-2 intent should match the mode planner's +// Mode value. Catches future drift if a new mode lands and isn't wired +// into BuildConfigIntent. +func TestBuildNodeUpdatePlan_ConfigApplyIntentMatchesModePlanner(t *testing.T) { + cases := []struct { + name string + mutate func(*seiv1alpha1.SeiNode) + wantMode seiconfig.NodeMode + }{ + { + name: "full", + mutate: func(_ *seiv1alpha1.SeiNode) {}, + wantMode: seiconfig.ModeFull, + }, + { + name: "archive", + mutate: func(n *seiv1alpha1.SeiNode) { + n.Spec.FullNode = nil + n.Spec.Archive = &seiv1alpha1.ArchiveSpec{} + }, + wantMode: seiconfig.ModeArchive, + }, + { + name: "validator", + mutate: func(n *seiv1alpha1.SeiNode) { + n.Spec.FullNode = nil + n.Spec.Validator = &seiv1alpha1.ValidatorSpec{} + }, + wantMode: seiconfig.ModeValidator, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + tc.mutate(node) + node.Spec.Image = testImageV2 + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + intent := configIntentFromPlan(t, plan) + g.Expect(intent.Mode).To(Equal(tc.wantMode)) + g.Expect(intent.Incremental).To(BeTrue()) + }) + } +} + // --- ResolvePlan condition tests --- func TestResolvePlan_NodeUpdate_SetsCondition(t *testing.T) { @@ -381,10 +493,12 @@ func TestBuildRunningPlan_ImageDriftWinsOverSidecar(t *testing.T) { g.Expect(err).NotTo(HaveOccurred()) g.Expect(plan).NotTo(BeNil()) // Image update plan ends with MarkReady, which also resolves the sidecar. - g.Expect(plan.Tasks).To(HaveLen(5), "should be full node-update plan, not one-task mark-ready") + g.Expect(plan.Tasks).To(HaveLen(7), "should be full node-update plan, not one-task mark-ready") g.Expect(planTaskTypes(plan)).To(Equal([]string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, + TaskConfigApply, + TaskConfigValidate, task.TaskTypeReplacePod, task.TaskTypeObserveImage, TaskMarkReady, diff --git a/internal/planner/planner.go b/internal/planner/planner.go index 1b62a929..3831a168 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -63,6 +63,13 @@ type NodePlanner interface { Validate(node *seiv1alpha1.SeiNode) error BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) Mode() string + + // BuildConfigIntent returns the ConfigIntent the mode's plan would feed + // to TaskConfigApply (mode + merged controller / spec overrides). Shared + // by BuildPlan (init) and buildNodeUpdatePlan (day-2) so they emit + // identical config payloads for the same spec. Day-2 callers set + // Incremental on the returned value before passing it through. + BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) } // GroupPlanner encapsulates logic for building a group-level task plan. @@ -735,18 +742,44 @@ func buildMarkReadyPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error } // buildNodeUpdatePlan constructs a plan to roll out an image update on a -// Running node. The plan applies the new StatefulSet spec, waits for the -// rollout to complete, then re-initializes the sidecar. +// Running node. The plan applies the new StatefulSet spec, re-applies +// config.toml (incremental — only the keys the controller owns), then +// cycles the pod and re-initializes the sidecar. // // FailedPhase is deliberately empty: a failure retries on the next reconcile // rather than transitioning the node out of Running. +// +// TaskConfigApply is placed before ReplacePod so the old pod's sidecar +// writes the new config.toml to the PVC; the new pod then mounts the +// PVC and reads the fresh file at seid start. external_address has no +// hot-reload path in seid today (see seictl config_reload.go), so the +// pod restart is the trigger that makes new values take effect. +// +// Incremental=true is mandatory on the day-2 intent. The init path uses +// the non-incremental applyFull, which regenerates config from mode +// defaults + overrides; running that on day 2 would wipe persistent-peers, +// state-sync trust point, and any operator-managed TOML keys that the +// controller doesn't track in overrides. applyIncremental reads on-disk +// config and patches only the overrides we own. func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", fmt.Sprintf("image drift detected: spec=%s current=%s", node.Spec.Image, node.Status.CurrentImage)) + mode, err := plannerForMode(node) + if err != nil { + return nil, fmt.Errorf("resolving mode planner: %w", err) + } + intent, err := mode.BuildConfigIntent(node) + if err != nil { + return nil, fmt.Errorf("building config intent: %w", err) + } + intent.Incremental = true + prog := []string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, + TaskConfigApply, + TaskConfigValidate, task.TaskTypeReplacePod, task.TaskTypeObserveImage, sidecar.TaskTypeMarkReady, @@ -755,7 +788,7 @@ func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, erro planID := uuid.New().String() tasks := make([]seiv1alpha1.PlannedTask, len(prog)) for i, taskType := range prog { - t, err := buildPlannedTask(planID, taskType, i, paramsForTaskType(node, taskType, nil, nil)) + t, err := buildPlannedTask(planID, taskType, i, paramsForTaskType(node, taskType, nil, intent)) if err != nil { return nil, err } diff --git a/internal/planner/replay.go b/internal/planner/replay.go index e6ec1dcf..3c5bb434 100644 --- a/internal/planner/replay.go +++ b/internal/planner/replay.go @@ -37,9 +37,9 @@ func (p *replayerPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas if node.Status.Phase == seiv1alpha1.PhaseRunning { return buildRunningPlan(node) } - params := &seiconfig.ConfigIntent{ - Mode: seiconfig.ModeFull, - Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides()), node.Spec.Overrides), + params, err := p.BuildConfigIntent(node) + if err != nil { + return nil, err } if NeedsBootstrap(node) { return buildBootstrapPlan(node, node.Spec.Peers, &node.Spec.Replayer.Snapshot, params) @@ -47,6 +47,13 @@ func (p *replayerPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas return buildBasePlan(node, node.Spec.Peers, &node.Spec.Replayer.Snapshot, params) } +func (p *replayerPlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { + return &seiconfig.ConfigIntent{ + Mode: seiconfig.ModeFull, + Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides()), node.Spec.Overrides), + }, nil +} + func (p *replayerPlanner) controllerOverrides() map[string]string { return map[string]string{ keySCAsyncCommitBuffer: "100", diff --git a/internal/planner/validator.go b/internal/planner/validator.go index 88157b5b..99643aae 100644 --- a/internal/planner/validator.go +++ b/internal/planner/validator.go @@ -100,12 +100,19 @@ func (p *validatorPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Ta return buildGenesisPlan(node) } v := node.Spec.Validator - params := &seiconfig.ConfigIntent{ - Mode: seiconfig.ModeValidator, - Overrides: mergeOverrides(commonOverrides(node), node.Spec.Overrides), + params, err := p.BuildConfigIntent(node) + if err != nil { + return nil, err } if NeedsBootstrap(node) { return buildBootstrapPlan(node, node.Spec.Peers, v.Snapshot, params) } return buildBasePlan(node, node.Spec.Peers, v.Snapshot, params) } + +func (p *validatorPlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { + return &seiconfig.ConfigIntent{ + Mode: seiconfig.ModeValidator, + Overrides: mergeOverrides(commonOverrides(node), node.Spec.Overrides), + }, nil +} From 43a0018c7dc505aceec762f377c862526f1a673a Mon Sep 17 00:00:00 2001 From: bdchatham Date: Sat, 30 May 2026 10:50:23 -0700 Subject: [PATCH 2/7] fix(planner): nil-guard BuildConfigIntent + pin init Incremental=false MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-review feedback from kubernetes-specialist: 1. Defensive nil-guard on each mode planner's BuildConfigIntent. Today plannerForMode() gates which planner gets called, so the nil-deref isn't reachable. But the method is now part of the NodePlanner interface contract — making it independently safe means future refactors that call it through other paths don't accidentally NPE. 2. Pin init plans to Incremental=false via a new test. The day-2 path sets Incremental=true intentionally; the init path must stay on applyFull (regenerate from mode defaults). Without the assertion, a future refactor could silently flip init and we'd only notice when a fresh node came up with stale state. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/planner/archive.go | 3 +++ internal/planner/full.go | 3 +++ internal/planner/node_update_test.go | 24 ++++++++++++++++++++++++ internal/planner/replay.go | 3 +++ internal/planner/validator.go | 3 +++ 5 files changed, 36 insertions(+) diff --git a/internal/planner/archive.go b/internal/planner/archive.go index 5fe1f576..3bc5d533 100644 --- a/internal/planner/archive.go +++ b/internal/planner/archive.go @@ -36,6 +36,9 @@ func (p *archiveNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1. } func (p *archiveNodePlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { + if node.Spec.Archive == nil { + return nil, fmt.Errorf("archive sub-spec is nil") + } return &seiconfig.ConfigIntent{ Mode: seiconfig.ModeArchive, Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides(node)), node.Spec.Overrides), diff --git a/internal/planner/full.go b/internal/planner/full.go index 19c2492b..f130d9c9 100644 --- a/internal/planner/full.go +++ b/internal/planner/full.go @@ -44,6 +44,9 @@ func (p *fullNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas } func (p *fullNodePlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { + if node.Spec.FullNode == nil { + return nil, fmt.Errorf("fullNode sub-spec is nil") + } return &seiconfig.ConfigIntent{ Mode: seiconfig.ModeFull, Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides(node)), node.Spec.Overrides), diff --git a/internal/planner/node_update_test.go b/internal/planner/node_update_test.go index 7684d84b..6f5a93f4 100644 --- a/internal/planner/node_update_test.go +++ b/internal/planner/node_update_test.go @@ -145,6 +145,30 @@ func TestBuildNodeUpdatePlan_ConfigApplyIsIncremental(t *testing.T) { "day-2 ConfigIntent must be incremental to avoid clobbering on-disk state") } +// Init plans must NOT set Incremental — the init path uses applyFull +// (regenerate config from mode defaults + overrides). Pinning this here +// makes accidental flips during future refactors a loud failure. +func TestBuildPlan_InitPlanConfigIntentNotIncremental(t *testing.T) { + g := NewWithT(t) + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: "full-init", Namespace: "default", Generation: 1}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: "atlantic-2", + Image: "sei:v1.0.0", + FullNode: &seiv1alpha1.FullNodeSpec{}, + }, + // No phase — defaults to "" (init path). + } + + plan, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + intent := configIntentFromPlan(t, plan) + g.Expect(intent.Incremental).To(BeFalse(), + "init ConfigIntent must NOT be incremental — applyFull regenerates from mode defaults") +} + // Per-mode coverage: the day-2 intent should match the mode planner's // Mode value. Catches future drift if a new mode lands and isn't wired // into BuildConfigIntent. diff --git a/internal/planner/replay.go b/internal/planner/replay.go index 3c5bb434..8adb5bae 100644 --- a/internal/planner/replay.go +++ b/internal/planner/replay.go @@ -48,6 +48,9 @@ func (p *replayerPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas } func (p *replayerPlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { + if node.Spec.Replayer == nil { + return nil, fmt.Errorf("replayer sub-spec is nil") + } return &seiconfig.ConfigIntent{ Mode: seiconfig.ModeFull, Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides()), node.Spec.Overrides), diff --git a/internal/planner/validator.go b/internal/planner/validator.go index 99643aae..e5b2cb0b 100644 --- a/internal/planner/validator.go +++ b/internal/planner/validator.go @@ -111,6 +111,9 @@ func (p *validatorPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Ta } func (p *validatorPlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { + if node.Spec.Validator == nil { + return nil, fmt.Errorf("validator sub-spec is nil") + } return &seiconfig.ConfigIntent{ Mode: seiconfig.ModeValidator, Overrides: mergeOverrides(commonOverrides(node), node.Spec.Overrides), From b656d5ec63ba48858b5f922cd3323051a522047b Mon Sep 17 00:00:00 2001 From: bdchatham Date: Sun, 31 May 2026 12:28:10 -0700 Subject: [PATCH 3/7] refactor(planner): per-mode day-2 plans + config-patch task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address PR feedback: drop the BuildConfigIntent interface method (the abstraction made config-intent a special case when it's just task params), delete the free buildNodeUpdatePlan / buildRunningPlan helpers, and have each mode planner own its full BuildPlan including the day-2 dispatch. Reconciler/executor still just call BuildPlan; the startup-vs-day-2 distinction stays inside the planner. NodePlanner interface unchanged from main (Validate, BuildPlan, Mode). Switch the day-2 task from config-apply (Incremental=true) to config-patch: - config-patch already exists in seictl as a registered sidecar task (sidecar/engine/types.go:15, handler at sidecar/tasks/config.go). Generic TOML merge-patch — read file, merge, write — with no sei-config involvement. - The controller's job becomes "stamp these specific fields the controller directly owns" rather than "regenerate the config via sei-config's typed mode-defaulted resolver." Aligns with the forward-looking design where seid owns config-generation and migrations; controller just patches. - Surgical: only touches the keys we explicitly name. No forced logging.level=info reset on every image bump. No mode-defaulted backfill clobbering operator edits. - For publishable-P2P the day-2 patch is literally {"config.toml": {"p2p": {"external-address": Spec.ExternalAddress}}}. TaskConfigValidate stays after the patch as the parser-level gate (losing sei-config Diagnostics is an explicit, documented trade-off; operator-debt is "have seid grow doctor/migrate" not "keep coupling the controller to sei-config's typed registry"). Validator's day-2 progression prepends ValidateSigningKey / ValidateNodeKey / ValidateOperatorKeyring before ApplyStatefulSet — mirrors the validator's init plan and addresses the Cursor Bugbot high-severity finding (image-drift rollouts must not recreate the pod without verifying mounted key secrets first; kubelet mount failures with no controller-side error context). Notes documented in the planner doc: - pelletier/go-toml/v2 does not preserve comments / key ordering on re-encode. First day-2 patch erases operator-added comments. - "Init writes the whole file, day-2 patches named keys, never mix in one plan" — sharp boundary on Phase == Running. Tests rewritten: - Per-mode "day-2 progression matches expected slice" tests (full/archive share the 7-task list; validator gets 10 with the 3 validation-gate prefix). - "Config-patch payload carries external-address from Spec.ExternalAddress." - Determinism: two builds of the same spec produce identical patch payloads. - All existing buildRunningPlan free-function callsites updated to go through (&fullNodePlanner{}).BuildPlan(node). Drops: - BuildConfigIntent interface method + 4 implementations. - Free helpers buildRunningPlan, buildNodeUpdatePlan in planner.go. - TestBuildNodeUpdatePlan_ConfigApplyIsIncremental and friends (no longer applicable — day-2 doesn't go through ConfigIntent). Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/planner/archive.go | 37 ++- internal/planner/full.go | 51 ++- internal/planner/node_update_test.go | 464 ++++++++++++--------------- internal/planner/planner.go | 120 +++---- internal/planner/replay.go | 38 ++- internal/planner/validator.go | 46 ++- internal/task/config_patch.go | 28 ++ internal/task/task.go | 1 + 8 files changed, 418 insertions(+), 367 deletions(-) create mode 100644 internal/task/config_patch.go diff --git a/internal/planner/archive.go b/internal/planner/archive.go index 3bc5d533..96b35b3a 100644 --- a/internal/planner/archive.go +++ b/internal/planner/archive.go @@ -7,6 +7,7 @@ import ( seiconfig "github.com/sei-protocol/sei-config" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/task" ) type archiveNodePlanner struct { @@ -26,23 +27,35 @@ func (p *archiveNodePlanner) Validate(node *seiv1alpha1.SeiNode) error { func (p *archiveNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if node.Status.Phase == seiv1alpha1.PhaseRunning { - return buildRunningPlan(node) + return p.buildRunningPlan(node) } - params, err := p.BuildConfigIntent(node) - if err != nil { - return nil, err + intent := &seiconfig.ConfigIntent{ + Mode: seiconfig.ModeArchive, + Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides(node)), node.Spec.Overrides), } - return buildBasePlan(node, node.Spec.Peers, nil, params) + return buildBasePlan(node, node.Spec.Peers, nil, intent) } -func (p *archiveNodePlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { - if node.Spec.Archive == nil { - return nil, fmt.Errorf("archive sub-spec is nil") +// buildRunningPlan returns the day-2 plan for a Running archive node. +// Same shape as full nodes (no extra validation gates) — see full.go's +// buildRunningPlan for the rationale. +func (p *archiveNodePlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { + if imageDrifted(node) { + prog := []string{ + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + TaskConfigPatch, + TaskConfigValidate, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + } + return assembleDay2Plan(node, prog, externalAddressPatch(node)) } - return &seiconfig.ConfigIntent{ - Mode: seiconfig.ModeArchive, - Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides(node)), node.Spec.Overrides), - }, nil + if sidecarNeedsReapproval(node) { + return buildMarkReadyPlan(node) + } + return nil, nil } func (p *archiveNodePlanner) controllerOverrides(node *seiv1alpha1.SeiNode) map[string]string { diff --git a/internal/planner/full.go b/internal/planner/full.go index f130d9c9..df6a48a1 100644 --- a/internal/planner/full.go +++ b/internal/planner/full.go @@ -6,6 +6,7 @@ import ( seiconfig "github.com/sei-protocol/sei-config" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/task" ) type fullNodePlanner struct { @@ -28,29 +29,53 @@ func (p *fullNodePlanner) Validate(node *seiv1alpha1.SeiNode) error { return nil } +// BuildPlan dispatches between startup (init/bootstrap) and existing-resource +// (day-2) shapes. Reconciler/executor just call this and get a plan; the +// startup-vs-day-2 distinction stays inside the planner. func (p *fullNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if node.Status.Phase == seiv1alpha1.PhaseRunning { - return buildRunningPlan(node) + return p.buildRunningPlan(node) } fn := node.Spec.FullNode - params, err := p.BuildConfigIntent(node) - if err != nil { - return nil, err + intent := &seiconfig.ConfigIntent{ + Mode: seiconfig.ModeFull, + Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides(node)), node.Spec.Overrides), } if NeedsBootstrap(node) { - return buildBootstrapPlan(node, node.Spec.Peers, fn.Snapshot, params) + return buildBootstrapPlan(node, node.Spec.Peers, fn.Snapshot, intent) } - return buildBasePlan(node, node.Spec.Peers, fn.Snapshot, params) + return buildBasePlan(node, node.Spec.Peers, fn.Snapshot, intent) } -func (p *fullNodePlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { - if node.Spec.FullNode == nil { - return nil, fmt.Errorf("fullNode sub-spec is nil") +// buildRunningPlan returns the day-2 plan for a Running full node, or +// nil if no drift. Image drift queues a config-patch + pod-cycle plan; +// sidecar reapproval queues a one-task mark-ready plan. +// +// The day-2 patch stamps only the keys the controller directly owns +// (currently p2p.external-address for publishable P2P). TaskConfigPatch +// is a generic TOML merge — no sei-config involvement, no forced overrides, +// no mode-defaulted backfill. TaskConfigValidate after the patch is the +// parser-level gate. +// +// Note: pelletier/go-toml/v2 does not preserve comments or key ordering +// on re-encode, so the first day-2 patch erases operator-added comments. +func (p *fullNodePlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { + if imageDrifted(node) { + prog := []string{ + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + TaskConfigPatch, + TaskConfigValidate, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + } + return assembleDay2Plan(node, prog, externalAddressPatch(node)) } - return &seiconfig.ConfigIntent{ - Mode: seiconfig.ModeFull, - Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides(node)), node.Spec.Overrides), - }, nil + if sidecarNeedsReapproval(node) { + return buildMarkReadyPlan(node) + } + return nil, nil } func (p *fullNodePlanner) controllerOverrides(node *seiv1alpha1.SeiNode) map[string]string { diff --git a/internal/planner/node_update_test.go b/internal/planner/node_update_test.go index 6f5a93f4..254c7ff1 100644 --- a/internal/planner/node_update_test.go +++ b/internal/planner/node_update_test.go @@ -6,7 +6,6 @@ import ( "testing" . "github.com/onsi/gomega" - seiconfig "github.com/sei-protocol/sei-config" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -14,7 +13,10 @@ import ( "github.com/sei-protocol/sei-k8s-controller/internal/task" ) -const testImageV2 = "sei:v2.0.0" +const ( + testImageV2 = "sei:v2.0.0" + testExternalAddrAtl = "syncer-0-0-p2p.atlantic-2.harbor.platform.sei.io:26656" +) // runningFullNode returns a SeiNode in the Running phase with currentImage matching spec.image. func runningFullNode() *seiv1alpha1.SeiNode { @@ -41,44 +43,60 @@ func planTaskTypes(plan *seiv1alpha1.TaskPlan) []string { return types } -// --- buildRunningPlan tests --- +// configPatchFromPlan finds the config-patch task in a plan and unmarshals +// its params back into a ConfigPatchTask for inspection. +func configPatchFromPlan(t *testing.T, plan *seiv1alpha1.TaskPlan) task.ConfigPatchTask { + t.Helper() + g := NewWithT(t) + for _, pt := range plan.Tasks { + if pt.Type != TaskConfigPatch { + continue + } + g.Expect(pt.Params).NotTo(BeNil()) + var p task.ConfigPatchTask + g.Expect(json.Unmarshal(pt.Params.Raw, &p)).To(Succeed()) + return p + } + t.Fatal("plan has no config-patch task") + return task.ConfigPatchTask{} +} -func TestBuildRunningPlan_NoDrift_ReturnsNil(t *testing.T) { +// --- fullNodePlanner running-plan tests --- + +func TestFullPlanner_NoDrift_ReturnsNil(t *testing.T) { g := NewWithT(t) node := runningFullNode() - // spec.image == status.currentImage — no drift - plan, err := buildRunningPlan(node) + plan, err := (&fullNodePlanner{}).BuildPlan(node) g.Expect(err).NotTo(HaveOccurred()) g.Expect(plan).To(BeNil(), "no plan should be built when there is no image drift") } -func TestBuildRunningPlan_ImageDrift_ReturnsNodeUpdatePlan(t *testing.T) { +func TestFullPlanner_ImageDrift_Day2Progression(t *testing.T) { g := NewWithT(t) node := runningFullNode() - node.Spec.Image = testImageV2 // drift: spec != status.currentImage + node.Spec.Image = testImageV2 - plan, err := buildRunningPlan(node) + plan, err := (&fullNodePlanner{}).BuildPlan(node) g.Expect(err).NotTo(HaveOccurred()) g.Expect(plan).NotTo(BeNil(), "plan should be built for image drift") g.Expect(plan.Phase).To(Equal(seiv1alpha1.TaskPlanActive)) g.Expect(plan.TargetPhase).To(Equal(seiv1alpha1.PhaseRunning)) - // FailedPhase should be empty — failure retries rather than transitioning out of Running. + // FailedPhase deliberately empty — failure retries on next reconcile. g.Expect(string(plan.FailedPhase)).To(BeEmpty()) - got := planTaskTypes(plan) want := []string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, - TaskConfigApply, + TaskConfigPatch, TaskConfigValidate, task.TaskTypeReplacePod, task.TaskTypeObserveImage, TaskMarkReady, } - g.Expect(got).To(Equal(want), "NodeUpdate plan should have exactly these tasks in order") + g.Expect(planTaskTypes(plan)).To(Equal(want)) - // All tasks should start Pending with non-empty IDs and params. + // Tasks start Pending with non-empty IDs and params. for _, pt := range plan.Tasks { g.Expect(pt.Status).To(Equal(seiv1alpha1.TaskPending), "task %s should start Pending", pt.Type) g.Expect(pt.ID).NotTo(BeEmpty(), "task %s should have an ID", pt.Type) @@ -86,135 +104,208 @@ func TestBuildRunningPlan_ImageDrift_ReturnsNodeUpdatePlan(t *testing.T) { } } -// configIntentFromPlan finds the config-apply task and unmarshals its -// params back into a ConfigIntent for inspection. -func configIntentFromPlan(t *testing.T, plan *seiv1alpha1.TaskPlan) *seiconfig.ConfigIntent { - t.Helper() +// The day-2 config-patch must carry the external-address from +// Spec.ExternalAddress. This is the publishable-P2P propagation contract. +func TestFullPlanner_ConfigPatchCarriesExternalAddress(t *testing.T) { g := NewWithT(t) - for _, pt := range plan.Tasks { - if pt.Type != TaskConfigApply { - continue - } - g.Expect(pt.Params).NotTo(BeNil()) - var intent seiconfig.ConfigIntent - g.Expect(json.Unmarshal(pt.Params.Raw, &intent)).To(Succeed()) - return &intent - } - t.Fatal("plan has no config-apply task") - return nil + node := runningFullNode() + node.Spec.ExternalAddress = testExternalAddrAtl + node.Spec.Image = testImageV2 + + plan, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + patch := configPatchFromPlan(t, plan) + g.Expect(patch.Files).To(HaveKey("config.toml")) + p2p, ok := patch.Files["config.toml"]["p2p"].(map[string]any) + g.Expect(ok).To(BeTrue(), "config.toml.p2p should be a nested map") + g.Expect(p2p).To(HaveKeyWithValue("external-address", testExternalAddrAtl)) } -// The day-2 config-apply must carry the mode planner's full ConfigIntent — -// non-empty Mode, Overrides containing the controller-managed keys — -// otherwise sei-config's ResolveIntent rejects the empty-Mode and -// TaskConfigApply fails at runtime. -func TestBuildNodeUpdatePlan_ConfigApplyHasIntentWithMode(t *testing.T) { +// Two builds of the same spec must produce identical patch payloads +// (plan-construction is deterministic w.r.t. spec). +func TestFullPlanner_ConfigPatchIsDeterministic(t *testing.T) { g := NewWithT(t) node := runningFullNode() - node.Spec.ExternalAddress = "syncer-0-0-p2p.atlantic-2.harbor.platform.sei.io:26656" + node.Spec.ExternalAddress = testExternalAddrAtl node.Spec.Image = testImageV2 - plan, err := buildRunningPlan(node) + plan1, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + plan2, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + + p1 := configPatchFromPlan(t, plan1) + p2 := configPatchFromPlan(t, plan2) + g.Expect(p1.Files).To(Equal(p2.Files), "same spec must produce same patch payload") +} + +// --- archive and replay planners share the full progression --- + +func TestArchivePlanner_ImageDrift_Day2Progression(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + node.Spec.FullNode = nil + node.Spec.Archive = &seiv1alpha1.ArchiveSpec{} + node.Spec.Image = testImageV2 + + plan, err := (&archiveNodePlanner{}).BuildPlan(node) g.Expect(err).NotTo(HaveOccurred()) g.Expect(plan).NotTo(BeNil()) - intent := configIntentFromPlan(t, plan) - g.Expect(intent.Mode).To(Equal(seiconfig.ModeFull), "Mode must be set so ResolveIntent accepts the intent") - g.Expect(intent.Overrides).To(HaveKeyWithValue( - seiconfig.KeyP2PExternalAddress, - "syncer-0-0-p2p.atlantic-2.harbor.platform.sei.io:26656", - ), "commonOverrides should propagate Spec.ExternalAddress") + g.Expect(planTaskTypes(plan)).To(Equal([]string{ + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + TaskConfigPatch, + TaskConfigValidate, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + })) } -// Day-2 must set Incremental=true. The init path uses applyFull which -// regenerates config from mode defaults — running that on a running node -// would wipe persistent-peers, state-sync trust point, and operator-managed -// TOML keys outside the controller's overrides set. Incremental reads -// on-disk config and patches only the keys we own. -func TestBuildNodeUpdatePlan_ConfigApplyIsIncremental(t *testing.T) { +// Validator's day-2 progression prepends the three key-validation gates +// so a missing/malformed secret aborts before any STS mutation. +func TestValidatorPlanner_ImageDrift_PrependsValidationGates(t *testing.T) { g := NewWithT(t) node := runningFullNode() + node.Spec.FullNode = nil + node.Spec.Validator = &seiv1alpha1.ValidatorSpec{} node.Spec.Image = testImageV2 - plan, err := buildRunningPlan(node) + plan, err := (&validatorPlanner{}).BuildPlan(node) g.Expect(err).NotTo(HaveOccurred()) g.Expect(plan).NotTo(BeNil()) - intent := configIntentFromPlan(t, plan) - g.Expect(intent.Incremental).To(BeTrue(), - "day-2 ConfigIntent must be incremental to avoid clobbering on-disk state") + g.Expect(planTaskTypes(plan)).To(Equal([]string{ + task.TaskTypeValidateSigningKey, + task.TaskTypeValidateNodeKey, + task.TaskTypeValidateOperatorKeyring, + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + TaskConfigPatch, + TaskConfigValidate, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + })) +} + +// --- sidecar mark-ready re-apply --- + +func setSidecarReady(node *seiv1alpha1.SeiNode, status metav1.ConditionStatus, reason string) { + meta.SetStatusCondition(&node.Status.Conditions, metav1.Condition{ + Type: seiv1alpha1.ConditionSidecarReady, + Status: status, + Reason: reason, + Message: "test", + }) } -// Init plans must NOT set Incremental — the init path uses applyFull -// (regenerate config from mode defaults + overrides). Pinning this here -// makes accidental flips during future refactors a loud failure. -func TestBuildPlan_InitPlanConfigIntentNotIncremental(t *testing.T) { +func TestFullPlanner_SidecarReady_NoPlan(t *testing.T) { g := NewWithT(t) - node := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{Name: "full-init", Namespace: "default", Generation: 1}, - Spec: seiv1alpha1.SeiNodeSpec{ - ChainID: "atlantic-2", - Image: "sei:v1.0.0", - FullNode: &seiv1alpha1.FullNodeSpec{}, - }, - // No phase — defaults to "" (init path). - } + node := runningFullNode() + setSidecarReady(node, metav1.ConditionTrue, "Ready") + + plan, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).To(BeNil()) +} + +func TestFullPlanner_SidecarNotReady_ReturnsMarkReadyPlan(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + setSidecarReady(node, metav1.ConditionFalse, "NotReady") plan, err := (&fullNodePlanner{}).BuildPlan(node) g.Expect(err).NotTo(HaveOccurred()) g.Expect(plan).NotTo(BeNil()) + g.Expect(plan.Phase).To(Equal(seiv1alpha1.TaskPlanActive)) + g.Expect(plan.TargetPhase).To(Equal(seiv1alpha1.PhaseRunning)) + g.Expect(string(plan.FailedPhase)).To(BeEmpty()) + g.Expect(planTaskTypes(plan)).To(Equal([]string{TaskMarkReady})) +} - intent := configIntentFromPlan(t, plan) - g.Expect(intent.Incremental).To(BeFalse(), - "init ConfigIntent must NOT be incremental — applyFull regenerates from mode defaults") +func TestFullPlanner_SidecarUnknown_NoPlan(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + setSidecarReady(node, metav1.ConditionUnknown, "Unreachable") + + plan, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).To(BeNil(), "Unknown should not trigger a plan — re-probe next tick") } -// Per-mode coverage: the day-2 intent should match the mode planner's -// Mode value. Catches future drift if a new mode lands and isn't wired -// into BuildConfigIntent. -func TestBuildNodeUpdatePlan_ConfigApplyIntentMatchesModePlanner(t *testing.T) { - cases := []struct { - name string - mutate func(*seiv1alpha1.SeiNode) - wantMode seiconfig.NodeMode - }{ - { - name: "full", - mutate: func(_ *seiv1alpha1.SeiNode) {}, - wantMode: seiconfig.ModeFull, - }, - { - name: "archive", - mutate: func(n *seiv1alpha1.SeiNode) { - n.Spec.FullNode = nil - n.Spec.Archive = &seiv1alpha1.ArchiveSpec{} - }, - wantMode: seiconfig.ModeArchive, - }, - { - name: "validator", - mutate: func(n *seiv1alpha1.SeiNode) { - n.Spec.FullNode = nil - n.Spec.Validator = &seiv1alpha1.ValidatorSpec{} - }, - wantMode: seiconfig.ModeValidator, - }, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - g := NewWithT(t) - node := runningFullNode() - tc.mutate(node) - node.Spec.Image = testImageV2 - - plan, err := buildRunningPlan(node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(plan).NotTo(BeNil()) - - intent := configIntentFromPlan(t, plan) - g.Expect(intent.Mode).To(Equal(tc.wantMode)) - g.Expect(intent.Incremental).To(BeTrue()) - }) +func TestFullPlanner_ImageDriftWinsOverSidecar(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + node.Spec.Image = testImageV2 + setSidecarReady(node, metav1.ConditionFalse, "NotReady") + + plan, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + g.Expect(plan.Tasks).To(HaveLen(7), "should be full day-2 plan, not one-task mark-ready") + g.Expect(planTaskTypes(plan)).To(Equal([]string{ + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + TaskConfigPatch, + TaskConfigValidate, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + })) +} + +func TestBuildMarkReadyPlan_FreshIDEveryCall(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + + p1, err := buildMarkReadyPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + p2, err := buildMarkReadyPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + + g.Expect(p1.ID).NotTo(Equal(p2.ID)) + g.Expect(p1.Tasks[0].ID).NotTo(Equal(p2.Tasks[0].ID)) +} + +func TestSidecarNeedsReapproval(t *testing.T) { + g := NewWithT(t) + + node := runningFullNode() + g.Expect(sidecarNeedsReapproval(node)).To(BeFalse()) + + setSidecarReady(node, metav1.ConditionTrue, "Ready") + g.Expect(sidecarNeedsReapproval(node)).To(BeFalse()) + + setSidecarReady(node, metav1.ConditionUnknown, "Unreachable") + g.Expect(sidecarNeedsReapproval(node)).To(BeFalse()) + + setSidecarReady(node, metav1.ConditionFalse, "SomethingElse") + g.Expect(sidecarNeedsReapproval(node)).To(BeFalse()) + + setSidecarReady(node, metav1.ConditionFalse, "NotReady") + g.Expect(sidecarNeedsReapproval(node)).To(BeTrue()) +} + +func TestBuildRunningPlan_UniqueIDs(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + node.Spec.Image = testImageV2 + + plan1, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + plan2, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + + g.Expect(plan1.ID).NotTo(Equal(plan2.ID), "separate plan builds should have unique IDs") + + seen := map[string]bool{} + for _, tsk := range plan1.Tasks { + g.Expect(seen[tsk.ID]).To(BeFalse(), "duplicate task ID: %s", tsk.ID) + seen[tsk.ID] = true } } @@ -223,7 +314,7 @@ func TestBuildNodeUpdatePlan_ConfigApplyIntentMatchesModePlanner(t *testing.T) { func TestResolvePlan_NodeUpdate_SetsCondition(t *testing.T) { g := NewWithT(t) node := runningFullNode() - node.Spec.Image = testImageV2 // drift triggers NodeUpdate plan + node.Spec.Image = testImageV2 err := (&NodeResolver{}).ResolvePlan(t.Context(), node) g.Expect(err).NotTo(HaveOccurred()) @@ -241,22 +332,17 @@ func TestResolvePlan_CompletedPlan_ClearsCondition(t *testing.T) { node := runningFullNode() node.Spec.Image = testImageV2 - // Build a NodeUpdate plan and simulate completion. err := (&NodeResolver{}).ResolvePlan(t.Context(), node) g.Expect(err).NotTo(HaveOccurred()) g.Expect(node.Status.Plan).NotTo(BeNil()) - // Verify the condition was set. cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) g.Expect(cond).NotTo(BeNil()) g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) - // Mark the plan completed. node.Status.Plan.Phase = seiv1alpha1.TaskPlanComplete - // Also converge currentImage so a new plan is not built. node.Status.CurrentImage = testImageV2 - // ResolvePlan should clear the completed plan and the condition. err = (&NodeResolver{}).ResolvePlan(t.Context(), node) g.Expect(err).NotTo(HaveOccurred()) @@ -272,7 +358,6 @@ func TestResolvePlan_FailedPlan_ClearsCondition(t *testing.T) { node := runningFullNode() node.Spec.Image = testImageV2 - // Build a NodeUpdate plan and simulate failure. err := (&NodeResolver{}).ResolvePlan(t.Context(), node) g.Expect(err).NotTo(HaveOccurred()) g.Expect(node.Status.Plan).NotTo(BeNil()) @@ -286,13 +371,9 @@ func TestResolvePlan_FailedPlan_ClearsCondition(t *testing.T) { Error: "apply error", } - // ResolvePlan should clear the failed plan. Since drift still exists, - // it immediately builds a new NodeUpdate plan and sets the condition - // back to True. This is correct — automatic retry on failure. err = (&NodeResolver{}).ResolvePlan(t.Context(), node) g.Expect(err).NotTo(HaveOccurred()) - // A new plan was built because drift still exists. g.Expect(node.Status.Plan).NotTo(BeNil(), "new plan should be built because drift persists") g.Expect(node.Status.Plan.Phase).To(Equal(seiv1alpha1.TaskPlanActive)) @@ -304,7 +385,7 @@ func TestResolvePlan_FailedPlan_ClearsCondition(t *testing.T) { func TestResolvePlan_ResumesActivePlan(t *testing.T) { g := NewWithT(t) node := runningFullNode() - node.Spec.Image = testImageV2 // drift exists + node.Spec.Image = testImageV2 existingPlan := &seiv1alpha1.TaskPlan{ ID: "existing-plan-123", @@ -321,19 +402,15 @@ func TestResolvePlan_ResumesActivePlan(t *testing.T) { "active plan should be resumed, not replaced") } -// --- Additional edge cases --- - func TestResolvePlan_CompletedNonUpdatePlan_DoesNotClearCondition(t *testing.T) { g := NewWithT(t) node := runningFullNode() - // Manually set the condition (as if a previous NodeUpdate plan set it). meta.SetStatusCondition(&node.Status.Conditions, metav1.Condition{ Type: seiv1alpha1.ConditionNodeUpdateInProgress, Status: metav1.ConditionFalse, Reason: "UpdateComplete", }) - // Create a completed plan without observe-image (not a NodeUpdate plan). node.Status.Plan = &seiv1alpha1.TaskPlan{ ID: "non-update-plan", Phase: seiv1alpha1.TaskPlanComplete, @@ -345,7 +422,6 @@ func TestResolvePlan_CompletedNonUpdatePlan_DoesNotClearCondition(t *testing.T) err := (&NodeResolver{}).ResolvePlan(t.Context(), node) g.Expect(err).NotTo(HaveOccurred()) - // The condition should remain unchanged (already False from before). cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) g.Expect(cond).NotTo(BeNil()) g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) @@ -353,26 +429,7 @@ func TestResolvePlan_CompletedNonUpdatePlan_DoesNotClearCondition(t *testing.T) "reason should not be overwritten by a non-update plan completion") } -func TestBuildRunningPlan_UniqueIDs(t *testing.T) { - g := NewWithT(t) - node := runningFullNode() - node.Spec.Image = testImageV2 - - plan1, err := buildRunningPlan(node) - g.Expect(err).NotTo(HaveOccurred()) - - plan2, err := buildRunningPlan(node) - g.Expect(err).NotTo(HaveOccurred()) - - g.Expect(plan1.ID).NotTo(Equal(plan2.ID), "separate plan builds should have unique IDs") - - // Task IDs within a single plan should all be unique. - seen := map[string]bool{} - for _, tsk := range plan1.Tasks { - g.Expect(seen[tsk.ID]).To(BeFalse(), "duplicate task ID: %s", tsk.ID) - seen[tsk.ID] = true - } -} +// --- handleTerminalPlan tests (unchanged) --- func TestHandleTerminalPlan_CompletedWithUpdateCondition(t *testing.T) { g := NewWithT(t) @@ -427,7 +484,6 @@ func TestHandleTerminalPlan_NilPlan(t *testing.T) { g := NewWithT(t) node := runningFullNode() node.Status.Plan = nil - // Should be a no-op — no panic. handleTerminalPlan(t.Context(), node) g.Expect(node.Status.Plan).To(BeNil()) } @@ -461,107 +517,3 @@ func TestPlanFailureMessage_NoDetail(t *testing.T) { plan := &seiv1alpha1.TaskPlan{} g.Expect(planFailureMessage(plan)).To(Equal("unknown")) } - -// --- sidecar mark-ready re-apply tests --- - -func setSidecarReady(node *seiv1alpha1.SeiNode, status metav1.ConditionStatus, reason string) { - meta.SetStatusCondition(&node.Status.Conditions, metav1.Condition{ - Type: seiv1alpha1.ConditionSidecarReady, - Status: status, - Reason: reason, - Message: "test", - }) -} - -func TestBuildRunningPlan_SidecarReady_NoPlan(t *testing.T) { - g := NewWithT(t) - node := runningFullNode() - setSidecarReady(node, metav1.ConditionTrue, "Ready") - - plan, err := buildRunningPlan(node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(plan).To(BeNil()) -} - -func TestBuildRunningPlan_SidecarNotReady_ReturnsMarkReadyPlan(t *testing.T) { - g := NewWithT(t) - node := runningFullNode() - setSidecarReady(node, metav1.ConditionFalse, "NotReady") - - plan, err := buildRunningPlan(node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(plan).NotTo(BeNil()) - g.Expect(plan.Phase).To(Equal(seiv1alpha1.TaskPlanActive)) - g.Expect(plan.TargetPhase).To(Equal(seiv1alpha1.PhaseRunning)) - g.Expect(string(plan.FailedPhase)).To(BeEmpty()) - g.Expect(planTaskTypes(plan)).To(Equal([]string{TaskMarkReady})) -} - -func TestBuildRunningPlan_SidecarUnknown_NoPlan(t *testing.T) { - g := NewWithT(t) - node := runningFullNode() - setSidecarReady(node, metav1.ConditionUnknown, "Unreachable") - - plan, err := buildRunningPlan(node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(plan).To(BeNil(), "Unknown should not trigger a plan — re-probe next tick") -} - -func TestBuildRunningPlan_ImageDriftWinsOverSidecar(t *testing.T) { - g := NewWithT(t) - node := runningFullNode() - node.Spec.Image = testImageV2 // image drift - setSidecarReady(node, metav1.ConditionFalse, "NotReady") - - plan, err := buildRunningPlan(node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(plan).NotTo(BeNil()) - // Image update plan ends with MarkReady, which also resolves the sidecar. - g.Expect(plan.Tasks).To(HaveLen(7), "should be full node-update plan, not one-task mark-ready") - g.Expect(planTaskTypes(plan)).To(Equal([]string{ - task.TaskTypeApplyStatefulSet, - task.TaskTypeApplyService, - TaskConfigApply, - TaskConfigValidate, - task.TaskTypeReplacePod, - task.TaskTypeObserveImage, - TaskMarkReady, - })) -} - -func TestBuildMarkReadyPlan_FreshIDEveryCall(t *testing.T) { - g := NewWithT(t) - node := runningFullNode() - - p1, err := buildMarkReadyPlan(node) - g.Expect(err).NotTo(HaveOccurred()) - p2, err := buildMarkReadyPlan(node) - g.Expect(err).NotTo(HaveOccurred()) - - g.Expect(p1.ID).NotTo(Equal(p2.ID)) - g.Expect(p1.Tasks[0].ID).NotTo(Equal(p2.Tasks[0].ID)) -} - -func TestSidecarNeedsReapproval(t *testing.T) { - g := NewWithT(t) - - // missing condition - node := runningFullNode() - g.Expect(sidecarNeedsReapproval(node)).To(BeFalse()) - - // True - setSidecarReady(node, metav1.ConditionTrue, "Ready") - g.Expect(sidecarNeedsReapproval(node)).To(BeFalse()) - - // Unknown - setSidecarReady(node, metav1.ConditionUnknown, "Unreachable") - g.Expect(sidecarNeedsReapproval(node)).To(BeFalse()) - - // False + wrong reason - setSidecarReady(node, metav1.ConditionFalse, "SomethingElse") - g.Expect(sidecarNeedsReapproval(node)).To(BeFalse()) - - // False + NotReady - setSidecarReady(node, metav1.ConditionFalse, "NotReady") - g.Expect(sidecarNeedsReapproval(node)).To(BeTrue()) -} diff --git a/internal/planner/planner.go b/internal/planner/planner.go index 3831a168..c70071fe 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -32,6 +32,7 @@ const ( TaskConfigureGenesis = sidecar.TaskTypeConfigureGenesis TaskConfigureStateSync = sidecar.TaskTypeConfigureStateSync TaskConfigApply = sidecar.TaskTypeConfigApply + TaskConfigPatch = sidecar.TaskTypeConfigPatch TaskConfigValidate = sidecar.TaskTypeConfigValidate TaskMarkReady = sidecar.TaskTypeMarkReady TaskSnapshotUpload = sidecar.TaskTypeSnapshotUpload @@ -58,18 +59,19 @@ var baseProgression = map[string][]string{ } // NodePlanner encapsulates mode-specific logic for validating a SeiNode -// and building its initialization task plan with fully embedded params. +// and building its task plan. BuildPlan dispatches internally between +// startup (init) and existing-resource (day-2) shapes — callers don't +// see the distinction and the interface stays narrow. +// +// Rule of thumb for implementations: init plans write the whole config +// file via TaskConfigApply (sei-config mode-defaulted resolution). +// Day-2 plans patch only the keys the controller directly owns via +// TaskConfigPatch (generic TOML merge, no sei-config involvement). +// Never mix the two in one plan. type NodePlanner interface { Validate(node *seiv1alpha1.SeiNode) error BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) Mode() string - - // BuildConfigIntent returns the ConfigIntent the mode's plan would feed - // to TaskConfigApply (mode + merged controller / spec overrides). Shared - // by BuildPlan (init) and buildNodeUpdatePlan (day-2) so they emit - // identical config payloads for the same spec. Day-2 callers set - // Incremental on the returned value before passing it through. - BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) } // GroupPlanner encapsulates logic for building a group-level task plan. @@ -709,24 +711,17 @@ func commonOverrides(node *seiv1alpha1.SeiNode) map[string]string { return out } -// buildRunningPlan returns a steady-state drift plan, or nil if no drift. -// Image drift is checked first — its plan ends with MarkReady, so it also -// resolves any stale sidecar. -func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { - if node.Spec.Image != node.Status.CurrentImage { - return buildNodeUpdatePlan(node) - } - if sidecarNeedsReapproval(node) { - return buildMarkReadyPlan(node) - } - return nil, nil -} - +// sidecarNeedsReapproval reports whether the sidecar has been observed +// to have lost readiness. Mode-agnostic — any mode planner that's in +// the running phase checks this and routes to buildMarkReadyPlan. func sidecarNeedsReapproval(node *seiv1alpha1.SeiNode) bool { cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionSidecarReady) return cond != nil && cond.Status == metav1.ConditionFalse && cond.Reason == "NotReady" } +// buildMarkReadyPlan is the single-task plan used to re-mark sidecar +// readiness. Mode-agnostic — kept as a free helper so each mode planner +// can call it directly from its running-phase branch. func buildMarkReadyPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { planID := uuid.New().String() t, err := buildPlannedTask(planID, sidecar.TaskTypeMarkReady, 0, paramsForTaskType(node, sidecar.TaskTypeMarkReady, nil, nil)) @@ -741,54 +736,48 @@ func buildMarkReadyPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error }, nil } -// buildNodeUpdatePlan constructs a plan to roll out an image update on a -// Running node. The plan applies the new StatefulSet spec, re-applies -// config.toml (incremental — only the keys the controller owns), then -// cycles the pod and re-initializes the sidecar. +// imageDrifted reports whether the running pod's currently-observed +// image diverges from the spec'd image. Mode-agnostic check; the +// returned task list is mode-specific (each planner assembles its own). +func imageDrifted(node *seiv1alpha1.SeiNode) bool { + return node.Spec.Image != node.Status.CurrentImage +} + +// externalAddressPatch returns the day-2 config.toml patch that stamps +// the publishable P2P external address. Empty Spec.ExternalAddress means +// the SND is not opted into publishable P2P; emit an empty-string patch +// so opt-out reaches the pod symmetrically with opt-in. +func externalAddressPatch(node *seiv1alpha1.SeiNode) map[string]map[string]any { + return map[string]map[string]any{ + "config.toml": { + "p2p": map[string]any{ + "external-address": node.Spec.ExternalAddress, + }, + }, + } +} + +// assembleDay2Plan composes a day-2 task progression into a TaskPlan, +// shared by every mode planner so the boilerplate (planID, ordering, +// param marshaling, phase fields) lives in one place. The progression +// slice IS the per-mode authorship — each planner decides which tasks +// it needs (e.g. validator prepends key-validation gates). +// +// patch carries the TOML fragments the controller wants stamped into +// named files for TaskConfigPatch; pass nil if the planner's progression +// doesn't include a config-patch step (none today). // // FailedPhase is deliberately empty: a failure retries on the next reconcile // rather than transitioning the node out of Running. -// -// TaskConfigApply is placed before ReplacePod so the old pod's sidecar -// writes the new config.toml to the PVC; the new pod then mounts the -// PVC and reads the fresh file at seid start. external_address has no -// hot-reload path in seid today (see seictl config_reload.go), so the -// pod restart is the trigger that makes new values take effect. -// -// Incremental=true is mandatory on the day-2 intent. The init path uses -// the non-incremental applyFull, which regenerates config from mode -// defaults + overrides; running that on day 2 would wipe persistent-peers, -// state-sync trust point, and any operator-managed TOML keys that the -// controller doesn't track in overrides. applyIncremental reads on-disk -// config and patches only the overrides we own. -func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { +func assembleDay2Plan(node *seiv1alpha1.SeiNode, prog []string, patch map[string]map[string]any) (*seiv1alpha1.TaskPlan, error) { setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", fmt.Sprintf("image drift detected: spec=%s current=%s", node.Spec.Image, node.Status.CurrentImage)) - mode, err := plannerForMode(node) - if err != nil { - return nil, fmt.Errorf("resolving mode planner: %w", err) - } - intent, err := mode.BuildConfigIntent(node) - if err != nil { - return nil, fmt.Errorf("building config intent: %w", err) - } - intent.Incremental = true - - prog := []string{ - task.TaskTypeApplyStatefulSet, - task.TaskTypeApplyService, - TaskConfigApply, - TaskConfigValidate, - task.TaskTypeReplacePod, - task.TaskTypeObserveImage, - sidecar.TaskTypeMarkReady, - } - planID := uuid.New().String() tasks := make([]seiv1alpha1.PlannedTask, len(prog)) for i, taskType := range prog { - t, err := buildPlannedTask(planID, taskType, i, paramsForTaskType(node, taskType, nil, intent)) + params := paramsForDay2Task(node, taskType, patch) + t, err := buildPlannedTask(planID, taskType, i, params) if err != nil { return nil, err } @@ -802,6 +791,17 @@ func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, erro }, nil } +// paramsForDay2Task is a thin wrapper around paramsForTaskType that +// returns the config-patch params when the task type is TaskConfigPatch. +// All other task types delegate to paramsForTaskType with nil intent — +// day-2 plans never carry a ConfigIntent. +func paramsForDay2Task(node *seiv1alpha1.SeiNode, taskType string, patch map[string]map[string]any) any { + if taskType == TaskConfigPatch { + return task.ConfigPatchTask{Files: patch} + } + return paramsForTaskType(node, taskType, nil, nil) +} + // mergeOverrides combines controller-generated overrides with user-specified // overrides. User overrides take precedence. func mergeOverrides(controllerOverrides, userOverrides map[string]string) map[string]string { diff --git a/internal/planner/replay.go b/internal/planner/replay.go index 8adb5bae..ad178675 100644 --- a/internal/planner/replay.go +++ b/internal/planner/replay.go @@ -6,6 +6,7 @@ import ( seiconfig "github.com/sei-protocol/sei-config" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/task" ) type replayerPlanner struct { @@ -35,26 +36,37 @@ func (p *replayerPlanner) Validate(node *seiv1alpha1.SeiNode) error { func (p *replayerPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if node.Status.Phase == seiv1alpha1.PhaseRunning { - return buildRunningPlan(node) + return p.buildRunningPlan(node) } - params, err := p.BuildConfigIntent(node) - if err != nil { - return nil, err + intent := &seiconfig.ConfigIntent{ + Mode: seiconfig.ModeFull, + Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides()), node.Spec.Overrides), } if NeedsBootstrap(node) { - return buildBootstrapPlan(node, node.Spec.Peers, &node.Spec.Replayer.Snapshot, params) + return buildBootstrapPlan(node, node.Spec.Peers, &node.Spec.Replayer.Snapshot, intent) } - return buildBasePlan(node, node.Spec.Peers, &node.Spec.Replayer.Snapshot, params) + return buildBasePlan(node, node.Spec.Peers, &node.Spec.Replayer.Snapshot, intent) } -func (p *replayerPlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { - if node.Spec.Replayer == nil { - return nil, fmt.Errorf("replayer sub-spec is nil") +// buildRunningPlan returns the day-2 plan for a Running replayer node. +// Same shape as full/archive — see full.go's buildRunningPlan. +func (p *replayerPlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { + if imageDrifted(node) { + prog := []string{ + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + TaskConfigPatch, + TaskConfigValidate, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + } + return assembleDay2Plan(node, prog, externalAddressPatch(node)) } - return &seiconfig.ConfigIntent{ - Mode: seiconfig.ModeFull, - Overrides: mergeOverrides(mergeOverrides(commonOverrides(node), p.controllerOverrides()), node.Spec.Overrides), - }, nil + if sidecarNeedsReapproval(node) { + return buildMarkReadyPlan(node) + } + return nil, nil } func (p *replayerPlanner) controllerOverrides() map[string]string { diff --git a/internal/planner/validator.go b/internal/planner/validator.go index e5b2cb0b..e0246121 100644 --- a/internal/planner/validator.go +++ b/internal/planner/validator.go @@ -6,6 +6,7 @@ import ( seiconfig "github.com/sei-protocol/sei-config" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/task" ) type validatorPlanner struct { @@ -94,28 +95,47 @@ func validateOperatorKeyringDistinctness(v *seiv1alpha1.ValidatorSpec) error { func (p *validatorPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if node.Status.Phase == seiv1alpha1.PhaseRunning { - return buildRunningPlan(node) + return p.buildRunningPlan(node) } if isGenesisCeremonyNode(node) { return buildGenesisPlan(node) } v := node.Spec.Validator - params, err := p.BuildConfigIntent(node) - if err != nil { - return nil, err + intent := &seiconfig.ConfigIntent{ + Mode: seiconfig.ModeValidator, + Overrides: mergeOverrides(commonOverrides(node), node.Spec.Overrides), } if NeedsBootstrap(node) { - return buildBootstrapPlan(node, node.Spec.Peers, v.Snapshot, params) + return buildBootstrapPlan(node, node.Spec.Peers, v.Snapshot, intent) } - return buildBasePlan(node, node.Spec.Peers, v.Snapshot, params) + return buildBasePlan(node, node.Spec.Peers, v.Snapshot, intent) } -func (p *validatorPlanner) BuildConfigIntent(node *seiv1alpha1.SeiNode) (*seiconfig.ConfigIntent, error) { - if node.Spec.Validator == nil { - return nil, fmt.Errorf("validator sub-spec is nil") +// buildRunningPlan returns the day-2 plan for a Running validator. The +// progression prepends ValidateSigningKey / ValidateNodeKey / +// ValidateOperatorKeyring before any STS or pod mutation: a missing or +// malformed key secret must abort *before* we touch the StatefulSet +// (which would otherwise schedule a pod that fails its kubelet volume +// mount with no controller-side error context). Mirrors the same +// readiness gates the validator's init plan applies in buildBasePlan. +func (p *validatorPlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { + if imageDrifted(node) { + prog := []string{ + task.TaskTypeValidateSigningKey, + task.TaskTypeValidateNodeKey, + task.TaskTypeValidateOperatorKeyring, + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + TaskConfigPatch, + TaskConfigValidate, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + } + return assembleDay2Plan(node, prog, externalAddressPatch(node)) } - return &seiconfig.ConfigIntent{ - Mode: seiconfig.ModeValidator, - Overrides: mergeOverrides(commonOverrides(node), node.Spec.Overrides), - }, nil + if sidecarNeedsReapproval(node) { + return buildMarkReadyPlan(node) + } + return nil, nil } diff --git a/internal/task/config_patch.go b/internal/task/config_patch.go new file mode 100644 index 00000000..9587f6a9 --- /dev/null +++ b/internal/task/config_patch.go @@ -0,0 +1,28 @@ +package task + +import ( + sidecar "github.com/sei-protocol/seictl/sidecar/client" +) + +// ConfigPatchTask satisfies sidecar.TaskBuilder for config-patch. The +// controller stamps a subset of TOML keys it directly owns into named +// files; the sidecar handler is a generic merge-and-write per file with +// no sei-config involvement. Used for day-2 reconvergence where the +// controller wants surgical authority over operational fields without +// re-running the typed mode-defaulted resolver. +// +// JSON tag matches the wire format sidecar.ConfigPatchTask serializes +// (sidecar handler expects {"files": {...}}). +type ConfigPatchTask struct { + Files map[string]map[string]any `json:"files"` +} + +func (t ConfigPatchTask) TaskType() string { return sidecar.TaskTypeConfigPatch } + +func (t ConfigPatchTask) Validate() error { + return sidecar.ConfigPatchTask{Files: t.Files}.Validate() +} + +func (t ConfigPatchTask) ToTaskRequest() sidecar.TaskRequest { + return sidecar.ConfigPatchTask{Files: t.Files}.ToTaskRequest() +} diff --git a/internal/task/task.go b/internal/task/task.go index 5705f7d3..90f02e61 100644 --- a/internal/task/task.go +++ b/internal/task/task.go @@ -204,6 +204,7 @@ var registry = map[string]taskDeserializer{ sidecar.TaskTypeConfigureStateSync: sidecarTask[sidecar.ConfigureStateSyncTask](false), sidecar.TaskTypeAwaitCondition: sidecarTask[sidecar.AwaitConditionTask](false), sidecar.TaskTypeConfigApply: sidecarTask[configApplyTask](false), + sidecar.TaskTypeConfigPatch: sidecarTask[ConfigPatchTask](false), sidecar.TaskTypeConfigValidate: sidecarTask[sidecar.ConfigValidateTask](true), sidecar.TaskTypeConfigureGenesis: sidecarTask[sidecar.ConfigureGenesisTask](false), sidecar.TaskTypeDiscoverPeers: sidecarTask[sidecar.DiscoverPeersTask](false), From 4b91ac611177da3859df744c9af2fdbccc089e23 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Sun, 31 May 2026 12:45:16 -0700 Subject: [PATCH 4/7] chore(planner): drop "day 2" terminology + comment sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR review: - "day-2" / "day 2" is unconventional terminology — rename to "update" / "running" consistent with the existing NodeUpdateInProgress condition and setNodeUpdateCondition vocabulary. - assembleDay2Plan → assembleUpdatePlan; paramsForDay2Task → paramsForUpdateTask. - Test names + doc references updated to match. Comment sweep — trim narration, keep the load-bearing WHY: - NodePlanner interface doc 10 → 6 lines. - fullNodePlanner.buildRunningPlan doc 11 → 3 lines (keeps the non-obvious go-toml comment-erasure note). - archive/replay buildRunningPlan docs trimmed to 2 lines each. - validatorPlanner.buildRunningPlan doc 7 → 4 lines (keeps the load-bearing kubelet-mount-failure rationale). - ConfigPatchTask doc 8 → 4 lines. - Free helpers in planner.go (sidecarNeedsReapproval, buildMarkReadyPlan, imageDrifted, externalAddressPatch) trimmed. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/planner/archive.go | 7 ++-- internal/planner/full.go | 20 ++-------- internal/planner/node_update_test.go | 12 +++--- internal/planner/planner.go | 57 ++++++++++------------------ internal/planner/replay.go | 6 +-- internal/planner/validator.go | 13 +++---- internal/task/config_patch.go | 13 ++----- 7 files changed, 45 insertions(+), 83 deletions(-) diff --git a/internal/planner/archive.go b/internal/planner/archive.go index 96b35b3a..3fc35738 100644 --- a/internal/planner/archive.go +++ b/internal/planner/archive.go @@ -36,9 +36,8 @@ func (p *archiveNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1. return buildBasePlan(node, node.Spec.Peers, nil, intent) } -// buildRunningPlan returns the day-2 plan for a Running archive node. -// Same shape as full nodes (no extra validation gates) — see full.go's -// buildRunningPlan for the rationale. +// buildRunningPlan returns the update plan for a Running archive node. +// Same shape as full nodes (no extra validation gates). func (p *archiveNodePlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if imageDrifted(node) { prog := []string{ @@ -50,7 +49,7 @@ func (p *archiveNodePlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1 task.TaskTypeObserveImage, TaskMarkReady, } - return assembleDay2Plan(node, prog, externalAddressPatch(node)) + return assembleUpdatePlan(node, prog, externalAddressPatch(node)) } if sidecarNeedsReapproval(node) { return buildMarkReadyPlan(node) diff --git a/internal/planner/full.go b/internal/planner/full.go index df6a48a1..cbb1cd69 100644 --- a/internal/planner/full.go +++ b/internal/planner/full.go @@ -29,9 +29,6 @@ func (p *fullNodePlanner) Validate(node *seiv1alpha1.SeiNode) error { return nil } -// BuildPlan dispatches between startup (init/bootstrap) and existing-resource -// (day-2) shapes. Reconciler/executor just call this and get a plan; the -// startup-vs-day-2 distinction stays inside the planner. func (p *fullNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if node.Status.Phase == seiv1alpha1.PhaseRunning { return p.buildRunningPlan(node) @@ -47,18 +44,9 @@ func (p *fullNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas return buildBasePlan(node, node.Spec.Peers, fn.Snapshot, intent) } -// buildRunningPlan returns the day-2 plan for a Running full node, or -// nil if no drift. Image drift queues a config-patch + pod-cycle plan; -// sidecar reapproval queues a one-task mark-ready plan. -// -// The day-2 patch stamps only the keys the controller directly owns -// (currently p2p.external-address for publishable P2P). TaskConfigPatch -// is a generic TOML merge — no sei-config involvement, no forced overrides, -// no mode-defaulted backfill. TaskConfigValidate after the patch is the -// parser-level gate. -// -// Note: pelletier/go-toml/v2 does not preserve comments or key ordering -// on re-encode, so the first day-2 patch erases operator-added comments. +// buildRunningPlan returns the update plan for a Running full node, or +// nil if no drift. pelletier/go-toml/v2 does not preserve comments on +// re-encode — the first config-patch erases operator-added comments. func (p *fullNodePlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if imageDrifted(node) { prog := []string{ @@ -70,7 +58,7 @@ func (p *fullNodePlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alp task.TaskTypeObserveImage, TaskMarkReady, } - return assembleDay2Plan(node, prog, externalAddressPatch(node)) + return assembleUpdatePlan(node, prog, externalAddressPatch(node)) } if sidecarNeedsReapproval(node) { return buildMarkReadyPlan(node) diff --git a/internal/planner/node_update_test.go b/internal/planner/node_update_test.go index 254c7ff1..9d83bc1c 100644 --- a/internal/planner/node_update_test.go +++ b/internal/planner/node_update_test.go @@ -71,7 +71,7 @@ func TestFullPlanner_NoDrift_ReturnsNil(t *testing.T) { g.Expect(plan).To(BeNil(), "no plan should be built when there is no image drift") } -func TestFullPlanner_ImageDrift_Day2Progression(t *testing.T) { +func TestFullPlanner_ImageDrift_UpdateProgression(t *testing.T) { g := NewWithT(t) node := runningFullNode() node.Spec.Image = testImageV2 @@ -104,8 +104,8 @@ func TestFullPlanner_ImageDrift_Day2Progression(t *testing.T) { } } -// The day-2 config-patch must carry the external-address from -// Spec.ExternalAddress. This is the publishable-P2P propagation contract. +// The update plan's config-patch must carry the external-address from +// Spec.ExternalAddress — the publishable-P2P propagation contract. func TestFullPlanner_ConfigPatchCarriesExternalAddress(t *testing.T) { g := NewWithT(t) node := runningFullNode() @@ -143,7 +143,7 @@ func TestFullPlanner_ConfigPatchIsDeterministic(t *testing.T) { // --- archive and replay planners share the full progression --- -func TestArchivePlanner_ImageDrift_Day2Progression(t *testing.T) { +func TestArchivePlanner_ImageDrift_UpdateProgression(t *testing.T) { g := NewWithT(t) node := runningFullNode() node.Spec.FullNode = nil @@ -165,7 +165,7 @@ func TestArchivePlanner_ImageDrift_Day2Progression(t *testing.T) { })) } -// Validator's day-2 progression prepends the three key-validation gates +// Validator's update progression prepends the three key-validation gates // so a missing/malformed secret aborts before any STS mutation. func TestValidatorPlanner_ImageDrift_PrependsValidationGates(t *testing.T) { g := NewWithT(t) @@ -246,7 +246,7 @@ func TestFullPlanner_ImageDriftWinsOverSidecar(t *testing.T) { plan, err := (&fullNodePlanner{}).BuildPlan(node) g.Expect(err).NotTo(HaveOccurred()) g.Expect(plan).NotTo(BeNil()) - g.Expect(plan.Tasks).To(HaveLen(7), "should be full day-2 plan, not one-task mark-ready") + g.Expect(plan.Tasks).To(HaveLen(7), "should be full update plan, not one-task mark-ready") g.Expect(planTaskTypes(plan)).To(Equal([]string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, diff --git a/internal/planner/planner.go b/internal/planner/planner.go index c70071fe..756120c5 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -58,16 +58,12 @@ var baseProgression = map[string][]string{ "genesis": {TaskConfigApply, TaskConfigValidate, TaskMarkReady}, } -// NodePlanner encapsulates mode-specific logic for validating a SeiNode -// and building its task plan. BuildPlan dispatches internally between -// startup (init) and existing-resource (day-2) shapes — callers don't -// see the distinction and the interface stays narrow. +// NodePlanner encapsulates mode-specific plan construction. BuildPlan +// dispatches internally between startup (init) and an existing +// running resource — callers don't see the distinction. // -// Rule of thumb for implementations: init plans write the whole config -// file via TaskConfigApply (sei-config mode-defaulted resolution). -// Day-2 plans patch only the keys the controller directly owns via -// TaskConfigPatch (generic TOML merge, no sei-config involvement). -// Never mix the two in one plan. +// Convention: init writes the whole config via TaskConfigApply; an +// existing resource patches only controller-owned keys via TaskConfigPatch. type NodePlanner interface { Validate(node *seiv1alpha1.SeiNode) error BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) @@ -712,16 +708,13 @@ func commonOverrides(node *seiv1alpha1.SeiNode) map[string]string { } // sidecarNeedsReapproval reports whether the sidecar has been observed -// to have lost readiness. Mode-agnostic — any mode planner that's in -// the running phase checks this and routes to buildMarkReadyPlan. +// to have lost readiness. Mode-agnostic. func sidecarNeedsReapproval(node *seiv1alpha1.SeiNode) bool { cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionSidecarReady) return cond != nil && cond.Status == metav1.ConditionFalse && cond.Reason == "NotReady" } -// buildMarkReadyPlan is the single-task plan used to re-mark sidecar -// readiness. Mode-agnostic — kept as a free helper so each mode planner -// can call it directly from its running-phase branch. +// buildMarkReadyPlan is the single-task plan that re-marks sidecar readiness. func buildMarkReadyPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { planID := uuid.New().String() t, err := buildPlannedTask(planID, sidecar.TaskTypeMarkReady, 0, paramsForTaskType(node, sidecar.TaskTypeMarkReady, nil, nil)) @@ -736,17 +729,13 @@ func buildMarkReadyPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error }, nil } -// imageDrifted reports whether the running pod's currently-observed -// image diverges from the spec'd image. Mode-agnostic check; the -// returned task list is mode-specific (each planner assembles its own). func imageDrifted(node *seiv1alpha1.SeiNode) bool { return node.Spec.Image != node.Status.CurrentImage } -// externalAddressPatch returns the day-2 config.toml patch that stamps -// the publishable P2P external address. Empty Spec.ExternalAddress means -// the SND is not opted into publishable P2P; emit an empty-string patch -// so opt-out reaches the pod symmetrically with opt-in. +// externalAddressPatch is the config.toml patch that stamps the +// publishable-P2P external address. An empty Spec.ExternalAddress +// stamps an empty value so opt-out reaches the pod symmetrically. func externalAddressPatch(node *seiv1alpha1.SeiNode) map[string]map[string]any { return map[string]map[string]any{ "config.toml": { @@ -757,26 +746,21 @@ func externalAddressPatch(node *seiv1alpha1.SeiNode) map[string]map[string]any { } } -// assembleDay2Plan composes a day-2 task progression into a TaskPlan, -// shared by every mode planner so the boilerplate (planID, ordering, -// param marshaling, phase fields) lives in one place. The progression -// slice IS the per-mode authorship — each planner decides which tasks -// it needs (e.g. validator prepends key-validation gates). -// -// patch carries the TOML fragments the controller wants stamped into -// named files for TaskConfigPatch; pass nil if the planner's progression -// doesn't include a config-patch step (none today). +// assembleUpdatePlan composes a task progression for a Running node into +// a TaskPlan. The progression slice is the per-mode authorship — each +// planner decides which tasks it needs (e.g. validator prepends +// key-validation gates). // // FailedPhase is deliberately empty: a failure retries on the next reconcile // rather than transitioning the node out of Running. -func assembleDay2Plan(node *seiv1alpha1.SeiNode, prog []string, patch map[string]map[string]any) (*seiv1alpha1.TaskPlan, error) { +func assembleUpdatePlan(node *seiv1alpha1.SeiNode, prog []string, patch map[string]map[string]any) (*seiv1alpha1.TaskPlan, error) { setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", fmt.Sprintf("image drift detected: spec=%s current=%s", node.Spec.Image, node.Status.CurrentImage)) planID := uuid.New().String() tasks := make([]seiv1alpha1.PlannedTask, len(prog)) for i, taskType := range prog { - params := paramsForDay2Task(node, taskType, patch) + params := paramsForUpdateTask(node, taskType, patch) t, err := buildPlannedTask(planID, taskType, i, params) if err != nil { return nil, err @@ -791,11 +775,10 @@ func assembleDay2Plan(node *seiv1alpha1.SeiNode, prog []string, patch map[string }, nil } -// paramsForDay2Task is a thin wrapper around paramsForTaskType that -// returns the config-patch params when the task type is TaskConfigPatch. -// All other task types delegate to paramsForTaskType with nil intent — -// day-2 plans never carry a ConfigIntent. -func paramsForDay2Task(node *seiv1alpha1.SeiNode, taskType string, patch map[string]map[string]any) any { +// paramsForUpdateTask returns the config-patch params for TaskConfigPatch +// and delegates everything else to paramsForTaskType with nil intent — +// update plans on a Running node never carry a ConfigIntent. +func paramsForUpdateTask(node *seiv1alpha1.SeiNode, taskType string, patch map[string]map[string]any) any { if taskType == TaskConfigPatch { return task.ConfigPatchTask{Files: patch} } diff --git a/internal/planner/replay.go b/internal/planner/replay.go index ad178675..91846fac 100644 --- a/internal/planner/replay.go +++ b/internal/planner/replay.go @@ -48,8 +48,8 @@ func (p *replayerPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas return buildBasePlan(node, node.Spec.Peers, &node.Spec.Replayer.Snapshot, intent) } -// buildRunningPlan returns the day-2 plan for a Running replayer node. -// Same shape as full/archive — see full.go's buildRunningPlan. +// buildRunningPlan returns the update plan for a Running replayer node. +// Same shape as full and archive. func (p *replayerPlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if imageDrifted(node) { prog := []string{ @@ -61,7 +61,7 @@ func (p *replayerPlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alp task.TaskTypeObserveImage, TaskMarkReady, } - return assembleDay2Plan(node, prog, externalAddressPatch(node)) + return assembleUpdatePlan(node, prog, externalAddressPatch(node)) } if sidecarNeedsReapproval(node) { return buildMarkReadyPlan(node) diff --git a/internal/planner/validator.go b/internal/planner/validator.go index e0246121..cc9cf97f 100644 --- a/internal/planner/validator.go +++ b/internal/planner/validator.go @@ -111,13 +111,10 @@ func (p *validatorPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Ta return buildBasePlan(node, node.Spec.Peers, v.Snapshot, intent) } -// buildRunningPlan returns the day-2 plan for a Running validator. The -// progression prepends ValidateSigningKey / ValidateNodeKey / -// ValidateOperatorKeyring before any STS or pod mutation: a missing or -// malformed key secret must abort *before* we touch the StatefulSet -// (which would otherwise schedule a pod that fails its kubelet volume -// mount with no controller-side error context). Mirrors the same -// readiness gates the validator's init plan applies in buildBasePlan. +// buildRunningPlan returns the update plan for a Running validator. The +// key-validation gates run before any STS mutation so a missing or +// malformed secret aborts with a clear controller-side error rather than +// a kubelet volume-mount failure on the recreated pod. func (p *validatorPlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if imageDrifted(node) { prog := []string{ @@ -132,7 +129,7 @@ func (p *validatorPlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1al task.TaskTypeObserveImage, TaskMarkReady, } - return assembleDay2Plan(node, prog, externalAddressPatch(node)) + return assembleUpdatePlan(node, prog, externalAddressPatch(node)) } if sidecarNeedsReapproval(node) { return buildMarkReadyPlan(node) diff --git a/internal/task/config_patch.go b/internal/task/config_patch.go index 9587f6a9..b5696e67 100644 --- a/internal/task/config_patch.go +++ b/internal/task/config_patch.go @@ -4,15 +4,10 @@ import ( sidecar "github.com/sei-protocol/seictl/sidecar/client" ) -// ConfigPatchTask satisfies sidecar.TaskBuilder for config-patch. The -// controller stamps a subset of TOML keys it directly owns into named -// files; the sidecar handler is a generic merge-and-write per file with -// no sei-config involvement. Used for day-2 reconvergence where the -// controller wants surgical authority over operational fields without -// re-running the typed mode-defaulted resolver. -// -// JSON tag matches the wire format sidecar.ConfigPatchTask serializes -// (sidecar handler expects {"files": {...}}). +// ConfigPatchTask stamps controller-owned TOML keys into named seid +// config files via a generic merge-and-write on the sidecar — no +// sei-config involvement. JSON tag matches the wire format +// sidecar.ConfigPatchTask emits. type ConfigPatchTask struct { Files map[string]map[string]any `json:"files"` } From 819ad3454f4f561323292123f36c7259ab01b555 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Sun, 31 May 2026 12:52:26 -0700 Subject: [PATCH 5/7] test(planner,task): close three coverage gaps before final review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - TestReplayerPlanner_ImageDrift_UpdateProgression — symmetry coverage for the replayer mode (was implicitly covered by the table-driven per-mode intent test that's gone now). - TestPlannerConvention_InitUsesApply_UpdateUsesPatch — pins the doc rule "init writes whole file via config-apply, update patches via config-patch, never mix" so a future refactor that swaps tasks fails loud at unit-test time. - TestConfigPatchTask_JSONWireFormatUsesFilesKey — pins the `json:"files"` tag. Dropping the tag would emit "Files" capitalised; the sidecar deserializes by literal "files" key and patches would silently no-op. Lint stays green but every config-patch task in prod would do nothing. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/planner/node_update_test.go | 53 ++++++++++++++++++++++++++++ internal/task/config_patch_test.go | 34 ++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 internal/task/config_patch_test.go diff --git a/internal/planner/node_update_test.go b/internal/planner/node_update_test.go index 9d83bc1c..12a61046 100644 --- a/internal/planner/node_update_test.go +++ b/internal/planner/node_update_test.go @@ -165,6 +165,59 @@ func TestArchivePlanner_ImageDrift_UpdateProgression(t *testing.T) { })) } +// Replayer shares the full/archive update shape — symmetry test. +func TestReplayerPlanner_ImageDrift_UpdateProgression(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + node.Spec.FullNode = nil + node.Spec.Replayer = &seiv1alpha1.ReplayerSpec{} + node.Spec.Image = testImageV2 + + plan, err := (&replayerPlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + g.Expect(planTaskTypes(plan)).To(Equal([]string{ + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + TaskConfigPatch, + TaskConfigValidate, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + })) +} + +// Convention: init writes the whole config via TaskConfigApply; a Running +// node's update plan patches via TaskConfigPatch. Never both in one plan. +// Stated as a doc rule on NodePlanner; this test enforces it. +func TestPlannerConvention_InitUsesApply_UpdateUsesPatch(t *testing.T) { + g := NewWithT(t) + + // Init: empty phase, empty currentImage. + initNode := runningFullNode() + initNode.Status.Phase = "" + initNode.Status.CurrentImage = "" + + initPlan, err := (&fullNodePlanner{}).BuildPlan(initNode) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(initPlan).NotTo(BeNil()) + initTypes := planTaskTypes(initPlan) + g.Expect(initTypes).To(ContainElement(TaskConfigApply), "init must include config-apply") + g.Expect(initTypes).NotTo(ContainElement(TaskConfigPatch), "init must NOT include config-patch") + + // Running update: spec.image diverges from status.currentImage. + updateNode := runningFullNode() + updateNode.Spec.Image = testImageV2 + + updatePlan, err := (&fullNodePlanner{}).BuildPlan(updateNode) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(updatePlan).NotTo(BeNil()) + updateTypes := planTaskTypes(updatePlan) + g.Expect(updateTypes).To(ContainElement(TaskConfigPatch), "update must include config-patch") + g.Expect(updateTypes).NotTo(ContainElement(TaskConfigApply), "update must NOT include config-apply") +} + // Validator's update progression prepends the three key-validation gates // so a missing/malformed secret aborts before any STS mutation. func TestValidatorPlanner_ImageDrift_PrependsValidationGates(t *testing.T) { diff --git a/internal/task/config_patch_test.go b/internal/task/config_patch_test.go new file mode 100644 index 00000000..9bd832bc --- /dev/null +++ b/internal/task/config_patch_test.go @@ -0,0 +1,34 @@ +package task + +import ( + "encoding/json" + "testing" + + . "github.com/onsi/gomega" +) + +// The sidecar handler keys its deserialization on a literal "files" JSON +// field (sidecar/tasks/config.go ConfigPatchRequest). Dropping the json +// tag on ConfigPatchTask.Files would silently break every patch at +// runtime — Go's default would emit "Files" capitalized and the sidecar +// would see an empty params block. Pin the wire format here so a +// regression in the tag fails fast at unit-test time. +func TestConfigPatchTask_JSONWireFormatUsesFilesKey(t *testing.T) { + g := NewWithT(t) + in := ConfigPatchTask{Files: map[string]map[string]any{ + "config.toml": {"p2p": map[string]any{"external-address": "host:26656"}}, + }} + + raw, err := json.Marshal(in) + g.Expect(err).NotTo(HaveOccurred()) + + var asMap map[string]any + g.Expect(json.Unmarshal(raw, &asMap)).To(Succeed()) + g.Expect(asMap).To(HaveKey("files"), "sidecar deserializes by 'files' key — drop the json tag and patches silently no-op") + g.Expect(asMap).NotTo(HaveKey("Files"), "capitalised 'Files' would mean the json tag was dropped") + + // Roundtrip back into the controller-side type. + var out ConfigPatchTask + g.Expect(json.Unmarshal(raw, &out)).To(Succeed()) + g.Expect(out.Files).To(Equal(in.Files)) +} From f03714fda744f6ad5f0d3df1729a81a671a2446f Mon Sep 17 00:00:00 2001 From: bdchatham Date: Sun, 31 May 2026 13:03:44 -0700 Subject: [PATCH 6/7] chore(planner): hoist setNodeUpdateCondition out of assembleUpdatePlan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both reviewers flagged the same ergonomic trap on the as-landed code: assembleUpdatePlan unconditionally stamped a "UpdateStarted / image drift detected" condition. Every caller today gates on imageDrifted() so the message is accurate, but the helper had a generic name and a future Running-phase trigger that isn't image-drift (e.g. ExternalAddress drift detection) would silently produce a misleading condition message. Hoist: assembleUpdatePlan now only assembles the plan (pure boilerplate — planID, ordering, marshaling, phase fields). Each mode planner's buildRunningPlan stamps the condition with its own context before calling the helper. Helper imageDriftMessage(node) keeps callers terse. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/planner/archive.go | 2 ++ internal/planner/full.go | 2 ++ internal/planner/planner.go | 17 +++++++++++------ internal/planner/replay.go | 2 ++ internal/planner/validator.go | 2 ++ 5 files changed, 19 insertions(+), 6 deletions(-) diff --git a/internal/planner/archive.go b/internal/planner/archive.go index 3fc35738..04c30a6d 100644 --- a/internal/planner/archive.go +++ b/internal/planner/archive.go @@ -5,6 +5,7 @@ import ( "strconv" seiconfig "github.com/sei-protocol/sei-config" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" "github.com/sei-protocol/sei-k8s-controller/internal/task" @@ -40,6 +41,7 @@ func (p *archiveNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1. // Same shape as full nodes (no extra validation gates). func (p *archiveNodePlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if imageDrifted(node) { + setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node)) prog := []string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, diff --git a/internal/planner/full.go b/internal/planner/full.go index cbb1cd69..ebb64f5e 100644 --- a/internal/planner/full.go +++ b/internal/planner/full.go @@ -4,6 +4,7 @@ import ( "fmt" seiconfig "github.com/sei-protocol/sei-config" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" "github.com/sei-protocol/sei-k8s-controller/internal/task" @@ -49,6 +50,7 @@ func (p *fullNodePlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas // re-encode — the first config-patch erases operator-added comments. func (p *fullNodePlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if imageDrifted(node) { + setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node)) prog := []string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, diff --git a/internal/planner/planner.go b/internal/planner/planner.go index 756120c5..ae5b28c7 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -733,6 +733,13 @@ func imageDrifted(node *seiv1alpha1.SeiNode) bool { return node.Spec.Image != node.Status.CurrentImage } +// imageDriftMessage formats the standard NodeUpdateInProgress message +// for image-drift-triggered plans, used by every mode planner's +// buildRunningPlan before calling assembleUpdatePlan. +func imageDriftMessage(node *seiv1alpha1.SeiNode) string { + return fmt.Sprintf("image drift detected: spec=%s current=%s", node.Spec.Image, node.Status.CurrentImage) +} + // externalAddressPatch is the config.toml patch that stamps the // publishable-P2P external address. An empty Spec.ExternalAddress // stamps an empty value so opt-out reaches the pod symmetrically. @@ -747,16 +754,14 @@ func externalAddressPatch(node *seiv1alpha1.SeiNode) map[string]map[string]any { } // assembleUpdatePlan composes a task progression for a Running node into -// a TaskPlan. The progression slice is the per-mode authorship — each -// planner decides which tasks it needs (e.g. validator prepends -// key-validation gates). +// a TaskPlan. Pure boilerplate (planID, task ordering, param marshaling, +// phase fields) — the progression slice is the per-mode authorship. +// Callers own the NodeUpdateInProgress condition: stamp it before calling +// so the reason/message reflects the actual trigger. // // FailedPhase is deliberately empty: a failure retries on the next reconcile // rather than transitioning the node out of Running. func assembleUpdatePlan(node *seiv1alpha1.SeiNode, prog []string, patch map[string]map[string]any) (*seiv1alpha1.TaskPlan, error) { - setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", - fmt.Sprintf("image drift detected: spec=%s current=%s", node.Spec.Image, node.Status.CurrentImage)) - planID := uuid.New().String() tasks := make([]seiv1alpha1.PlannedTask, len(prog)) for i, taskType := range prog { diff --git a/internal/planner/replay.go b/internal/planner/replay.go index 91846fac..594fce0c 100644 --- a/internal/planner/replay.go +++ b/internal/planner/replay.go @@ -4,6 +4,7 @@ import ( "fmt" seiconfig "github.com/sei-protocol/sei-config" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" "github.com/sei-protocol/sei-k8s-controller/internal/task" @@ -52,6 +53,7 @@ func (p *replayerPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Tas // Same shape as full and archive. func (p *replayerPlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if imageDrifted(node) { + setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node)) prog := []string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, diff --git a/internal/planner/validator.go b/internal/planner/validator.go index cc9cf97f..602239fd 100644 --- a/internal/planner/validator.go +++ b/internal/planner/validator.go @@ -4,6 +4,7 @@ import ( "fmt" seiconfig "github.com/sei-protocol/sei-config" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" "github.com/sei-protocol/sei-k8s-controller/internal/task" @@ -117,6 +118,7 @@ func (p *validatorPlanner) BuildPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.Ta // a kubelet volume-mount failure on the recreated pod. func (p *validatorPlanner) buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if imageDrifted(node) { + setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", imageDriftMessage(node)) prog := []string{ task.TaskTypeValidateSigningKey, task.TaskTypeValidateNodeKey, From ea58977c6b5a132bea1f1263337571854daf5097 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Sun, 31 May 2026 20:39:23 -0700 Subject: [PATCH 7/7] chore(planner): brevity sweep on comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trim a few comments that survived the prior sweep with extra fat: - imageDriftMessage: 3 → 2 lines. - paramsForUpdateTask: 3 → 3 (rephrased, no length change). - assembleUpdatePlan: 7 → 4 lines (merged the "pure boilerplate" framing with the "callers own the condition" guidance + FailedPhase note). - ConfigPatchTask doc: 4 → 3 lines. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/planner/planner.go | 23 +++++++++-------------- internal/task/config_patch.go | 5 ++--- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/internal/planner/planner.go b/internal/planner/planner.go index ae5b28c7..4bea23ab 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -733,9 +733,8 @@ func imageDrifted(node *seiv1alpha1.SeiNode) bool { return node.Spec.Image != node.Status.CurrentImage } -// imageDriftMessage formats the standard NodeUpdateInProgress message -// for image-drift-triggered plans, used by every mode planner's -// buildRunningPlan before calling assembleUpdatePlan. +// imageDriftMessage formats the NodeUpdateInProgress message every mode +// planner stamps before an image-drift-triggered assembleUpdatePlan call. func imageDriftMessage(node *seiv1alpha1.SeiNode) string { return fmt.Sprintf("image drift detected: spec=%s current=%s", node.Spec.Image, node.Status.CurrentImage) } @@ -753,14 +752,10 @@ func externalAddressPatch(node *seiv1alpha1.SeiNode) map[string]map[string]any { } } -// assembleUpdatePlan composes a task progression for a Running node into -// a TaskPlan. Pure boilerplate (planID, task ordering, param marshaling, -// phase fields) — the progression slice is the per-mode authorship. -// Callers own the NodeUpdateInProgress condition: stamp it before calling -// so the reason/message reflects the actual trigger. -// -// FailedPhase is deliberately empty: a failure retries on the next reconcile -// rather than transitioning the node out of Running. +// assembleUpdatePlan composes a per-mode task progression into a TaskPlan. +// Callers own the NodeUpdateInProgress condition — stamp it before calling +// so the reason/message reflects the actual trigger. FailedPhase stays +// empty so a failure retries on next reconcile. func assembleUpdatePlan(node *seiv1alpha1.SeiNode, prog []string, patch map[string]map[string]any) (*seiv1alpha1.TaskPlan, error) { planID := uuid.New().String() tasks := make([]seiv1alpha1.PlannedTask, len(prog)) @@ -780,9 +775,9 @@ func assembleUpdatePlan(node *seiv1alpha1.SeiNode, prog []string, patch map[stri }, nil } -// paramsForUpdateTask returns the config-patch params for TaskConfigPatch -// and delegates everything else to paramsForTaskType with nil intent — -// update plans on a Running node never carry a ConfigIntent. +// paramsForUpdateTask returns ConfigPatchTask params for TaskConfigPatch +// and delegates everything else to paramsForTaskType. Update plans never +// carry a ConfigIntent — those are init-path only. func paramsForUpdateTask(node *seiv1alpha1.SeiNode, taskType string, patch map[string]map[string]any) any { if taskType == TaskConfigPatch { return task.ConfigPatchTask{Files: patch} diff --git a/internal/task/config_patch.go b/internal/task/config_patch.go index b5696e67..fb6bb1b9 100644 --- a/internal/task/config_patch.go +++ b/internal/task/config_patch.go @@ -5,9 +5,8 @@ import ( ) // ConfigPatchTask stamps controller-owned TOML keys into named seid -// config files via a generic merge-and-write on the sidecar — no -// sei-config involvement. JSON tag matches the wire format -// sidecar.ConfigPatchTask emits. +// config files via the sidecar's generic merge-and-write. The json tag +// matches the wire format the sidecar deserializes. type ConfigPatchTask struct { Files map[string]map[string]any `json:"files"` }