-
Notifications
You must be signed in to change notification settings - Fork 24
fix(controllers): replace a persistently crash-looping member #336
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -898,9 +898,66 @@ func restoreInitContainer(member *lll.EtcdMember, peerAddr, operatorImage string | |
| }, vols | ||
| } | ||
|
|
||
| // dataLossRestartThreshold is how many times the etcd container must have | ||
| // restarted before we treat a non-bootstrap PVC member as unrecoverable and | ||
| // replace it. High enough to ride out transient join churn and slow restores; | ||
| // CrashLoopBackOff caps its backoff at 5m, so this many restarts means a | ||
| // member that has been unable to start for several minutes. | ||
| const dataLossRestartThreshold = 5 | ||
|
|
||
| // etcdContainerStuck reports whether the pod's etcd container is persistently | ||
| // failing to start: not ready and restarted at least dataLossRestartThreshold | ||
| // times, excluding OOMKilled (a resource problem that re-creating the member | ||
| // would not fix). This is the signature of an unrecoverable member — most | ||
| // commonly a data dir lost while the cluster membership moved on, so the | ||
| // member's frozen --initial-cluster no longer matches the live cluster and | ||
| // etcd refuses to boot ("error validating peerURLs ... member count is | ||
| // unequal"). etcd ignores --initial-cluster for an initialised data dir, so a | ||
| // healthy member that merely restarts is unaffected. | ||
| func etcdContainerStuck(pod *corev1.Pod) bool { | ||
| for _, cs := range pod.Status.ContainerStatuses { | ||
| if cs.Name != "etcd" { | ||
| continue | ||
| } | ||
| if cs.Ready || cs.RestartCount < dataLossRestartThreshold { | ||
| return false | ||
| } | ||
| if t := cs.LastTerminationState.Terminated; t != nil && t.Reason == "OOMKilled" { | ||
| return false | ||
| } | ||
| return true | ||
| } | ||
| return false | ||
| } | ||
|
|
||
| // clusterHasQuorumWithout reports whether the member's cluster still has quorum | ||
| // among its OTHER members — i.e. losing this one is a minority failure, so | ||
| // replacing it cannot break the cluster. ReadyMembers already excludes this | ||
| // (not-ready) member, so it is the live healthy count. Used to gate self-heal: | ||
| // we never delete a member during a cluster-wide outage (where many members | ||
| // crash at once), only an isolated stuck member backed by a healthy majority. | ||
| func (r *EtcdMemberReconciler) clusterHasQuorumWithout(ctx context.Context, member *lll.EtcdMember) bool { | ||
| cluster, err := r.clusterFor(ctx, member) | ||
| if err != nil { | ||
| return false | ||
| } | ||
| desired := 0 | ||
| if cluster.Status.Observed != nil { | ||
| desired = int(cluster.Status.Observed.Replicas) | ||
| } | ||
| if desired == 0 && cluster.Spec.Replicas != nil { | ||
| desired = int(*cluster.Spec.Replicas) | ||
| } | ||
| if desired == 0 { | ||
| return false | ||
| } | ||
| return int(cluster.Status.ReadyMembers) >= desired/2+1 | ||
| } | ||
|
Comment on lines
+939
to
+955
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Issue: Potential race condition with stale
|
||
|
|
||
| // ── Status ─────────────────────────────────────────────────────────────── | ||
|
|
||
| func (r *EtcdMemberReconciler) updateStatus(ctx context.Context, member *lll.EtcdMember) (ctrl.Result, error) { | ||
| log := log.FromContext(ctx) | ||
| pod := &corev1.Pod{} | ||
| if err := r.Get(ctx, types.NamespacedName{Namespace: member.Namespace, Name: member.Name}, pod); err != nil { | ||
| if errors.IsNotFound(err) { | ||
|
|
@@ -950,6 +1007,26 @@ func (r *EtcdMemberReconciler) updateStatus(ctx context.Context, member *lll.Etc | |
|
|
||
| switch { | ||
| case !podReady: | ||
| // Self-heal an unrecoverable member. A non-bootstrap PVC member whose | ||
| // etcd cannot start — classically because its data dir was lost while | ||
| // the cluster membership moved on, leaving its frozen --initial-cluster | ||
| // stale (etcd: "member count is unequal") — crash-loops forever on its | ||
| // own. Replace it: delete the CR so the finalizer does a clean | ||
| // MemberRemove and the cluster controller gap-fills a fresh member with | ||
| // a current --initial-cluster. Gate on the rest of the cluster having | ||
| // quorum so a cluster-wide outage never cascades into mass deletion | ||
| // (the finalizer's MemberRemove is quorum-gated too — belt and braces). | ||
| if !member.Spec.Bootstrap && | ||
| member.Spec.Storage.Medium != lll.StorageMediumMemory && | ||
| etcdContainerStuck(pod) && | ||
| r.clusterHasQuorumWithout(ctx, member) { | ||
| log.Info("etcd member is persistently crash-looping while the rest of the cluster is healthy; deleting it for replacement", | ||
| "restartThreshold", dataLossRestartThreshold) | ||
| if err := r.Delete(ctx, member); err != nil { | ||
| return ctrl.Result{}, err | ||
| } | ||
| return ctrl.Result{}, nil | ||
| } | ||
| if setMemberCondition(member, lll.MemberReady, metav1.ConditionFalse, "PodNotReady", | ||
| fmt.Sprintf("pod phase: %s", pod.Status.Phase)) { | ||
| changed = true | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -9,6 +9,7 @@ import ( | |||||||||||||||||||||||
| "net" | ||||||||||||||||||||||||
| "net/http" | ||||||||||||||||||||||||
| "net/url" | ||||||||||||||||||||||||
| "sort" | ||||||||||||||||||||||||
| "strings" | ||||||||||||||||||||||||
| "testing" | ||||||||||||||||||||||||
| "time" | ||||||||||||||||||||||||
|
|
@@ -134,6 +135,94 @@ func TestKamajiDataStore(t *testing.T) { | |||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| t.Logf("found %q among etcd keys", proofName) | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| // ── Member churn: replace EVERY original member one at a time and prove | ||||||||||||||||||||||||
| // Kamaji keeps working through the new, GenerateName-hashed members. | ||||||||||||||||||||||||
| // Native members are named via GenerateName ("<cluster>-<hash>"), so the | ||||||||||||||||||||||||
| // DataStore can only address the cluster through the stable | ||||||||||||||||||||||||
| // <cluster>-client Service (its sole endpoint) and the operator's server | ||||||||||||||||||||||||
| // cert SAN is a wildcard (*.<cluster>.<ns>.svc). Member names therefore | ||||||||||||||||||||||||
| // never reach Kamaji — this churns the entire member set out from under a | ||||||||||||||||||||||||
| // live tenant control plane and guards that contract end to end. | ||||||||||||||||||||||||
| // | ||||||||||||||||||||||||
| // Gate on a fully-formed cluster first: Available latches on quorum, not on | ||||||||||||||||||||||||
| // the full replica count, so without this wait we might delete a member | ||||||||||||||||||||||||
| // while the third is still a freshly-promoted/learner member — collapsing | ||||||||||||||||||||||||
| // the cluster into the fragile 2-node window mid-bootstrap. | ||||||||||||||||||||||||
| waitFor(ctx, t, 5*time.Minute, "all 3 members Ready before churn", func(ctx context.Context) error { | ||||||||||||||||||||||||
| ec := &etcdv1alpha2.EtcdCluster{} | ||||||||||||||||||||||||
| if err := kube.Get(ctx, client.ObjectKey{Namespace: e2eNamespace, Name: clusterName}, ec); err != nil { | ||||||||||||||||||||||||
| return err | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if ec.Status.ReadyMembers != 3 { | ||||||||||||||||||||||||
| return fmt.Errorf("readyMembers=%d, want 3", ec.Status.ReadyMembers) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| return nil | ||||||||||||||||||||||||
| }) | ||||||||||||||||||||||||
| original := memberNames(ctx, t) | ||||||||||||||||||||||||
| if len(original) != 3 { | ||||||||||||||||||||||||
| t.Fatalf("expected 3 members before churn, got %d: %v", len(original), original) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| t.Logf("original members: %v", original) | ||||||||||||||||||||||||
| for _, victim := range original { | ||||||||||||||||||||||||
| t.Logf("deleting EtcdMember %q (operator does MemberRemove + a GenerateName replacement)", victim) | ||||||||||||||||||||||||
| m := &etcdv1alpha2.EtcdMember{ObjectMeta: metav1.ObjectMeta{Namespace: e2eNamespace, Name: victim}} | ||||||||||||||||||||||||
| if err := kube.Delete(ctx, m); err != nil && !apierrors.IsNotFound(err) { | ||||||||||||||||||||||||
| t.Fatalf("delete member %s: %v", victim, err) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| waitFor(ctx, t, 5*time.Minute, fmt.Sprintf("%q removed and the cluster back to 3 ready members", victim), | ||||||||||||||||||||||||
| func(ctx context.Context) error { | ||||||||||||||||||||||||
| err := kube.Get(ctx, client.ObjectKey{Namespace: e2eNamespace, Name: victim}, &etcdv1alpha2.EtcdMember{}) | ||||||||||||||||||||||||
| if err == nil { | ||||||||||||||||||||||||
| return fmt.Errorf("victim %s still present (MemberRemove in flight)", victim) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if !apierrors.IsNotFound(err) { | ||||||||||||||||||||||||
| return err | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if names := memberNames(ctx, t); len(names) != 3 { | ||||||||||||||||||||||||
| return fmt.Errorf("have %d members, want 3: %v", len(names), names) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
Comment on lines
+181
to
+183
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The Proposed inline fix- if names := memberNames(ctx, t); len(names) != 3 {
- return fmt.Errorf("have %d members, want 3: %v", len(names), names)
- }
+ list := &etcdv1alpha2.EtcdMemberList{}
+ if err := kube.List(ctx, list, client.InNamespace(e2eNamespace),
+ client.MatchingLabels{"etcd-operator.cozystack.io/cluster": clusterName}); err != nil {
+ return err
+ }
+ if len(list.Items) != 3 {
+ return fmt.Errorf("have %d members, want 3", len(list.Items))
+ }📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||||
| ec := &etcdv1alpha2.EtcdCluster{} | ||||||||||||||||||||||||
| if err := kube.Get(ctx, client.ObjectKey{Namespace: e2eNamespace, Name: clusterName}, ec); err != nil { | ||||||||||||||||||||||||
| return err | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if ec.Status.ReadyMembers != 3 { | ||||||||||||||||||||||||
| return fmt.Errorf("readyMembers=%d, want 3", ec.Status.ReadyMembers) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| return nil | ||||||||||||||||||||||||
| }) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| // The cluster is now a wholly fresh member set — no original name remains. | ||||||||||||||||||||||||
| final := memberNames(ctx, t) | ||||||||||||||||||||||||
| for _, o := range original { | ||||||||||||||||||||||||
| for _, f := range final { | ||||||||||||||||||||||||
| if o == f { | ||||||||||||||||||||||||
| t.Fatalf("original member %q still present after full churn: %v", o, final) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| t.Logf("fully churned member set: %v -> %v", original, final) | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| // Kamaji must still work through the new members: the original proof key | ||||||||||||||||||||||||
| // survived all three replacements, and a fresh write still lands in etcd — | ||||||||||||||||||||||||
| // all without any change to the DataStore (it still points at the same | ||||||||||||||||||||||||
| // <cluster>-client Service). | ||||||||||||||||||||||||
| if keys := etcdKeys(ctx, t); !strings.Contains(keys, proofName) { | ||||||||||||||||||||||||
| t.Fatalf("proof key %q lost after member churn", proofName) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| const churnProof = "e2e-proof-postchurn" | ||||||||||||||||||||||||
| cm2 := &corev1.ConfigMap{ | ||||||||||||||||||||||||
| ObjectMeta: metav1.ObjectMeta{Name: churnProof, Namespace: "default"}, | ||||||||||||||||||||||||
| Data: map[string]string{"written-by": "etcd-operator-e2e-postchurn"}, | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if _, err := tenantSet.CoreV1().ConfigMaps("default").Create(ctx, cm2, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { | ||||||||||||||||||||||||
| t.Fatalf("create post-churn ConfigMap via tenant API: %v", err) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| if keys := etcdKeys(ctx, t); !strings.Contains(keys, churnProof) { | ||||||||||||||||||||||||
| t.Fatalf("post-churn write %q did not reach etcd through the fresh members", churnProof) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| t.Log("Kamaji tenant API still roundtrips through the fully churned (hash-named) member set") | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| // Teardown — reverse order, waiting where deletion is asynchronous. | ||||||||||||||||||||||||
| deleteAndWait(ctx, t, "kamaji.clastix.io/v1alpha1", "TenantControlPlane", e2eNamespace, tcpName, 5*time.Minute) | ||||||||||||||||||||||||
| deleteAndWait(ctx, t, "kamaji.clastix.io/v1alpha1", "DataStore", "", "kamaji-e2e", 2*time.Minute) | ||||||||||||||||||||||||
|
|
@@ -307,6 +396,23 @@ func etcdKeys(ctx context.Context, t *testing.T) string { | |||||||||||||||||||||||
| return stdout | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| // memberNames returns the sorted names of the cluster's EtcdMembers, selected | ||||||||||||||||||||||||
| // by the cluster label (member pods/CRs carry GenerateName-hashed names). | ||||||||||||||||||||||||
| func memberNames(ctx context.Context, t *testing.T) []string { | ||||||||||||||||||||||||
| t.Helper() | ||||||||||||||||||||||||
| list := &etcdv1alpha2.EtcdMemberList{} | ||||||||||||||||||||||||
| if err := kube.List(ctx, list, client.InNamespace(e2eNamespace), | ||||||||||||||||||||||||
| client.MatchingLabels{"etcd-operator.cozystack.io/cluster": clusterName}); err != nil { | ||||||||||||||||||||||||
| t.Fatalf("list etcd members: %v", err) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| names := make([]string, 0, len(list.Items)) | ||||||||||||||||||||||||
| for i := range list.Items { | ||||||||||||||||||||||||
| names = append(names, list.Items[i].Name) | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
| sort.Strings(names) | ||||||||||||||||||||||||
| return names | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| func podExec(ctx context.Context, namespace, pod, container string, command []string) (string, string, error) { | ||||||||||||||||||||||||
| req := clientset.CoreV1().RESTClient().Post(). | ||||||||||||||||||||||||
| Resource("pods").Namespace(namespace).Name(pod).SubResource("exec"). | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Issue 1: Missing check for Pod deletion state
If a Pod is currently being deleted (e.g., due to a manual restart, node drain, or eviction), its
DeletionTimestampis set, and its containers will eventually terminate. If we evaluate it as "stuck" during this graceful termination window, we might prematurely delete theEtcdMemberCR and trigger an expensive member replacement instead of letting the Pod naturally restart or reschedule.Issue 2: Missing check for current OOMKilled state
The current implementation only checks
cs.LastTerminationState.Terminatedfor the"OOMKilled"reason. However, if the container has just been OOMKilled and is currently in theTerminatedstate (but has not yet transitioned toWaiting/CrashLoopBackOff), the"OOMKilled"reason will be incs.State.Terminatedinstead. Checking both states ensures we don't falsely trigger self-healing for resource-limit issues.