From 19ca281e058755d4644779a1321759facf4b1491 Mon Sep 17 00:00:00 2001 From: Aditya Kumar Date: Wed, 13 May 2026 15:00:05 +0530 Subject: [PATCH 1/2] SCF-830: Added state management for cluster hibernation --- manifests/postgresql.crd.yaml | 10 +++ pkg/apis/acid.zalan.do/v1/const.go | 2 + pkg/apis/acid.zalan.do/v1/postgresql.crd.yaml | 10 +++ pkg/apis/acid.zalan.do/v1/postgresql_type.go | 9 ++- pkg/apis/acid.zalan.do/v1/util.go | 10 +++ .../acid.zalan.do/v1/zz_generated.deepcopy.go | 21 +++++ pkg/cluster/cluster.go | 81 +++++++++++++++++++ pkg/cluster/lifecycle.go | 78 ++++++++++++++++++ pkg/cluster/sync.go | 7 +- pkg/util/k8sutil/k8sutil.go | 10 +++ 10 files changed, 236 insertions(+), 2 deletions(-) create mode 100644 pkg/cluster/lifecycle.go diff --git a/manifests/postgresql.crd.yaml b/manifests/postgresql.crd.yaml index 39811824e..1bba0c788 100644 --- a/manifests/postgresql.crd.yaml +++ b/manifests/postgresql.crd.yaml @@ -3246,6 +3246,13 @@ spec: - name type: object type: array + lifecycle: + description: LifecycleSpec describes the lifecycle state of a Postgres + cluster. 
+ properties: + phase: + type: string + type: object logicalBackupRetention: type: string logicalBackupSchedule: @@ -4197,6 +4204,9 @@ spec: properties: PostgresClusterStatus: type: string + previousNumberOfInstances: + format: int32 + type: integer required: - PostgresClusterStatus type: object diff --git a/pkg/apis/acid.zalan.do/v1/const.go b/pkg/apis/acid.zalan.do/v1/const.go index 4102ea3d3..69012427a 100644 --- a/pkg/apis/acid.zalan.do/v1/const.go +++ b/pkg/apis/acid.zalan.do/v1/const.go @@ -9,6 +9,8 @@ const ( ClusterStatusSyncFailed = "SyncFailed" ClusterStatusAddFailed = "CreateFailed" ClusterStatusRunning = "Running" + ClusterStatusStopping = "Stopping" + ClusterStatusStopped = "Stopped" ClusterStatusInvalid = "Invalid" ) diff --git a/pkg/apis/acid.zalan.do/v1/postgresql.crd.yaml b/pkg/apis/acid.zalan.do/v1/postgresql.crd.yaml index 39811824e..1bba0c788 100644 --- a/pkg/apis/acid.zalan.do/v1/postgresql.crd.yaml +++ b/pkg/apis/acid.zalan.do/v1/postgresql.crd.yaml @@ -3246,6 +3246,13 @@ spec: - name type: object type: array + lifecycle: + description: LifecycleSpec describes the lifecycle state of a Postgres + cluster. 
+ properties: + phase: + type: string + type: object logicalBackupRetention: type: string logicalBackupSchedule: @@ -4197,6 +4204,9 @@ spec: properties: PostgresClusterStatus: type: string + previousNumberOfInstances: + format: int32 + type: integer required: - PostgresClusterStatus type: object diff --git a/pkg/apis/acid.zalan.do/v1/postgresql_type.go b/pkg/apis/acid.zalan.do/v1/postgresql_type.go index 1dadfd06c..4b17cf67f 100644 --- a/pkg/apis/acid.zalan.do/v1/postgresql_type.go +++ b/pkg/apis/acid.zalan.do/v1/postgresql_type.go @@ -115,6 +115,7 @@ type PostgresSpec struct { TLS *TLSDescription `json:"tls,omitempty"` AdditionalVolumes []AdditionalVolume `json:"additionalVolumes,omitempty"` Streams []Stream `json:"streams,omitempty"` + Lifecycle *LifecycleSpec `json:"lifecycle,omitempty"` Env []v1.EnvVar `json:"env,omitempty"` // deprecated @@ -257,6 +258,11 @@ type StandbyDescription struct { StandbyPrimarySlotName string `json:"standby_primary_slot_name,omitempty"` } +// LifecycleSpec describes the lifecycle state of a Postgres cluster. +type LifecycleSpec struct { + Phase string `json:"phase,omitempty"` +} + // TLSDescription specs TLS properties type TLSDescription struct { // +required @@ -302,7 +308,8 @@ type UserFlags []string // PostgresStatus contains status of the PostgreSQL cluster (running, creation failed etc.) 
type PostgresStatus struct { - PostgresClusterStatus string `json:"PostgresClusterStatus"` + PostgresClusterStatus string `json:"PostgresClusterStatus"` + PreviousNumberOfInstances int32 `json:"previousNumberOfInstances,omitempty"` } // ConnectionPooler Options for connection pooler diff --git a/pkg/apis/acid.zalan.do/v1/util.go b/pkg/apis/acid.zalan.do/v1/util.go index 719defe93..7bbdc0bbf 100644 --- a/pkg/apis/acid.zalan.do/v1/util.go +++ b/pkg/apis/acid.zalan.do/v1/util.go @@ -101,6 +101,16 @@ func (postgresStatus PostgresStatus) Creating() bool { return postgresStatus.PostgresClusterStatus == ClusterStatusCreating } +// Stopping status of cluster +func (postgresStatus PostgresStatus) Stopping() bool { + return postgresStatus.PostgresClusterStatus == ClusterStatusStopping +} + +// Stopped status of cluster +func (postgresStatus PostgresStatus) Stopped() bool { + return postgresStatus.PostgresClusterStatus == ClusterStatusStopped +} + func (postgresStatus PostgresStatus) String() string { return postgresStatus.PostgresClusterStatus } diff --git a/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go b/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go index 159a87f35..692a6fe30 100644 --- a/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go +++ b/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go @@ -310,6 +310,22 @@ func (in *KubernetesMetaConfiguration) DeepCopy() *KubernetesMetaConfiguration { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *LifecycleSpec) DeepCopyInto(out *LifecycleSpec) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LifecycleSpec. +func (in *LifecycleSpec) DeepCopy() *LifecycleSpec { + if in == nil { + return nil + } + out := new(LifecycleSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. 
in must be non-nil. func (in *LoadBalancerConfiguration) DeepCopyInto(out *LoadBalancerConfiguration) { *out = *in @@ -874,6 +890,11 @@ func (in *PostgresSpec) DeepCopyInto(out *PostgresSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.Lifecycle != nil { + in, out := &in.Lifecycle, &out.Lifecycle + *out = new(LifecycleSpec) + **out = **in + } if in.Env != nil { in, out := &in.Env, &out.Env *out = make([]corev1.EnvVar, len(*in)) diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 04c974f4c..bf1e24a09 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -1008,6 +1008,22 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error { c.mu.Lock() defer c.mu.Unlock() + // Block all spec changes when cluster is stopped or stopping + if c.Status.Stopped() || c.Status.Stopping() { + lifecyclePhase := "" + if newSpec.Spec.Lifecycle != nil { + lifecyclePhase = newSpec.Spec.Lifecycle.Phase + } + // During Stopping: block ALL spec changes (no cancellation allowed) + // During Stopped: only block if keeping lifecycle.phase="stopped" + if c.Status.Stopping() { + return fmt.Errorf("cannot update cluster while it is stopping. Wait for it to fully stop first") + } + if lifecyclePhase == "stopped" { + return fmt.Errorf("cannot update cluster while stopped. 
Remove lifecycle.phase to wake up the cluster") + } + } + newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusUpdating newSpec, err := c.KubeClient.SetPostgresCRDStatus(c.clusterName(), newSpec) @@ -1015,6 +1031,71 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error { return fmt.Errorf("could not set cluster status to updating: %w", err) } + // Check if user is initiating hibernate (Running -> Stopping) + if c.Status.Running() && newSpec.Spec.Lifecycle != nil && newSpec.Spec.Lifecycle.Phase == "stopped" { + c.logger.Infof("[lifecycle] initiating hibernate for cluster %s: current numberOfInstances=%d", c.Name, c.Spec.NumberOfInstances) + + // Store previousNumberOfInstances BEFORE setting numberOfInstances to 0 + newSpec.Status.PreviousNumberOfInstances = c.Spec.NumberOfInstances + newSpec.Spec.NumberOfInstances = 0 + newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusStopping + + c.logger.Infof("[lifecycle] hibernate initiated: setting numberOfInstances=0, previousNumberOfInstances=%d", newSpec.Status.PreviousNumberOfInstances) + + // Update spec first (Update only updates spec when CR has status subresource) + pgUpdated, err := c.KubeClient.UpdatePostgresCR(c.clusterName(), newSpec) + if err != nil { + return fmt.Errorf("could not update spec during hibernate: %w", err) + } + c.logger.Infof("[lifecycle] hibernate: spec updated successfully") + + // Update status separately - we need to preserve the status values we set + // because UpdatePostgresCR returns object with status zeroed (subresource behavior) + pgUpdated.Status.PreviousNumberOfInstances = newSpec.Status.PreviousNumberOfInstances + pgUpdated.Status.PostgresClusterStatus = newSpec.Status.PostgresClusterStatus + + pgUpdated, err = c.KubeClient.SetPostgresCRDStatus(c.clusterName(), pgUpdated) + if err != nil { + return fmt.Errorf("could not update status during hibernate: %w", err) + } + c.logger.Infof("[lifecycle] hibernate: status updated successfully, 
previousNumberOfInstances=%d", pgUpdated.Status.PreviousNumberOfInstances) + + c.setSpec(pgUpdated) + return nil + } + + // Check if user is waking up from stopped state (Stopped -> Running) + // This is when user clears lifecycle.phase to wake up the cluster + if c.Status.Stopped() && (newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped") { + if newSpec.Status.PreviousNumberOfInstances > 0 { + c.logger.Infof("[lifecycle] waking up cluster %s: restoring numberOfInstances=%d", c.Name, newSpec.Status.PreviousNumberOfInstances) + + // Restore numberOfInstances from previousNumberOfInstances + newSpec.Spec.NumberOfInstances = newSpec.Status.PreviousNumberOfInstances + newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusUpdating + + // Update spec first + pgUpdated, err := c.KubeClient.UpdatePostgresCR(c.clusterName(), newSpec) + if err != nil { + return fmt.Errorf("could not update spec during wake-up: %w", err) + } + c.logger.Infof("[lifecycle] wake-up: spec updated successfully") + + // Update status separately, and clear previousNumberOfInstances after restore + pgUpdated.Status.PreviousNumberOfInstances = 0 // Clear after successful restore + pgUpdated.Status.PostgresClusterStatus = newSpec.Status.PostgresClusterStatus + + pgUpdated, err = c.KubeClient.SetPostgresCRDStatus(c.clusterName(), pgUpdated) + if err != nil { + return fmt.Errorf("could not update status during wake-up: %w", err) + } + c.logger.Infof("[lifecycle] wake-up: status updated successfully, previousNumberOfInstances cleared") + + c.setSpec(pgUpdated) + return nil + } + } + if !c.isInMaintenanceWindow(newSpec.Spec.MaintenanceWindows) { // do not apply any major version related changes yet newSpec.Spec.PostgresqlParam.PgVersion = oldSpec.Spec.PostgresqlParam.PgVersion diff --git a/pkg/cluster/lifecycle.go b/pkg/cluster/lifecycle.go new file mode 100644 index 000000000..83d58a2b9 --- /dev/null +++ b/pkg/cluster/lifecycle.go @@ -0,0 +1,78 @@ +package cluster + +import ( + 
acidv1 "github.com/zalando/postgres-operator/pkg/apis/acid.zalan.do/v1" +) + +// manageHibernateState manages cluster hibernate/wake-up state transitions. +// Returns true if sync should continue, false if it should return early. +// +// This function handles the following state transitions: +// - Running -> Stopping: When user sets lifecycle.phase = "stopped" +// - Stopping -> Stopped: When StatefulSet replicas reach 0 +// - Stopped -> Updating: When user clears lifecycle.phase (wake-up) +// - Updating -> Running: Normal sync continues, defer sets final status +func (c *Cluster) manageHibernateState(oldSpec acidv1.Postgresql, newSpec *acidv1.Postgresql) bool { + + // FIX B: Detect wake-up by comparing oldSpec status vs newSpec status + // When Update() is called, it sets status=Updating before Sync() runs. + // So we need to check if oldSpec.Status was Stopped and newSpec is Updating + // with lifecycle cleared to properly detect wake-up. + isWakingUp := oldSpec.Status.Stopped() && + newSpec.Status.PostgresClusterStatus == acidv1.ClusterStatusUpdating && + (newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped") + + // FIX C: Additional wake-up detection with simpler condition + // If lifecycle was cleared and we have previousNumberOfInstances and numberOfInstances is 0 + isWakingUpSimple := newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped" + hasPreviousInstances := newSpec.Status.PreviousNumberOfInstances > 0 + needsRestore := newSpec.Spec.NumberOfInstances == 0 + + isWakingUp = isWakingUp || (isWakingUpSimple && hasPreviousInstances && needsRestore) + + // === INITIATE HIBERNATE: Running -> Stopping === + // Only initiate if not already stopping or stopped, and lifecycle.phase = "stopped" + if newSpec.Spec.Lifecycle != nil && + newSpec.Spec.Lifecycle.Phase == "stopped" && + !newSpec.Status.Stopping() && + !newSpec.Status.Stopped() { + + newSpec.Status.PreviousNumberOfInstances = newSpec.Spec.NumberOfInstances + 
newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusStopping + newSpec.Spec.NumberOfInstances = 0 + c.logger.Infof("[lifecycle] cluster is going to hibernate, stored previous number of instances: %d", + newSpec.Status.PreviousNumberOfInstances) + return true + } + + // === STOPPING -> STOPPED: Check actual StatefulSet replicas === + // Only transition to Stopped when StatefulSet replicas have actually reached 0 + if newSpec.Status.Stopping() { + if c.Statefulset != nil && *c.Statefulset.Spec.Replicas == 0 { + newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusStopped + c.logger.Infof("[lifecycle] cluster has stopped, all pods are terminated") + } + return true + } + + // === WAKE-UP: Stopped/Updating -> Running === + // Restore numberOfInstances from previousNumberOfInstances when waking up + if newSpec.Status.Stopped() || isWakingUp { + // Check if lifecycle.phase was cleared (user wants to wake up) + if isWakingUp || newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped" { + if newSpec.Status.PreviousNumberOfInstances > 0 { + newSpec.Spec.NumberOfInstances = newSpec.Status.PreviousNumberOfInstances + c.logger.Infof("[lifecycle] cluster is waking up, restoring number of instances: %d", + newSpec.Status.PreviousNumberOfInstances) + } else { + c.logger.Warningf("[lifecycle] cluster is waking up but previousNumberOfInstances is 0, cannot restore") + } + newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusUpdating + return true + } + // Still stopped and lifecycle.phase = "stopped", skip further sync + return false + } + + return true +} \ No newline at end of file diff --git a/pkg/cluster/sync.go b/pkg/cluster/sync.go index ffebd306c..13982900c 100644 --- a/pkg/cluster/sync.go +++ b/pkg/cluster/sync.go @@ -47,7 +47,7 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error { if err != nil { c.logger.Warningf("error while syncing cluster state: %v", err) newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusSyncFailed 
- } else if !c.Status.Running() { + } else if !c.Status.Running() && !c.Status.Stopping() && !c.Status.Stopped() { newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusRunning } @@ -65,6 +65,11 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error { c.logger.Debugf("could not sync finalizers: %v", err) } + // Handle lifecycle hibernate/wake-up state transitions + if !c.manageHibernateState(oldSpec, newSpec) { + return nil + } + if err = c.initUsers(); err != nil { err = fmt.Errorf("could not init users: %v", err) return err diff --git a/pkg/util/k8sutil/k8sutil.go b/pkg/util/k8sutil/k8sutil.go index c34faddd4..0e31112ad 100644 --- a/pkg/util/k8sutil/k8sutil.go +++ b/pkg/util/k8sutil/k8sutil.go @@ -200,6 +200,16 @@ func (client *KubernetesClient) SetPostgresCRDStatus(clusterName spec.Namespaced return pg, nil } +// UpdatePostgresCR of Postgres cluster (updates full resource including spec) +func (client *KubernetesClient) UpdatePostgresCR(clusterName spec.NamespacedName, pg *apiacidv1.Postgresql) (*apiacidv1.Postgresql, error) { + pg, err := client.PostgresqlsGetter.Postgresqls(clusterName.Namespace).Update(context.TODO(), pg, metav1.UpdateOptions{}) + if err != nil { + return pg, fmt.Errorf("could not update PostgresCR: %v", err) + } + + return pg, nil +} + // SetFinalizer of Postgres cluster func (client *KubernetesClient) SetFinalizer(clusterName spec.NamespacedName, pg *apiacidv1.Postgresql, finalizers []string) (*apiacidv1.Postgresql, error) { var ( From 8d75f56abcbf990e04b625efc6cfcf7fb74827f8 Mon Sep 17 00:00:00 2001 From: Aditya Kumar Date: Wed, 13 May 2026 15:20:35 +0530 Subject: [PATCH 2/2] SCF-830: Streamlined cluster update function --- pkg/cluster/cluster.go | 189 +++++++++++++++++++++++---------------- pkg/cluster/lifecycle.go | 7 +- 2 files changed, 116 insertions(+), 80 deletions(-) diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index bf1e24a09..e28fa46d0 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ 
-1009,91 +1009,28 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error { defer c.mu.Unlock() // Block all spec changes when cluster is stopped or stopping - if c.Status.Stopped() || c.Status.Stopping() { - lifecyclePhase := "" - if newSpec.Spec.Lifecycle != nil { - lifecyclePhase = newSpec.Spec.Lifecycle.Phase - } - // During Stopping: block ALL spec changes (no cancellation allowed) - // During Stopped: only block if keeping lifecycle.phase="stopped" - if c.Status.Stopping() { - return fmt.Errorf("cannot update cluster while it is stopping. Wait for it to fully stop first") - } - if lifecyclePhase == "stopped" { - return fmt.Errorf("cannot update cluster while stopped. Remove lifecycle.phase to wake up the cluster") - } + blocked, err := c.blockLifecycleUpdate(newSpec) + if err != nil { + return err + } + if blocked { + return nil } newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusUpdating - newSpec, err := c.KubeClient.SetPostgresCRDStatus(c.clusterName(), newSpec) + newSpec, err = c.KubeClient.SetPostgresCRDStatus(c.clusterName(), newSpec) if err != nil { return fmt.Errorf("could not set cluster status to updating: %w", err) } - // Check if user is initiating hibernate (Running -> Stopping) - if c.Status.Running() && newSpec.Spec.Lifecycle != nil && newSpec.Spec.Lifecycle.Phase == "stopped" { - c.logger.Infof("[lifecycle] initiating hibernate for cluster %s: current numberOfInstances=%d", c.Name, c.Spec.NumberOfInstances) - - // Store previousNumberOfInstances BEFORE setting numberOfInstances to 0 - newSpec.Status.PreviousNumberOfInstances = c.Spec.NumberOfInstances - newSpec.Spec.NumberOfInstances = 0 - newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusStopping - - c.logger.Infof("[lifecycle] hibernate initiated: setting numberOfInstances=0, previousNumberOfInstances=%d", newSpec.Status.PreviousNumberOfInstances) - - // Update spec first (Update only updates spec when CR has status subresource) - pgUpdated, err := 
c.KubeClient.UpdatePostgresCR(c.clusterName(), newSpec) - if err != nil { - return fmt.Errorf("could not update spec during hibernate: %w", err) - } - c.logger.Infof("[lifecycle] hibernate: spec updated successfully") - - // Update status separately - we need to preserve the status values we set - // because UpdatePostgresCR returns object with status zeroed (subresource behavior) - pgUpdated.Status.PreviousNumberOfInstances = newSpec.Status.PreviousNumberOfInstances - pgUpdated.Status.PostgresClusterStatus = newSpec.Status.PostgresClusterStatus - - pgUpdated, err = c.KubeClient.SetPostgresCRDStatus(c.clusterName(), pgUpdated) - if err != nil { - return fmt.Errorf("could not update status during hibernate: %w", err) - } - c.logger.Infof("[lifecycle] hibernate: status updated successfully, previousNumberOfInstances=%d", pgUpdated.Status.PreviousNumberOfInstances) - - c.setSpec(pgUpdated) - return nil + // Handle lifecycle transitions (hibernate/wake-up) + handled, err := c.handleHibernateAndWakeUp(newSpec) + if err != nil { + return err } - - // Check if user is waking up from stopped state (Stopped -> Running) - // This is when user clears lifecycle.phase to wake up the cluster - if c.Status.Stopped() && (newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped") { - if newSpec.Status.PreviousNumberOfInstances > 0 { - c.logger.Infof("[lifecycle] waking up cluster %s: restoring numberOfInstances=%d", c.Name, newSpec.Status.PreviousNumberOfInstances) - - // Restore numberOfInstances from previousNumberOfInstances - newSpec.Spec.NumberOfInstances = newSpec.Status.PreviousNumberOfInstances - newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusUpdating - - // Update spec first - pgUpdated, err := c.KubeClient.UpdatePostgresCR(c.clusterName(), newSpec) - if err != nil { - return fmt.Errorf("could not update spec during wake-up: %w", err) - } - c.logger.Infof("[lifecycle] wake-up: spec updated successfully") - - // Update status separately, and 
clear previousNumberOfInstances after restore - pgUpdated.Status.PreviousNumberOfInstances = 0 // Clear after successful restore - pgUpdated.Status.PostgresClusterStatus = newSpec.Status.PostgresClusterStatus - - pgUpdated, err = c.KubeClient.SetPostgresCRDStatus(c.clusterName(), pgUpdated) - if err != nil { - return fmt.Errorf("could not update status during wake-up: %w", err) - } - c.logger.Infof("[lifecycle] wake-up: status updated successfully, previousNumberOfInstances cleared") - - c.setSpec(pgUpdated) - return nil - } + if handled { + return nil } if !c.isInMaintenanceWindow(newSpec.Spec.MaintenanceWindows) { @@ -1301,6 +1238,106 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error { return nil } +// blockLifecycleUpdate checks if an update should be blocked due to lifecycle state. +// Returns (blocked bool, err error): +// - (true, nil) if update is blocked and caller should return early +// - (false, nil) if update can proceed +// - (false, error) on error +func (c *Cluster) blockLifecycleUpdate(newSpec *acidv1.Postgresql) (bool, error) { + if !c.Status.Stopped() && !c.Status.Stopping() { + return false, nil + } + + lifecyclePhase := "" + if newSpec.Spec.Lifecycle != nil { + lifecyclePhase = newSpec.Spec.Lifecycle.Phase + } + + // During Stopping: block ALL spec changes (no cancellation allowed) + if c.Status.Stopping() { + return true, fmt.Errorf("cannot update cluster while it is stopping. Wait for it to fully stop first") + } + + // During Stopped: only block if keeping lifecycle.phase="stopped" + if lifecyclePhase == "stopped" { + return true, fmt.Errorf("cannot update cluster while stopped. Remove lifecycle.phase to wake up the cluster") + } + + return false, nil +} + +// handleHibernateAndWakeUp processes lifecycle hibernate/wake-up transitions. 
+// Returns (handled bool, err error): +// - (true, nil) if lifecycle transition was handled, Update() should return early +// - (false, nil) if no lifecycle transition, normal update continues +// - (false, error) on error +func (c *Cluster) handleHibernateAndWakeUp(newSpec *acidv1.Postgresql) (bool, error) { + // === INITIATE HIBERNATE: Running -> Stopping === + if c.Status.Running() && newSpec.Spec.Lifecycle != nil && newSpec.Spec.Lifecycle.Phase == "stopped" { + c.logger.Infof("[lifecycle] initiating hibernate for cluster %s: current numberOfInstances=%d", c.Name, c.Spec.NumberOfInstances) + + // Store previousNumberOfInstances BEFORE setting numberOfInstances to 0 + newSpec.Status.PreviousNumberOfInstances = c.Spec.NumberOfInstances + newSpec.Spec.NumberOfInstances = 0 + newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusStopping + + c.logger.Infof("[lifecycle] hibernate initiated: setting numberOfInstances=0, previousNumberOfInstances=%d", newSpec.Status.PreviousNumberOfInstances) + + // Update spec first (Update only updates spec when CR has status subresource) + pgUpdated, err := c.KubeClient.UpdatePostgresCR(c.clusterName(), newSpec) + if err != nil { + return false, fmt.Errorf("could not update spec during hibernate: %w", err) + } + c.logger.Infof("[lifecycle] hibernate: spec updated successfully") + + // Update status separately - preserve status values since UpdatePostgresCR returns object with status zeroed + pgUpdated.Status.PreviousNumberOfInstances = newSpec.Status.PreviousNumberOfInstances + pgUpdated.Status.PostgresClusterStatus = newSpec.Status.PostgresClusterStatus + + pgUpdated, err = c.KubeClient.SetPostgresCRDStatus(c.clusterName(), pgUpdated) + if err != nil { + return false, fmt.Errorf("could not update status during hibernate: %w", err) + } + c.logger.Infof("[lifecycle] hibernate: status updated successfully, previousNumberOfInstances=%d", pgUpdated.Status.PreviousNumberOfInstances) + + c.setSpec(pgUpdated) + return true, nil + } + 
+ // === WAKE-UP: Stopped -> Running === + if c.Status.Stopped() && (newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped") { + if newSpec.Status.PreviousNumberOfInstances > 0 { + c.logger.Infof("[lifecycle] waking up cluster %s: restoring numberOfInstances=%d", c.Name, newSpec.Status.PreviousNumberOfInstances) + + // Restore numberOfInstances from previousNumberOfInstances + newSpec.Spec.NumberOfInstances = newSpec.Status.PreviousNumberOfInstances + newSpec.Status.PostgresClusterStatus = acidv1.ClusterStatusUpdating + + // Update spec first + pgUpdated, err := c.KubeClient.UpdatePostgresCR(c.clusterName(), newSpec) + if err != nil { + return false, fmt.Errorf("could not update spec during wake-up: %w", err) + } + c.logger.Infof("[lifecycle] wake-up: spec updated successfully") + + // Update status separately, and clear previousNumberOfInstances after restore + pgUpdated.Status.PreviousNumberOfInstances = 0 + pgUpdated.Status.PostgresClusterStatus = newSpec.Status.PostgresClusterStatus + + pgUpdated, err = c.KubeClient.SetPostgresCRDStatus(c.clusterName(), pgUpdated) + if err != nil { + return false, fmt.Errorf("could not update status during wake-up: %w", err) + } + c.logger.Infof("[lifecycle] wake-up: status updated successfully, previousNumberOfInstances cleared") + + c.setSpec(pgUpdated) + return true, nil + } + } + + return false, nil +} + func syncResources(a, b *v1.ResourceRequirements) bool { for _, res := range []v1.ResourceName{ v1.ResourceCPU, diff --git a/pkg/cluster/lifecycle.go b/pkg/cluster/lifecycle.go index 83d58a2b9..bb27fd533 100644 --- a/pkg/cluster/lifecycle.go +++ b/pkg/cluster/lifecycle.go @@ -13,8 +13,6 @@ import ( // - Stopped -> Updating: When user clears lifecycle.phase (wake-up) // - Updating -> Running: Normal sync continues, defer sets final status func (c *Cluster) manageHibernateState(oldSpec acidv1.Postgresql, newSpec *acidv1.Postgresql) bool { - - // FIX B: Detect wake-up by comparing oldSpec status vs newSpec 
status // When Update() is called, it sets status=Updating before Sync() runs. // So we need to check if oldSpec.Status was Stopped and newSpec is Updating // with lifecycle cleared to properly detect wake-up. @@ -22,12 +20,12 @@ func (c *Cluster) manageHibernateState(oldSpec acidv1.Postgresql, newSpec *acidv newSpec.Status.PostgresClusterStatus == acidv1.ClusterStatusUpdating && (newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped") - // FIX C: Additional wake-up detection with simpler condition // If lifecycle was cleared and we have previousNumberOfInstances and numberOfInstances is 0 isWakingUpSimple := newSpec.Spec.Lifecycle == nil || newSpec.Spec.Lifecycle.Phase != "stopped" hasPreviousInstances := newSpec.Status.PreviousNumberOfInstances > 0 needsRestore := newSpec.Spec.NumberOfInstances == 0 + // double verification of waking up isWakingUp = isWakingUp || (isWakingUpSimple && hasPreviousInstances && needsRestore) // === INITIATE HIBERNATE: Running -> Stopping === @@ -75,4 +73,5 @@ func (c *Cluster) manageHibernateState(oldSpec acidv1.Postgresql, newSpec *acidv } return true -} \ No newline at end of file +} +