diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index dc50e4acf..18c246c4a 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -2,11 +2,14 @@ package controllers import ( "context" + "errors" "fmt" "reflect" + "sync" + "time" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" @@ -14,6 +17,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" configv1 "github.com/openshift/api/config/v1" "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" @@ -29,40 +33,93 @@ const ( // Controller conditions for the Cluster Operator resource cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable" cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded" + + // transientDegradedThreshold is how long transient errors must persist before + // the controller sets Degraded=True. This prevents brief + // API server blips during upgrades from immediately degrading the operator. + // Applies to both CloudConfigController and TrustedCAController. + transientDegradedThreshold = 2 * time.Minute ) type CloudConfigReconciler struct { ClusterOperatorStatusClient Scheme *runtime.Scheme FeatureGateAccess featuregates.FeatureGateAccess + failures failureWindow +} + +// failureWindow tracks consecutive transient failures. All methods are safe for concurrent use. +type failureWindow struct { + mu sync.Mutex + consecutiveFailureSince *time.Time + lastTransientFailureAt *time.Time } -func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +// clear resets the failure window. 
Call this on every successful reconcile. +func (fw *failureWindow) clear() { + fw.mu.Lock() + defer fw.mu.Unlock() + fw.consecutiveFailureSince = nil + fw.lastTransientFailureAt = nil +} + +// observe records a transient failure at now and returns the elapsed time since +// the window started plus a boolean indicating whether the window was just opened +// or restarted. staleAfter controls stale-window detection: if the gap since the +// last observed failure exceeds staleAfter, the window restarts. Pass 0 to disable. +func (fw *failureWindow) observe(now time.Time, staleAfter time.Duration) (elapsed time.Duration, started bool) { + fw.mu.Lock() + defer fw.mu.Unlock() + stale := staleAfter > 0 && fw.lastTransientFailureAt != nil && now.Sub(*fw.lastTransientFailureAt) > staleAfter + if fw.consecutiveFailureSince == nil || stale { + fw.consecutiveFailureSince = &now + fw.lastTransientFailureAt = &now + return 0, true + } + fw.lastTransientFailureAt = &now + return now.Sub(*fw.consecutiveFailureSince), false +} + +func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { klog.V(1).Infof("Syncing cloud-conf ConfigMap") + // Deferred dispatcher: classifies the returned error and calls the right handler. + // Permanent errors (wrapped with reconcile.TerminalError()) degrade immediately without requeue. + // Transient errors enter the failure window and only degrade after the threshold. + // All nil-error paths clear the failure window. 
+ defer func() { + if retErr == nil { + r.clearFailureWindow() + return + } + if errors.Is(retErr, reconcile.TerminalError(nil)) { + result, retErr = r.handleTerminalError(ctx, retErr) + } else { + result, retErr = r.handleTransientError(ctx, retErr) + } + }() + infra := &configv1.Infrastructure{} - if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil { - klog.Errorf("infrastructure resource not found") - if err := r.setDegradedCondition(ctx); err != nil { + if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) { + // No cloud platform: mirror the main controller's behaviour of returning Available. + klog.Infof("Infrastructure cluster does not exist. Skipping...") + if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } - return ctrl.Result{}, err + return ctrl.Result{}, nil + } else if err != nil { + return ctrl.Result{}, err // transient } network := &configv1.Network{} if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // nil platformStatus is a terminal misconfiguration. 
+ return ctrl.Result{}, reconcile.TerminalError(err) } if !syncNeeded { if err := r.setAvailableCondition(ctx); err != nil { @@ -74,11 +131,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus) if err != nil { + // Unsupported platform won't change without a cluster reconfigure. klog.Errorf("unable to get cloud config transformer function; unsupported platform") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, reconcile.TerminalError(err) } sourceCM := &corev1.ConfigMap{} @@ -99,14 +154,10 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) } if err := r.Get(ctx, defaultSourceCMObjectKey, sourceCM); err == nil { managedConfigFound = true - } else if errors.IsNotFound(err) { + } else if apierrors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") - } else if err != nil { - klog.Errorf("unable to get managed cloud-config for sync") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + } else { + return ctrl.Result{}, err // transient } } @@ -116,41 +167,29 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) Name: infra.Spec.CloudConfig.Name, Namespace: OpenshiftConfigNamespace, } - if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) { + if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); apierrors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to default cloud config.") } else if err != nil { - klog.Errorf("unable to get cloud-config for sync: 
%v", err) - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } } sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // User-supplied key mismatch: terminal until the ConfigMap or Infrastructure changes. + return ctrl.Result{}, reconcile.TerminalError(err) } - // Check if FeatureGateAccess is configured if r.FeatureGateAccess == nil { - klog.Errorf("FeatureGateAccess is not configured") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, fmt.Errorf("FeatureGateAccess is not configured") + // Operator misconfiguration at startup: permanent. + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("FeatureGateAccess is not configured")) } features, err := r.FeatureGateAccess.CurrentFeatureGates() if err != nil { + // The feature-gate informer may not have synced yet: transient. klog.Errorf("unable to get feature gates: %v", err) - if errD := r.setDegradedCondition(ctx); errD != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", errD) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } if cloudConfigTransformerFn != nil { @@ -159,10 +198,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // we're not expecting users to put their data in the former. 
output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // Platform-specific transform failed on the current config data: terminal. + return ctrl.Result{}, reconcile.TerminalError(err) } sourceCM.Data[defaultConfigKey] = output } @@ -174,12 +211,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // If the config does not exist, it will be created later, so we can ignore a Not Found error - if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !errors.IsNotFound(err) { - klog.Errorf("unable to get target cloud-config for sync") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !apierrors.IsNotFound(err) { + return ctrl.Result{}, err // transient } // Note that the source config map is actually a *transformed* source config map @@ -193,10 +226,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.syncCloudConfigData(ctx, sourceCM, targetCM); err != nil { klog.Errorf("unable to sync cloud config") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } if err := r.setAvailableCondition(ctx); err != nil { @@ -206,6 +236,45 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil } +// clearFailureWindow resets the transient-error tracking. 
Called by the deferred +// dispatcher in Reconcile on every successful (nil-error) return path. +func (r *CloudConfigReconciler) clearFailureWindow() { + r.failures.clear() +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. +func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + elapsed, started := r.failures.observe(r.Clock.Now(), 0) + if started { + klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) + return ctrl.Result{}, err + } + if elapsed < transientDegradedThreshold { + klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleTerminalError sets CloudConfigControllerDegraded=True immediately and +// returns nil so controller-runtime does NOT requeue. An existing watch on the +// relevant resource will re-trigger reconciliation when the problem is fixed. +// Called only from the deferred dispatcher in Reconcile. 
+func (r *CloudConfigReconciler) handleTerminalError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("CloudConfigReconciler: terminal error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, nil +} + func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) { if platformStatus == nil { return false, fmt.Errorf("platformStatus is required") @@ -279,7 +348,7 @@ func (r *CloudConfigReconciler) syncCloudConfigData(ctx context.Context, source // check if target config exists, create if not err := r.Get(ctx, client.ObjectKeyFromObject(target), &corev1.ConfigMap{}) - if err != nil && errors.IsNotFound(err) { + if err != nil && apierrors.IsNotFound(err) { return r.Create(ctx, target) } else if err != nil { return err diff --git a/pkg/controllers/cloud_config_sync_controller_test.go b/pkg/controllers/cloud_config_sync_controller_test.go index 513e944e3..dcfcd02c7 100644 --- a/pkg/controllers/cloud_config_sync_controller_test.go +++ b/pkg/controllers/cloud_config_sync_controller_test.go @@ -348,8 +348,26 @@ var _ = Describe("Cloud config sync controller", func() { }, timeout).Should(Succeed()) initialCMresourceVersion := syncedCloudConfigMap.ResourceVersion + // Introducing the consecutiveFailureWindow means that there's a field that could be racy + // between the manager calling Reconcile and the test calling Reconcile. + // In production, we only have 1 instance of the reconciler running. + // Create a fresh reconciler that is NOT registered with the manager. + // It shares the same API client (thread-safe) but has its own + // consecutiveFailureSince field, so no data race with the manager's copy. 
+ freshReconciler := &CloudConfigReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: targetNamespaceName, + }, + Scheme: scheme.Scheme, + FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting( + nil, []configv1.FeatureGateName{"AWSServiceLBNetworkSecurityGroup"}, nil, nil, + ), + } + request := reconcile.Request{NamespacedName: client.ObjectKey{Name: "foo", Namespace: "bar"}} - _, err := reconciler.Reconcile(ctx, request) + _, err := freshReconciler.Reconcile(ctx, request) Expect(err).Should(Succeed()) Expect(cl.Get(ctx, syncedConfigMapKey, syncedCloudConfigMap)).Should(Succeed()) @@ -509,7 +527,7 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(len(allCMs.Items)).To(BeEquivalentTo(1)) }) - It("should error if a user-specified configmap key isn't present", func() { + It("should degrade immediately if a user-specified configmap key isn't present", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) infraResource.Spec.CloudConfig.Key = "notfound" Expect(cl.Create(ctx, infraResource)).To(Succeed()) @@ -518,8 +536,19 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(cl.Status().Update(ctx, infraResource.DeepCopy())).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).To(ContainSubstring("specified in infra resource does not exist in source configmap")) - + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) 
}) It("should continue with reconcile when feature gates are available", func() { @@ -584,15 +613,39 @@ var _ = Describe("Cloud config sync reconciler", func() { }) }) - It("reconcile should fail if no infra resource found", func() { + It("reconcile should succeed and be available if no infra resource found", func() { _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("infrastructures.config.openshift.io \"cluster\" not found")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var availCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(configv1.ConditionTrue)) }) - It("should fail if no PlatformStatus in infra resource presented ", func() { + It("should degrade immediately if no PlatformStatus in infra resource", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) Expect(cl.Create(ctx, infraResource)).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("platformStatus is required")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) }) }) diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index 
8d8da01f7..7defae2cf 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -19,20 +19,23 @@ package controllers import ( "context" "crypto/tls" + "errors" "fmt" + "time" configv1 "github.com/openshift/api/config/v1" operatorv1 "github.com/openshift/api/operator/v1" "github.com/openshift/library-go/pkg/cloudprovider" "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" "github.com/openshift/cluster-cloud-controller-manager-operator/pkg/cloud" @@ -46,6 +49,13 @@ const ( // Condition type for Cloud Controller ownership cloudControllerOwnershipCondition = "CloudControllerOwner" + + // aggregatedTransientDegradedThreshold is how long transient errors must persist before + // the controller sets Degraded=True. + // This prevents brief API server blips during upgrades from immediately degrading the operator. + // Applies to top-level operator, and is longer in order + // to accommodate changes in the lower-level operators. 
+ aggregatedTransientDegradedThreshold = 2*time.Minute + (30 * time.Second) ) // CloudOperatorReconciler reconciles a ClusterOperator object @@ -56,6 +66,7 @@ type CloudOperatorReconciler struct { ImagesFile string FeatureGateAccess featuregates.FeatureGateAccess TLSConfig func(*tls.Config) + failures failureWindow } // +kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list;watch;create;update;patch;delete @@ -64,65 +75,68 @@ type CloudOperatorReconciler struct { // +kubebuilder:rbac:groups=config.openshift.io,resources=infrastructures,verbs=get;list;watch // Reconcile will process the cloud-controller-manager clusterOperator -func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { +func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (result ctrl.Result, retErr error) { conditionOverrides := []configv1.ClusterOperatorStatusCondition{} + // Deferred dispatcher: classifies the returned error and calls the right handler. + // Permanent errors (wrapped with reconcile.TerminalError()) degrade immediately without requeue. + // Transient errors enter the failure window and only degrade after the threshold. + // All nil-error paths clear the failure window. + defer func() { + if retErr == nil { + r.clearFailureWindow() + return + } + if errors.Is(retErr, reconcile.TerminalError(nil)) { + result, retErr = r.handleDegradeError(ctx, conditionOverrides, retErr) + } else { + result, retErr = r.handleTransientError(ctx, conditionOverrides, retErr) + } + }() + infra := &configv1.Infrastructure{} - if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { + if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) { klog.Infof("Infrastructure cluster does not exist. 
Skipping...") - if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) return ctrl.Result{}, err } - - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window } else if err != nil { klog.Errorf("Unable to retrive Infrastructure object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } + // Known limitation: when provisioningAllowed internally calls setStatusDegraded + // (e.g. a sub-controller has Degraded=True, or IsCloudProviderExternal errors), + // it returns a non-nil error. Reconcile passes that error to handleTransientError, + // which starts the 2m30s window. After the threshold, handleTransientError calls + // setStatusDegraded again — redundant but harmless, since status is already degraded. + // This is a consequence of keeping status-setting inside provisioningAllowed rather + // than pushing it into Reconcile. 
allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides) if err != nil { klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err) - return ctrl.Result{}, err + return ctrl.Result{}, err // transient; status already set inside provisioningAllowed } else if !allowedToProvision { - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window } clusterProxy := &configv1.Proxy{} - if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !errors.IsNotFound(err) { + if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !apierrors.IsNotFound(err) { klog.Errorf("Unable to retrive Proxy object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, err // transient } operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSConfig) if err != nil { klog.Errorf("Unable to build operator config %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return ctrl.Result{}, reconcile.TerminalError(err) // permanent: defer calls handleDegradeError } if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { klog.Errorf("Unable to sync operands: %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return 
ctrl.Result{}, err // transient } if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { @@ -135,7 +149,44 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) return ctrl.Result{}, err } - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window +} + +func (r *CloudOperatorReconciler) clearFailureWindow() { + r.failures.clear() +} + +// handleTransientError records the start of a failure window and degrades the +// operator only after aggregatedTransientDegradedThreshold has elapsed. It always returns +// a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. +func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + elapsed, started := r.failures.observe(r.Clock.Now(), 0) + if started { + klog.V(4).Infof("CloudOperatorReconciler: transient failure started (%v), will degrade after %s", err, aggregatedTransientDegradedThreshold) + return ctrl.Result{}, err + } + if elapsed < aggregatedTransientDegradedThreshold { + klog.V(4).Infof("CloudOperatorReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, aggregatedTransientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudOperatorReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets OperatorDegraded=True immediately and returns nil so +// controller-runtime does NOT requeue. Existing watches on Infrastructure, +// ConfigMaps, and Secrets will re-trigger reconciliation when the problem is fixed. 
+// Called only from the deferred dispatcher in Reconcile. +func (r *CloudOperatorReconciler) handleDegradeError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + klog.Errorf("CloudOperatorReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, nil // do not requeue; a watch event will re-trigger } func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) error { diff --git a/pkg/controllers/clusteroperator_controller_test.go b/pkg/controllers/clusteroperator_controller_test.go index 041670762..987b5be2d 100644 --- a/pkg/controllers/clusteroperator_controller_test.go +++ b/pkg/controllers/clusteroperator_controller_test.go @@ -2,6 +2,7 @@ package controllers import ( "context" + "fmt" "time" . 
"github.com/onsi/ginkgo/v2" @@ -618,3 +619,73 @@ var _ = Describe("Apply resources should", func() { }) }) + +var _ = Describe("CloudOperatorReconciler error handling", func() { + ctx := context.Background() + + AfterEach(func() { + co := &configv1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Eventually(func() bool { + err := cl.Delete(ctx, co) + return err == nil || apierrors.IsNotFound(err) + }).Should(BeTrue()) + } + Eventually(apierrors.IsNotFound(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue()) + }) + + It("handleDegradeError should set OperatorDegraded=True immediately and return nil error", func() { + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + _, err := reconciler.handleDegradeError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test persistent error")) + Expect(err).NotTo(HaveOccurred()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue()) + }) + + It("handleTransientError should not degrade before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + // Pre-create the ClusterOperator so that setStatusDegraded can update its status + // subresource when the threshold is exceeded (status subresource updates require the + // 
object to already exist in the cluster). + co := &configv1.ClusterOperator{} + co.SetName(clusterOperatorName) + Expect(cl.Create(ctx, co)).To(Succeed()) + + // First reconcile: transient failure starts; error returned but no degraded condition set. + _, err := reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeFalse(), + "should not be degraded before threshold") + + // Advance clock past the degraded threshold. + fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second) + + // Second reconcile: threshold exceeded, controller sets degraded. + _, err = reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue()) + }) +}) diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index ef33709e1..97ca33baa 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "crypto/x509" + "errors" "fmt" "os" @@ -43,6 +44,7 @@ type TrustedCABundleReconciler struct { ClusterOperatorStatusClient Scheme *runtime.Scheme trustBundlePath string + failures failureWindow } // isSpecTrustedCASet returns true if spec.trustedCA of proxyConfig is set. 
@@ -50,9 +52,32 @@ func isSpecTrustedCASet(proxyConfig *configv1.ProxySpec) bool {
 	return len(proxyConfig.TrustedCA.Name) > 0
 }
 
-func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) {
 	klog.V(1).Infof("%s emitted event, syncing %s ConfigMap", req, trustedCAConfigMapName)
 
+	// partialRun is set to true on the early-exit path where the event is for
+	// an unrelated ConfigMap. That path returns available=true but should NOT
+	// reset an ongoing transient failure window from a previous full reconcile.
+	partialRun := false
+
+	// Deferred dispatcher: classifies the returned error and calls the right handler.
+	// Permanent errors (wrapped with reconcile.TerminalError()) degrade immediately without requeue.
+	// Transient errors enter the failure window and only degrade after the threshold.
+	// Nil-error paths clear the failure window unless partialRun is set.
+ defer func() { + if retErr == nil { + if !partialRun { + r.clearFailureWindow() + } + return + } + if errors.Is(retErr, reconcile.TerminalError(nil)) { + result, retErr = r.handleDegradeError(ctx, retErr) + } else { + result, retErr = r.handleTransientError(ctx, retErr) + } + }() + proxyConfig := &configv1.Proxy{} if err := r.Get(ctx, types.NamespacedName{Name: proxyResourceName}, proxyConfig); err != nil { if apierrors.IsNotFound(err) { @@ -62,62 +87,89 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - return reconcile.Result{}, nil + return reconcile.Result{}, nil // defer clears failure window } - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - // Error reading the object - requeue the request. - return reconcile.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) + // Non-NotFound: transient API error. + return ctrl.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) // transient } // Check if changed config map in 'openshift-config' namespace is proxy trusted ca. - // If not, return early + // If not, return early without resetting the failure window (partialRun=true). 
if req.Namespace == OpenshiftConfigNamespace && proxyConfig.Spec.TrustedCA.Name != req.Name { + partialRun = true + klog.V(1).Infof("changed config map %s is not a proxy trusted ca, skipping", req) if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - - klog.V(1).Infof("changed config map %s is not a proxy trusted ca, skipping", req) return reconcile.Result{}, nil } systemTrustBundle, err := r.getSystemTrustBundle() if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) + // Node cert store may be updating during upgrade: transient. + return ctrl.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) // transient } proxyCABundle, mergedTrustBundle, err := r.addProxyCABundle(ctx, proxyConfig, systemTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: terminal. + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) } _, mergedTrustBundle, err = r.addCloudConfigCABundle(ctx, proxyCABundle, mergedTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: terminal. 
+ return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) } ccmTrustedConfigMap := r.makeCABundleConfigMap(mergedTrustBundle) if err := r.createOrUpdateConfigMap(ctx, ccmTrustedConfigMap); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) + return ctrl.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) // transient } if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } + return ctrl.Result{}, nil // defer clears failure window +} + +func (r *TrustedCABundleReconciler) clearFailureWindow() { + r.failures.clear() +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. +func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + // Pass transientDegradedThreshold as the stale-window threshold to detect gaps + // where no reconcile ran (e.g. a partialRun returned nil, resetting the rate limiter). 
+	elapsed, started := r.failures.observe(r.Clock.Now(), transientDegradedThreshold)
+	if started {
+		klog.V(4).Infof("TrustedCABundleReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold)
+		return ctrl.Result{}, err
+	}
+	if elapsed < transientDegradedThreshold {
+		klog.V(4).Infof("TrustedCABundleReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err)
+		return ctrl.Result{}, err
+	}
+	klog.Warningf("TrustedCABundleReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err)
+	if setErr := r.setDegradedCondition(ctx); setErr != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr)
+	}
+	return ctrl.Result{}, err
+}
+
+// handleDegradeError sets TrustedCABundleControllerDegraded=True immediately and
+// returns nil so controller-runtime does NOT requeue. An existing watch on the
+// relevant resource will re-trigger reconciliation when the problem is fixed.
+// Called only from the deferred dispatcher in Reconcile.
+func (r *TrustedCABundleReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("TrustedCABundleReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } return ctrl.Result{}, nil } diff --git a/pkg/controllers/trusted_ca_bundle_controller_test.go b/pkg/controllers/trusted_ca_bundle_controller_test.go index 2c112413f..e9c564a57 100644 --- a/pkg/controllers/trusted_ca_bundle_controller_test.go +++ b/pkg/controllers/trusted_ca_bundle_controller_test.go @@ -16,6 +16,7 @@ import ( "k8s.io/client-go/tools/record" clocktesting "k8s.io/utils/clock/testing" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/config" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -309,6 +310,153 @@ var _ = Describe("Trusted CA bundle sync controller", func() { }) }) +var _ = Describe("Trusted CA bundle reconciler unit tests", func() { + ctx := context.Background() + + AfterEach(func() { + co := &v1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Expect(cl.Delete(ctx, co)).To(Succeed()) + Eventually(func() bool { + return apierrors.IsNotFound(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)) + }).Should(BeTrue()) + } + }) + + It("reconcile should succeed and be available if no proxy resource found", func() { + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: systemCAValid, + } + + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(Succeed()) + + co := &v1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, 
co)).To(Succeed()) + var availCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(v1.ConditionTrue)) + }) + + It("stale failure window should be restarted when gap since last error exceeds threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: "/broken/ca/path.pem", // unreadable → transient error + } + + // Create a Proxy so the reconcile progresses to getSystemTrustBundle. + proxy := &v1.Proxy{ObjectMeta: metav1.ObjectMeta{Name: proxyResourceName}} + Expect(cl.Create(ctx, proxy)).To(Succeed()) + DeferCleanup(func() { _ = cl.Delete(ctx, proxy) }) + + // Step 1: First transient error; failure window opens at T0. + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + Expect(reconciler.failures.consecutiveFailureSince).NotTo(BeNil()) + + // Step 2: Advance clock past the threshold — simulates a gap with no reconciles + // (e.g., system recovered, no events fired for a long time). + fakeClock.Step(transientDegradedThreshold + time.Second) + + // Step 3: New transient error arrives. The stale-window logic should detect that + // lastTransientFailureAt is more than the threshold ago and restart the window + // from 'now', NOT degrade immediately. 
+ _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + + co := &v1.ClusterOperator{} + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), + "should not degrade immediately after a gap — window should restart") + } + } + } + // consecutiveFailureSince should now be 'now', not the original T0. + Expect(reconciler.failures.consecutiveFailureSince).NotTo(BeNil()) + Expect(fakeClock.Now().Sub(*reconciler.failures.consecutiveFailureSince)).To(BeNumerically("<", time.Second), + "window should have been restarted to ~now, not retained from original T0") + }) + + It("should not degrade on transient error before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: "/broken/ca/path.pem", // unreadable → transient error + } + + // Create a Proxy so the Proxy get succeeds and we reach the system trust bundle read. + proxy := &v1.Proxy{ObjectMeta: metav1.ObjectMeta{Name: proxyResourceName}} + Expect(cl.Create(ctx, proxy)).To(Succeed()) + DeferCleanup(func() { _ = cl.Delete(ctx, proxy) }) + + // First reconcile at T0: transient failure starts; error is returned but no degraded condition set. 
+ _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + co := &v1.ClusterOperator{} + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), "should not be degraded before threshold") + } + } + } + + // Advance clock to mid-window (half the threshold) and reconcile again to simulate + // continuous failures. This updates lastTransientFailureAt, keeping it fresh. + fakeClock.Step(transientDegradedThreshold / 2) + _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), "should not be degraded before threshold") + } + } + } + + // Advance clock so total elapsed from T0 exceeds the threshold, but the gap since the + // most recent failure (lastTransientFailureAt) is less than the threshold. This ensures + // the degradation path is taken rather than the stale-window restart path. + fakeClock.Step(transientDegradedThreshold/2 + time.Second) + + // Final reconcile: threshold exceeded, controller sets degraded. 
+ _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(v1.ConditionTrue)) + }) +}) + var _ = Describe("Trusted CA reconciler methods", func() { It("Get system CA should be fine if bundle is valid", func() { reconciler := &TrustedCABundleReconciler{