From ea1190774af976964a58ac4742a800d78521cf8c Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Tue, 3 Mar 2026 17:02:22 -0500 Subject: [PATCH 1/8] Do not set Degraded=True on transient errors CloudConfigReconciler: gate transient errors behind a 2-minute window Three related fixes to stop upgrade-time API blips from immediately setting CloudConfigControllerDegraded=True: 1. Infrastructure NotFound now calls setAvailableCondition (nil return) instead of setDegradedCondition, matching the main controller's existing behaviour. 2. Errors are classified as transient (API blips: all Get/Create/Update calls, feature-gate informer not yet synced) or permanent (config problems that won't self-heal: nil platformStatus, unsupported platform, missing user config key, nil FeatureGateAccess, transform failure). 3. handleTransientError() only sets degraded after consecutiveFailureSince has been set for longer than transientDegradedThreshold (2 min); handleDegradeError() sets degraded immediately and returns nil so controller-runtime does not requeue (existing watches re-trigger when the underlying config changes). clearFailureWindow() is called at every successful reconcile exit. 
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Nolan Brubaker --- .../cloud_config_sync_controller.go | 133 ++++++++++-------- .../cloud_config_sync_controller_test.go | 49 ++++++- 2 files changed, 119 insertions(+), 63 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index dc50e4acf..30077cf66 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "reflect" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -29,42 +30,48 @@ const ( // Controller conditions for the Cluster Operator resource cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable" cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded" + + // transientDegradedThreshold is how long transient errors must persist before + // the controller sets CloudConfigControllerDegraded=True. This prevents brief + // API server blips during upgrades from immediately degrading the operator. 
+ transientDegradedThreshold = 2 * time.Minute ) type CloudConfigReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - FeatureGateAccess featuregates.FeatureGateAccess + Scheme *runtime.Scheme + FeatureGateAccess featuregates.FeatureGateAccess + consecutiveFailureSince *time.Time // nil when the last reconcile succeeded } func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { klog.V(1).Infof("Syncing cloud-conf ConfigMap") infra := &configv1.Infrastructure{} - if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil { - klog.Errorf("infrastructure resource not found") - if err := r.setDegradedCondition(ctx); err != nil { + if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { + // No cloud platform: mirror the main controller's behaviour of returning Available. + klog.Infof("Infrastructure cluster does not exist. Skipping...") + r.clearFailureWindow() + if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } - return ctrl.Result{}, err + return ctrl.Result{}, nil + } else if err != nil { + return r.handleTransientError(ctx, err) } network := &configv1.Network{} if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // nil 
platformStatus is a permanent misconfiguration. + return r.handleDegradeError(ctx, err) } if !syncNeeded { + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } @@ -74,11 +81,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus) if err != nil { + // Unsupported platform won't change without a cluster reconfigure. klog.Errorf("unable to get cloud config transformer function; unsupported platform") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleDegradeError(ctx, err) } sourceCM := &corev1.ConfigMap{} @@ -101,12 +106,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) managedConfigFound = true } else if errors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") - } else if err != nil { - klog.Errorf("unable to get managed cloud-config for sync") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + } else { + return r.handleTransientError(ctx, err) } } @@ -119,38 +120,26 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to default cloud config.") } else if err != nil { - klog.Errorf("unable to get cloud-config for sync: %v", err) - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for 
cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } } sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // User-supplied key mismatch: permanent until the ConfigMap or Infrastructure changes. + return r.handleDegradeError(ctx, err) } - // Check if FeatureGateAccess is configured if r.FeatureGateAccess == nil { - klog.Errorf("FeatureGateAccess is not configured") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, fmt.Errorf("FeatureGateAccess is not configured") + // Operator misconfiguration at startup: permanent. + return r.handleDegradeError(ctx, fmt.Errorf("FeatureGateAccess is not configured")) } features, err := r.FeatureGateAccess.CurrentFeatureGates() if err != nil { + // The feature-gate informer may not have synced yet: transient. klog.Errorf("unable to get feature gates: %v", err) - if errD := r.setDegradedCondition(ctx); errD != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", errD) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } if cloudConfigTransformerFn != nil { @@ -159,10 +148,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // we're not expecting users to put their data in the former. 
output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + // Platform-specific transform failed on the current config data: permanent. + return r.handleDegradeError(ctx, err) } sourceCM.Data[defaultConfigKey] = output } @@ -175,16 +162,13 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // If the config does not exist, it will be created later, so we can ignore a Not Found error if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !errors.IsNotFound(err) { - klog.Errorf("unable to get target cloud-config for sync") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } // Note that the source config map is actually a *transformed* source config map if r.isCloudConfigEqual(sourceCM, targetCM) { klog.V(1).Infof("source and target cloud-config content are equal, no sync needed") + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } @@ -193,12 +177,10 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.syncCloudConfigData(ctx, sourceCM, targetCM); err != nil { klog.Errorf("unable to sync cloud config") - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, err) } + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed 
to set conditions for cloud config controller: %v", err) } @@ -206,6 +188,45 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil } +// clearFailureWindow resets the transient-error tracking. Call this on every +// successful reconcile so the 2-minute window restarts fresh on the next failure. +func (r *CloudConfigReconciler) clearFailureWindow() { + r.consecutiveFailureSince = nil +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. +func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + now := r.Clock.Now() + if r.consecutiveFailureSince == nil { + r.consecutiveFailureSince = &now + klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) + return ctrl.Result{}, err + } + elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) + if elapsed < transientDegradedThreshold { + klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets CloudConfigControllerDegraded=True immediately and +// returns nil so controller-runtime does NOT requeue. An existing watch on the +// relevant resource will re-trigger reconciliation when the problem is fixed. 
+func (r *CloudConfigReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("CloudConfigReconciler: permanent error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, nil +} + func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) { if platformStatus == nil { return false, fmt.Errorf("platformStatus is required") diff --git a/pkg/controllers/cloud_config_sync_controller_test.go b/pkg/controllers/cloud_config_sync_controller_test.go index 513e944e3..5e1a6ce28 100644 --- a/pkg/controllers/cloud_config_sync_controller_test.go +++ b/pkg/controllers/cloud_config_sync_controller_test.go @@ -509,7 +509,7 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(len(allCMs.Items)).To(BeEquivalentTo(1)) }) - It("should error if a user-specified configmap key isn't present", func() { + It("should degrade immediately if a user-specified configmap key isn't present", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) infraResource.Spec.CloudConfig.Key = "notfound" Expect(cl.Create(ctx, infraResource)).To(Succeed()) @@ -518,8 +518,19 @@ var _ = Describe("Cloud config sync reconciler", func() { Expect(cl.Status().Update(ctx, infraResource.DeepCopy())).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).To(ContainSubstring("specified in infra resource does not exist in source configmap")) - + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == 
cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) }) It("should continue with reconcile when feature gates are available", func() { @@ -584,15 +595,39 @@ var _ = Describe("Cloud config sync reconciler", func() { }) }) - It("reconcile should fail if no infra resource found", func() { + It("reconcile should succeed and be available if no infra resource found", func() { _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("infrastructures.config.openshift.io \"cluster\" not found")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var availCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(configv1.ConditionTrue)) }) - It("should fail if no PlatformStatus in infra resource presented ", func() { + It("should degrade immediately if no PlatformStatus in infra resource", func() { infraResource := makeInfrastructureResource(configv1.AWSPlatformType) Expect(cl.Create(ctx, infraResource)).To(Succeed()) _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) - Expect(err.Error()).Should(BeEquivalentTo("platformStatus is required")) + Expect(err).To(Succeed()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + 
Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) }) }) From 48723ac4d89b4846b494ceb1636694783e22ea0f Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Wed, 4 Mar 2026 09:48:45 -0500 Subject: [PATCH 2/8] Handle transient errors for CA controller Mirror the CloudConfigReconciler pattern: transient API errors (Proxy get, system trust bundle read, ConfigMap write) are silently requeued and only set Degraded=True after transientDegradedThreshold (2 min) has elapsed. Errors that indicate corrupt cert data (merge failures) set Degraded=True immediately and return nil so controller-runtime does not requeue; existing watches re-trigger reconciliation when the data changes. Also adds two direct unit tests that verify the threshold gating via a fake clock, without running through the manager. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Nolan Brubaker --- .../trusted_ca_bundle_controller.go | 79 ++++++++++++----- .../trusted_ca_bundle_controller_test.go | 87 +++++++++++++++++++ 2 files changed, 142 insertions(+), 24 deletions(-) diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index ef33709e1..e6e788548 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -6,6 +6,7 @@ import ( "crypto/x509" "fmt" "os" + "time" "github.com/openshift/api/annotations" configv1 "github.com/openshift/api/config/v1" @@ -41,8 +42,9 @@ const ( type TrustedCABundleReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - trustBundlePath string + Scheme *runtime.Scheme + trustBundlePath string + consecutiveFailureSince *time.Time // nil when the last reconcile succeeded } // isSpecTrustedCASet returns true if spec.trustedCA of proxyConfig is set. 
@@ -59,16 +61,14 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ // Request object not found, could have been deleted after reconcile request. // Return and don't requeue klog.Infof("proxy not found; reconciliation will be skipped") + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } return reconcile.Result{}, nil } - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - // Error reading the object - requeue the request. - return reconcile.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) + // Non-NotFound: transient API error. + return r.handleTransientError(ctx, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err)) } // Check if changed config map in 'openshift-config' namespace is proxy trusted ca. @@ -77,43 +77,37 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - + // Note: clearFailureWindow is intentionally NOT called here. This path did not + // exercise the full reconcile logic, so an ongoing transient failure window + // (set by a previous reconcile pass) should not be reset. klog.V(1).Infof("changed config map %s is not a proxy trusted ca, skipping", req) return reconcile.Result{}, nil } systemTrustBundle, err := r.getSystemTrustBundle() if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) + // Node cert store may be updating during upgrade: transient. 
+ return r.handleTransientError(ctx, fmt.Errorf("failed to get system trust bundle: %v", err)) } proxyCABundle, mergedTrustBundle, err := r.addProxyCABundle(ctx, proxyConfig, systemTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: persistent. + return r.handleDegradeError(ctx, fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) } _, mergedTrustBundle, err = r.addCloudConfigCABundle(ctx, proxyCABundle, mergedTrustBundle) if err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err) + // Combined cert bundle is corrupt: persistent. 
+ return r.handleDegradeError(ctx, fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) } ccmTrustedConfigMap := r.makeCABundleConfigMap(mergedTrustBundle) if err := r.createOrUpdateConfigMap(ctx, ccmTrustedConfigMap); err != nil { - if err := r.setDegradedCondition(ctx); err != nil { - return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) - } - return reconcile.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) + return r.handleTransientError(ctx, fmt.Errorf("can not update target trust bundle configmap: %v", err)) } + r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } @@ -121,6 +115,43 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, nil } +func (r *TrustedCABundleReconciler) clearFailureWindow() { + r.consecutiveFailureSince = nil +} + +// handleTransientError records the start of a failure window and degrades the +// controller only after transientDegradedThreshold has elapsed. It always +// returns a non-nil error so controller-runtime requeues with exponential backoff. 
+func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { + now := r.Clock.Now() + if r.consecutiveFailureSince == nil { + r.consecutiveFailureSince = &now + klog.V(4).Infof("TrustedCABundleReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) + return ctrl.Result{}, err + } + elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) + if elapsed < transientDegradedThreshold { + klog.V(4).Infof("TrustedCABundleReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("TrustedCABundleReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets TrustedCABundleControllerControllerDegraded=True immediately and +// returns nil so controller-runtime does NOT requeue. An existing watch on the +// relevant resource will re-trigger reconciliation when the problem is fixed. +func (r *TrustedCABundleReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("TrustedCABundleReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setDegradedCondition(ctx); setErr != nil { + return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) + } + return ctrl.Result{}, nil +} + // addProxyCABundle checks ca bundle referred by Proxy resource and adds it to passed bundle // in case if proxy one is valid. // This function returns added bundle as first value, result as second and an error if it was occurred. 
diff --git a/pkg/controllers/trusted_ca_bundle_controller_test.go b/pkg/controllers/trusted_ca_bundle_controller_test.go index 2c112413f..db1be8c07 100644 --- a/pkg/controllers/trusted_ca_bundle_controller_test.go +++ b/pkg/controllers/trusted_ca_bundle_controller_test.go @@ -16,6 +16,7 @@ import ( "k8s.io/client-go/tools/record" clocktesting "k8s.io/utils/clock/testing" "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/config" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -309,6 +310,92 @@ var _ = Describe("Trusted CA bundle sync controller", func() { }) }) +var _ = Describe("Trusted CA bundle reconciler unit tests", func() { + ctx := context.Background() + + AfterEach(func() { + co := &v1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Expect(cl.Delete(ctx, co)).To(Succeed()) + Eventually(func() bool { + return apierrors.IsNotFound(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)) + }).Should(BeTrue()) + } + }) + + It("reconcile should succeed and be available if no proxy resource found", func() { + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: systemCAValid, + } + + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(Succeed()) + + co := &v1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var availCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerAvailableCondition { + availCond = &co.Status.Conditions[i] + break + } + } + Expect(availCond).NotTo(BeNil()) + Expect(availCond.Status).To(Equal(v1.ConditionTrue)) + }) + + It("should not degrade on transient 
error before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: "/broken/ca/path.pem", // unreadable → transient error + } + + // Create a Proxy so the Proxy get succeeds and we reach the system trust bundle read. + proxy := &v1.Proxy{ObjectMeta: metav1.ObjectMeta{Name: proxyResourceName}} + Expect(cl.Create(ctx, proxy)).To(Succeed()) + DeferCleanup(func() { _ = cl.Delete(ctx, proxy) }) + + // First reconcile: transient failure starts; error is returned but no degraded condition set. + _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + co := &v1.ClusterOperator{} + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), "should not be degraded before threshold") + } + } + } + + // Advance clock past the degraded threshold. + fakeClock.Step(transientDegradedThreshold + time.Second) + + // Second reconcile: threshold exceeded, controller sets degraded. 
+ _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *v1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == trustedCABundleControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil()) + Expect(degradedCond.Status).To(Equal(v1.ConditionTrue)) + }) +}) + var _ = Describe("Trusted CA reconciler methods", func() { It("Get system CA should be fine if bundle is valid", func() { reconciler := &TrustedCABundleReconciler{ From b139f7e664387b216ab460713919756c8b7b3d39 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Wed, 4 Mar 2026 17:16:48 -0500 Subject: [PATCH 3/8] Fix transient error handling in CloudOperatorReconciler and tests Add handleTransientError/handleDegradeError methods to CloudOperatorReconciler with an aggregatedTransientDegradedThreshold of 2m30s (longer than the sub-controller threshold of 2m, to accommodate sub-controller recovery time). Fix test: handleTransientError test was stepping the clock by transientDegradedThreshold (2m, the sub-controller constant) instead of aggregatedTransientDegradedThreshold (2m30s), so the threshold was never exceeded and the degraded condition was never set. 
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: Nolan Brubaker --- .../cloud_config_sync_controller.go | 6 +- pkg/controllers/clusteroperator_controller.go | 94 +++++++++++++------ .../clusteroperator_controller_test.go | 71 ++++++++++++++ .../trusted_ca_bundle_controller.go | 3 +- 4 files changed, 140 insertions(+), 34 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index 30077cf66..caa4e237f 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -32,8 +32,9 @@ const ( cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded" // transientDegradedThreshold is how long transient errors must persist before - // the controller sets CloudConfigControllerDegraded=True. This prevents brief + // the controller sets Degraded=True. This prevents brief // API server blips during upgrades from immediately degrading the operator. + // Applies to both CloudConfigController and TrustedCAController. transientDegradedThreshold = 2 * time.Minute ) @@ -51,10 +52,11 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { // No cloud platform: mirror the main controller's behaviour of returning Available. klog.Infof("Infrastructure cluster does not exist. Skipping...") - r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } + // Skip if the infrastructure resource doesn't exist. 
+ r.clearFailureWindow() + return ctrl.Result{}, nil } else if err != nil { return r.handleTransientError(ctx, err) } diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index 8d8da01f7..ddfc3e910 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -20,6 +20,7 @@ import ( "context" "crypto/tls" "fmt" + "time" configv1 "github.com/openshift/api/config/v1" operatorv1 "github.com/openshift/api/operator/v1" @@ -46,16 +47,24 @@ const ( // Condition type for Cloud Controller ownership cloudControllerOwnershipCondition = "CloudControllerOwner" + + // aggregatedTransientDegradedThreshold is how long transient errors must persist before + // the controller sets Degraded=True. + // This prevents brief API server blips during upgrades from immediately degrading the operator. + // Applies to top-level operator, and is longer in order + // to accommodate changes in the lower-level operators. 
+ aggregatedTransientDegradedThreshold = 2*time.Minute + (30 * time.Second) ) // CloudOperatorReconciler reconciles a ClusterOperator object type CloudOperatorReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - watcher ObjectWatcher - ImagesFile string - FeatureGateAccess featuregates.FeatureGateAccess - TLSConfig func(*tls.Config) + Scheme *runtime.Scheme + watcher ObjectWatcher + ImagesFile string + FeatureGateAccess featuregates.FeatureGateAccess + TLSConfig func(*tls.Config) + consecutiveFailureSince *time.Time // nil when the last reconcile succeeded } // +kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list;watch;create;update;patch;delete @@ -70,59 +79,43 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) infra := &configv1.Infrastructure{} if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { klog.Infof("Infrastructure cluster does not exist. 
Skipping...") - if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } - + // It's ok for the infrastructure cluster to not exist + r.clearFailureWindow() return ctrl.Result{}, nil } else if err != nil { klog.Errorf("Unable to retrive Infrastructure object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides) if err != nil { klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err) - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } else if !allowedToProvision { + // We're not allowed to provision, but didn't have any failures. 
+ r.clearFailureWindow() return ctrl.Result{}, nil } clusterProxy := &configv1.Proxy{} if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !errors.IsNotFound(err) { klog.Errorf("Unable to retrive Proxy object: %v", err) - - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSConfig) if err != nil { klog.Errorf("Unable to build operator config %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleDegradeError(ctx, conditionOverrides, err) } if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { klog.Errorf("Unable to sync operands: %s", err) - if err := r.setStatusDegraded(ctx, err, conditionOverrides); err != nil { - klog.Errorf("Error syncing ClusterOperatorStatus: %v", err) - return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", err) - } - return ctrl.Result{}, err + return r.handleTransientError(ctx, conditionOverrides, err) } if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { @@ -135,9 +128,48 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) return ctrl.Result{}, err } + // successful reconcile, make sure the failure window is cleared. 
+ r.clearFailureWindow() return ctrl.Result{}, nil } +func (r *CloudOperatorReconciler) clearFailureWindow() { + r.consecutiveFailureSince = nil +} + +// handleTransientError records the start of a failure window and degrades the +// operator only after aggregatedTransientDegradedThreshold has elapsed. It always returns +// a non-nil error so controller-runtime requeues with exponential backoff. +func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + now := r.Clock.Now() + if r.consecutiveFailureSince == nil { + r.consecutiveFailureSince = &now + klog.V(4).Infof("CloudOperatorReconciler: transient failure started (%v), will degrade after %s", err, aggregatedTransientDegradedThreshold) + return ctrl.Result{}, err + } + elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) + if elapsed < aggregatedTransientDegradedThreshold { + klog.V(4).Infof("CloudOperatorReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, aggregatedTransientDegradedThreshold, err) + return ctrl.Result{}, err + } + klog.Warningf("CloudOperatorReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, err +} + +// handleDegradeError sets OperatorDegraded=True immediately and returns nil so +// controller-runtime does NOT requeue. Existing watches on Infrastructure, +// ConfigMaps, and Secrets will re-trigger reconciliation when the problem is fixed. 
+func (r *CloudOperatorReconciler) handleDegradeError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { + klog.Errorf("CloudOperatorReconciler: persistent error, setting degraded: %v", err) + if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { + return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %v", setErr) + } + return ctrl.Result{}, nil // do not requeue; a watch event will re-trigger +} + func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) error { // Deploy resources for platform resources, err := cloud.GetResources(config) diff --git a/pkg/controllers/clusteroperator_controller_test.go b/pkg/controllers/clusteroperator_controller_test.go index 041670762..987b5be2d 100644 --- a/pkg/controllers/clusteroperator_controller_test.go +++ b/pkg/controllers/clusteroperator_controller_test.go @@ -2,6 +2,7 @@ package controllers import ( "context" + "fmt" "time" . 
"github.com/onsi/ginkgo/v2" @@ -618,3 +619,73 @@ var _ = Describe("Apply resources should", func() { }) }) + +var _ = Describe("CloudOperatorReconciler error handling", func() { + ctx := context.Background() + + AfterEach(func() { + co := &configv1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Eventually(func() bool { + err := cl.Delete(ctx, co) + return err == nil || apierrors.IsNotFound(err) + }).Should(BeTrue()) + } + Eventually(apierrors.IsNotFound(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co))).Should(BeTrue()) + }) + + It("handleDegradeError should set OperatorDegraded=True immediately and return nil error", func() { + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + _, err := reconciler.handleDegradeError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test persistent error")) + Expect(err).NotTo(HaveOccurred()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue()) + }) + + It("handleTransientError should not degrade before threshold, but degrade after threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + // Pre-create the ClusterOperator so that setStatusDegraded can update its status + // subresource when the threshold is exceeded (status subresource updates require the + // 
object to already exist in the cluster). + co := &configv1.ClusterOperator{} + co.SetName(clusterOperatorName) + Expect(cl.Create(ctx, co)).To(Succeed()) + + // First reconcile: transient failure starts; error returned but no degraded condition set. + _, err := reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeFalse(), + "should not be degraded before threshold") + + // Advance clock past the degraded threshold. + fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second) + + // Second reconcile: threshold exceeded, controller sets degraded. + _, err = reconciler.handleTransientError(ctx, []configv1.ClusterOperatorStatusCondition{}, fmt.Errorf("test transient error")) + Expect(err).To(HaveOccurred()) + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeTrue()) + }) +}) diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index e6e788548..5921448d7 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -61,10 +61,11 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ // Request object not found, could have been deleted after reconcile request. // Return and don't requeue klog.Infof("proxy not found; reconciliation will be skipped") - r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } + // We tolerate the proxy config not being found. 
+ r.clearFailureWindow() return reconcile.Result{}, nil } // Non-NotFound: transient API error. From bda0a0c2a9c114a836c38a14868b46629eed2660 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Thu, 5 Mar 2026 16:46:41 -0500 Subject: [PATCH 4/8] Fix race on consecutiveFailureSince in cloud config sync test The test "config should not be updated if source and target config content are identical" called reconciler.Reconcile() directly while the manager was also running the same reconciler in a background goroutine. Both goroutines could access consecutiveFailureSince concurrently, which the Go race detector flags. Use a fresh CloudConfigReconciler instance (not registered with the manager) for the direct call. It shares the thread-safe API client but owns its own consecutiveFailureSince field, so there is no shared mutable state with the manager's copy. Co-Authored-By: Claude Sonnet 4.6 --- .../cloud_config_sync_controller_test.go | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pkg/controllers/cloud_config_sync_controller_test.go b/pkg/controllers/cloud_config_sync_controller_test.go index 5e1a6ce28..dcfcd02c7 100644 --- a/pkg/controllers/cloud_config_sync_controller_test.go +++ b/pkg/controllers/cloud_config_sync_controller_test.go @@ -348,8 +348,26 @@ var _ = Describe("Cloud config sync controller", func() { }, timeout).Should(Succeed()) initialCMresourceVersion := syncedCloudConfigMap.ResourceVersion + // Introducing the consecutiveFailureWindow means that there's a field that could be racy + // between the manager calling Reconcile and the test calling Reconcile. + // In production, we only have 1 instance of the reconciler running. + // Create a fresh reconciler that is NOT registered with the manager. + // It shares the same API client (thread-safe) but has its own + // consecutiveFailureSince field, so no data race with the manager's copy. 
+ freshReconciler := &CloudConfigReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: targetNamespaceName, + }, + Scheme: scheme.Scheme, + FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting( + nil, []configv1.FeatureGateName{"AWSServiceLBNetworkSecurityGroup"}, nil, nil, + ), + } + request := reconcile.Request{NamespacedName: client.ObjectKey{Name: "foo", Namespace: "bar"}} - _, err := reconciler.Reconcile(ctx, request) + _, err := freshReconciler.Reconcile(ctx, request) Expect(err).Should(Succeed()) Expect(cl.Get(ctx, syncedConfigMapKey, syncedCloudConfigMap)).Should(Succeed()) From fef2652c4ad84ccfd9ac6f12321e0a196de123ef Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Thu, 19 Mar 2026 16:32:12 -0400 Subject: [PATCH 5/8] Fix stale failure window in TrustedCABundleReconciler Adds `lastTransientFailureAt` field and stale-window restart logic to `handleTransientError`. Without this, a transient error that set `consecutiveFailureSince` at T0, followed by a partial-run reconcile (unrelated ConfigMap event) that returned nil without clearing the window, would cause the next transient error (days/weeks later) to degrade immediately because `elapsed = now - T0 >> threshold`. The fix tracks when the most recent transient error was observed. If the gap since the last observed failure exceeds `transientDegradedThreshold`, the window is treated as stale and restarted from the current time. Also updates the Test 2 unit test to add an intermediate reconcile call (simulating continuous failures), keeping `lastTransientFailureAt` fresh so the degradation path is taken rather than the stale-window restart path. Adds Test 3 (stale-window restart) as specified in the plan. 
Co-Authored-By: Claude Sonnet 4.6 --- .../trusted_ca_bundle_controller.go | 16 ++++- .../trusted_ca_bundle_controller_test.go | 69 +++++++++++++++++-- 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index 5921448d7..60418b5e9 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -45,6 +45,7 @@ type TrustedCABundleReconciler struct { Scheme *runtime.Scheme trustBundlePath string consecutiveFailureSince *time.Time // nil when the last reconcile succeeded + lastTransientFailureAt *time.Time // when the most recent transient error was observed } // isSpecTrustedCASet returns true if spec.trustedCA of proxyConfig is set. @@ -118,6 +119,7 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ func (r *TrustedCABundleReconciler) clearFailureWindow() { r.consecutiveFailureSince = nil + r.lastTransientFailureAt = nil } // handleTransientError records the start of a failure window and degrades the @@ -125,12 +127,22 @@ func (r *TrustedCABundleReconciler) clearFailureWindow() { // returns a non-nil error so controller-runtime requeues with exponential backoff. func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { now := r.Clock.Now() - if r.consecutiveFailureSince == nil { + + // Start or restart the failure window. + // Restart if: (a) no window is open, OR (b) the last observed failure was more than + // transientDegradedThreshold ago (stale window). Case (b) handles the scenario where + // a partialRun reconcile returned nil (no requeue) after the system recovered, leaving + // consecutiveFailureSince set but no subsequent successful full reconcile to clear it. 
+ staleWindow := r.lastTransientFailureAt != nil && now.Sub(*r.lastTransientFailureAt) > transientDegradedThreshold + if r.consecutiveFailureSince == nil || staleWindow { r.consecutiveFailureSince = &now + r.lastTransientFailureAt = &now klog.V(4).Infof("TrustedCABundleReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) return ctrl.Result{}, err } - elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) + + r.lastTransientFailureAt = &now + elapsed := now.Sub(*r.consecutiveFailureSince) if elapsed < transientDegradedThreshold { klog.V(4).Infof("TrustedCABundleReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) return ctrl.Result{}, err diff --git a/pkg/controllers/trusted_ca_bundle_controller_test.go b/pkg/controllers/trusted_ca_bundle_controller_test.go index db1be8c07..4b7040a03 100644 --- a/pkg/controllers/trusted_ca_bundle_controller_test.go +++ b/pkg/controllers/trusted_ca_bundle_controller_test.go @@ -349,6 +349,52 @@ var _ = Describe("Trusted CA bundle reconciler unit tests", func() { Expect(availCond.Status).To(Equal(v1.ConditionTrue)) }) + It("stale failure window should be restarted when gap since last error exceeds threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + reconciler := &TrustedCABundleReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + trustBundlePath: "/broken/ca/path.pem", // unreadable → transient error + } + + // Create a Proxy so the reconcile progresses to getSystemTrustBundle. + proxy := &v1.Proxy{ObjectMeta: metav1.ObjectMeta{Name: proxyResourceName}} + Expect(cl.Create(ctx, proxy)).To(Succeed()) + DeferCleanup(func() { _ = cl.Delete(ctx, proxy) }) + + // Step 1: First transient error; failure window opens at T0. 
+ _, err := reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + Expect(reconciler.consecutiveFailureSince).NotTo(BeNil()) + + // Step 2: Advance clock past the threshold — simulates a gap with no reconciles + // (e.g., system recovered, no events fired for a long time). + fakeClock.Step(transientDegradedThreshold + time.Second) + + // Step 3: New transient error arrives. The stale-window logic should detect that + // lastTransientFailureAt is more than the threshold ago and restart the window + // from 'now', NOT degrade immediately. + _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + + co := &v1.ClusterOperator{} + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), + "should not degrade immediately after a gap — window should restart") + } + } + } + // consecutiveFailureSince should now be 'now', not the original T0. + Expect(reconciler.consecutiveFailureSince).NotTo(BeNil()) + Expect(fakeClock.Now().Sub(*reconciler.consecutiveFailureSince)).To(BeNumerically("<", time.Second), + "window should have been restarted to ~now, not retained from original T0") + }) + It("should not degrade on transient error before threshold, but degrade after threshold", func() { fakeClock := clocktesting.NewFakeClock(time.Now()) reconciler := &TrustedCABundleReconciler{ @@ -365,7 +411,7 @@ var _ = Describe("Trusted CA bundle reconciler unit tests", func() { Expect(cl.Create(ctx, proxy)).To(Succeed()) DeferCleanup(func() { _ = cl.Delete(ctx, proxy) }) - // First reconcile: transient failure starts; error is returned but no degraded condition set. + // First reconcile at T0: transient failure starts; error is returned but no degraded condition set. 
_, err := reconciler.Reconcile(ctx, ctrl.Request{}) Expect(err).To(HaveOccurred()) co := &v1.ClusterOperator{} @@ -377,10 +423,25 @@ var _ = Describe("Trusted CA bundle reconciler unit tests", func() { } } - // Advance clock past the degraded threshold. - fakeClock.Step(transientDegradedThreshold + time.Second) + // Advance clock to mid-window (half the threshold) and reconcile again to simulate + // continuous failures. This updates lastTransientFailureAt, keeping it fresh. + fakeClock.Step(transientDegradedThreshold / 2) + _, err = reconciler.Reconcile(ctx, ctrl.Request{}) + Expect(err).To(HaveOccurred()) + if getErr := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); getErr == nil { + for _, cond := range co.Status.Conditions { + if cond.Type == trustedCABundleControllerDegradedCondition { + Expect(cond.Status).NotTo(Equal(v1.ConditionTrue), "should not be degraded before threshold") + } + } + } + + // Advance clock so total elapsed from T0 exceeds the threshold, but the gap since the + // most recent failure (lastTransientFailureAt) is less than the threshold. This ensures + // the degradation path is taken rather than the stale-window restart path. + fakeClock.Step(transientDegradedThreshold/2 + time.Second) - // Second reconcile: threshold exceeded, controller sets degraded. + // Final reconcile: threshold exceeded, controller sets degraded. _, err = reconciler.Reconcile(ctx, ctrl.Request{}) Expect(err).To(HaveOccurred()) Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) From 8560bba27f6ea8ff3e39e46e5c9a5709ed74b64f Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Thu, 19 Mar 2026 16:33:05 -0400 Subject: [PATCH 6/8] Document known limitations in transient error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds inline comments for three known limitations identified in the QC review: 1. 
CloudConfigReconciler: clearFailureWindow is called before setAvailableCondition on success paths, so persistent status-write failures never accumulate toward the degraded threshold (intentional design choice). 2. CloudOperatorReconciler: setStatusAvailable and clearCloudControllerOwnerCondition failures bypass handleTransientError since these run after successful sync work; persistent status-subresource failures will not set OperatorDegraded=True. 3. CloudOperatorReconciler: provisioningAllowed may call setStatusDegraded internally before returning an error, causing a redundant second setStatusDegraded call from handleTransientError after the threshold — harmless but noted. Co-Authored-By: Claude Sonnet 4.6 --- pkg/controllers/cloud_config_sync_controller.go | 10 ++++++++++ pkg/controllers/clusteroperator_controller.go | 14 ++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index caa4e237f..d23a13e45 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -192,6 +192,16 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // clearFailureWindow resets the transient-error tracking. Call this on every // successful reconcile so the 2-minute window restarts fresh on the next failure. +// +// Known limitation: clearFailureWindow is called *before* setAvailableCondition +// on all success paths. If setAvailableCondition subsequently fails (e.g. the +// ClusterOperator status subresource is unreachable), the failure window has +// already been cleared but a raw error is returned without calling +// handleTransientError. Controller-runtime requeues, but repeated status-write +// failures never accumulate toward the degraded threshold, so a persistent +// status-subresource outage will not cause CloudConfigControllerDegraded=True. 
+// This is intentional: cloud-config sync errors and status-write errors are +// distinct failure modes and are not conflated here. func (r *CloudConfigReconciler) clearFailureWindow() { r.consecutiveFailureSince = nil } diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index ddfc3e910..69b2097df 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -91,6 +91,13 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) return r.handleTransientError(ctx, conditionOverrides, err) } + // Known limitation: when provisioningAllowed internally calls setStatusDegraded + // (e.g. a sub-controller has Degraded=True, or IsCloudProviderExternal errors), + // it returns a non-nil error. Reconcile passes that error to handleTransientError, + // which starts the 2m30s window. After the threshold, handleTransientError calls + // setStatusDegraded again — redundant but harmless, since status is already degraded. + // This is a consequence of keeping status-setting inside provisioningAllowed rather + // than pushing it into Reconcile. allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides) if err != nil { klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err) @@ -118,6 +125,13 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) return r.handleTransientError(ctx, conditionOverrides, err) } + // Known limitation: setStatusAvailable and clearCloudControllerOwnerCondition + // failures return raw errors without calling handleTransientError. These run + // after the actual reconcile work has completed successfully, so their errors + // are not counted as "cloud controller sync failures". If the status subresource + // or condition-cleanup becomes persistently broken, OperatorDegraded=True will + // not be set. 
Controller-runtime requeues on any non-nil error, so recovery + // still happens; it just does not trigger the degraded threshold. if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) return ctrl.Result{}, err From 64f196319556e7481b5d1e6408a14c75d6f6df66 Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Thu, 19 Mar 2026 17:03:25 -0400 Subject: [PATCH 7/8] Refactor Reconcile methods to use deferred dispatcher pattern Adds permanentError / permanent() / isPermanent() helpers and refactors all three Reconcile methods to use named return values with a single deferred dispatcher, rather than calling handleTransientError / handleDegradeError at each error site. Each error site now returns ctrl.Result{}, err (transient) or ctrl.Result{}, permanent(err) (permanent). The defer inspects the returned error and routes it to the correct handler, also calling clearFailureWindow() on every nil-error return path. Benefits over the direct-call approach: - Single error-classification decision point per reconciler - No risk of forgetting to call clearFailureWindow() on a new success path - Permanent vs. 
transient classification is explicit at the call site via the permanent() wrapper rather than implicit in which handler is called Co-Authored-By: Claude Sonnet 4.6 --- .../cloud_config_sync_controller.go | 80 ++++++++++++------- pkg/controllers/clusteroperator_controller.go | 51 ++++++------ .../trusted_ca_bundle_controller.go | 56 ++++++++----- 3 files changed, 116 insertions(+), 71 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index d23a13e45..179641fb6 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -45,9 +45,42 @@ type CloudConfigReconciler struct { consecutiveFailureSince *time.Time // nil when the last reconcile succeeded } -func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +// permanentError marks an error as a configuration problem that will not resolve +// without operator or user intervention. The deferred dispatcher in Reconcile +// uses isPermanent to route these to handleDegradeError (immediate degrade, no requeue) +// rather than handleTransientError (failure window, requeue). +type permanentError struct{ cause error } + +func (e *permanentError) Error() string { return e.cause.Error() } +func (e *permanentError) Unwrap() error { return e.cause } + +// permanent wraps err to signal that the condition is not retriable. +func permanent(err error) error { return &permanentError{cause: err} } + +func isPermanent(err error) bool { + _, ok := err.(*permanentError) + return ok +} + +func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { klog.V(1).Infof("Syncing cloud-conf ConfigMap") + // Deferred dispatcher: classifies the returned error and calls the right handler. + // Permanent errors (wrapped with permanent()) degrade immediately without requeue. 
+ // Transient errors enter the failure window and only degrade after the threshold. + // All nil-error paths clear the failure window. + defer func() { + if retErr == nil { + r.clearFailureWindow() + return + } + if isPermanent(retErr) { + result, retErr = r.handleDegradeError(ctx, retErr) + } else { + result, retErr = r.handleTransientError(ctx, retErr) + } + }() + infra := &configv1.Infrastructure{} if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { // No cloud platform: mirror the main controller's behaviour of returning Available. @@ -55,25 +88,22 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } - // Skip if the infrastructure resource doesn't exist. - r.clearFailureWindow() return ctrl.Result{}, nil } else if err != nil { - return r.handleTransientError(ctx, err) + return ctrl.Result{}, err // transient } network := &configv1.Network{} if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil { - return r.handleTransientError(ctx, err) + return ctrl.Result{}, err // transient } syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) if err != nil { // nil platformStatus is a permanent misconfiguration. - return r.handleDegradeError(ctx, err) + return ctrl.Result{}, permanent(err) } if !syncNeeded { - r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } @@ -85,7 +115,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err != nil { // Unsupported platform won't change without a cluster reconfigure. 
klog.Errorf("unable to get cloud config transformer function; unsupported platform") - return r.handleDegradeError(ctx, err) + return ctrl.Result{}, permanent(err) } sourceCM := &corev1.ConfigMap{} @@ -109,7 +139,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) } else if errors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") } else { - return r.handleTransientError(ctx, err) + return ctrl.Result{}, err // transient } } @@ -122,26 +152,26 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to default cloud config.") } else if err != nil { - return r.handleTransientError(ctx, err) + return ctrl.Result{}, err // transient } } sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra) if err != nil { // User-supplied key mismatch: permanent until the ConfigMap or Infrastructure changes. - return r.handleDegradeError(ctx, err) + return ctrl.Result{}, permanent(err) } if r.FeatureGateAccess == nil { // Operator misconfiguration at startup: permanent. - return r.handleDegradeError(ctx, fmt.Errorf("FeatureGateAccess is not configured")) + return ctrl.Result{}, permanent(fmt.Errorf("FeatureGateAccess is not configured")) } features, err := r.FeatureGateAccess.CurrentFeatureGates() if err != nil { // The feature-gate informer may not have synced yet: transient. 
klog.Errorf("unable to get feature gates: %v", err) - return r.handleTransientError(ctx, err) + return ctrl.Result{}, err // transient } if cloudConfigTransformerFn != nil { @@ -151,7 +181,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features) if err != nil { // Platform-specific transform failed on the current config data: permanent. - return r.handleDegradeError(ctx, err) + return ctrl.Result{}, permanent(err) } sourceCM.Data[defaultConfigKey] = output } @@ -164,13 +194,12 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) // If the config does not exist, it will be created later, so we can ignore a Not Found error if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !errors.IsNotFound(err) { - return r.handleTransientError(ctx, err) + return ctrl.Result{}, err // transient } // Note that the source config map is actually a *transformed* source config map if r.isCloudConfigEqual(sourceCM, targetCM) { klog.V(1).Infof("source and target cloud-config content are equal, no sync needed") - r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } @@ -179,10 +208,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err := r.syncCloudConfigData(ctx, sourceCM, targetCM); err != nil { klog.Errorf("unable to sync cloud config") - return r.handleTransientError(ctx, err) + return ctrl.Result{}, err // transient } - r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) } @@ -190,18 +218,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, nil } -// clearFailureWindow resets the 
transient-error tracking. Call this on every -// successful reconcile so the 2-minute window restarts fresh on the next failure. -// -// Known limitation: clearFailureWindow is called *before* setAvailableCondition -// on all success paths. If setAvailableCondition subsequently fails (e.g. the -// ClusterOperator status subresource is unreachable), the failure window has -// already been cleared but a raw error is returned without calling -// handleTransientError. Controller-runtime requeues, but repeated status-write -// failures never accumulate toward the degraded threshold, so a persistent -// status-subresource outage will not cause CloudConfigControllerDegraded=True. -// This is intentional: cloud-config sync errors and status-write errors are -// distinct failure modes and are not conflated here. +// clearFailureWindow resets the transient-error tracking. Called by the deferred +// dispatcher in Reconcile on every successful (nil-error) return path. func (r *CloudConfigReconciler) clearFailureWindow() { r.consecutiveFailureSince = nil } @@ -209,6 +227,7 @@ func (r *CloudConfigReconciler) clearFailureWindow() { // handleTransientError records the start of a failure window and degrades the // controller only after transientDegradedThreshold has elapsed. It always // returns a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { now := r.Clock.Now() if r.consecutiveFailureSince == nil { @@ -231,6 +250,7 @@ func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err er // handleDegradeError sets CloudConfigControllerDegraded=True immediately and // returns nil so controller-runtime does NOT requeue. An existing watch on the // relevant resource will re-trigger reconciliation when the problem is fixed. +// Called only from the deferred dispatcher in Reconcile. 
func (r *CloudConfigReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { klog.Errorf("CloudConfigReconciler: permanent error, setting degraded: %v", err) if setErr := r.setDegradedCondition(ctx); setErr != nil { diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index 69b2097df..9d77a254a 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -73,22 +73,36 @@ type CloudOperatorReconciler struct { // +kubebuilder:rbac:groups=config.openshift.io,resources=infrastructures,verbs=get;list;watch // Reconcile will process the cloud-controller-manager clusterOperator -func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { +func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (result ctrl.Result, retErr error) { conditionOverrides := []configv1.ClusterOperatorStatusCondition{} + // Deferred dispatcher: classifies the returned error and calls the right handler. + // Permanent errors (wrapped with permanent()) degrade immediately without requeue. + // Transient errors enter the failure window and only degrade after the threshold. + // All nil-error paths clear the failure window. + defer func() { + if retErr == nil { + r.clearFailureWindow() + return + } + if isPermanent(retErr) { + result, retErr = r.handleDegradeError(ctx, conditionOverrides, retErr) + } else { + result, retErr = r.handleTransientError(ctx, conditionOverrides, retErr) + } + }() + infra := &configv1.Infrastructure{} if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { klog.Infof("Infrastructure cluster does not exist. 
Skipping...") if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) - return r.handleTransientError(ctx, conditionOverrides, err) + return ctrl.Result{}, err } - // It's ok for the infrastructure cluster to not exist - r.clearFailureWindow() - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window } else if err != nil { klog.Errorf("Unable to retrive Infrastructure object: %v", err) - return r.handleTransientError(ctx, conditionOverrides, err) + return ctrl.Result{}, err // transient } // Known limitation: when provisioningAllowed internally calls setStatusDegraded @@ -101,37 +115,28 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) allowedToProvision, err := r.provisioningAllowed(ctx, infra, conditionOverrides) if err != nil { klog.Errorf("Unable to determine cluster state to check if provision is allowed: %v", err) - return r.handleTransientError(ctx, conditionOverrides, err) + return ctrl.Result{}, err // transient; status already set inside provisioningAllowed } else if !allowedToProvision { - // We're not allowed to provision, but didn't have any failures. 
- r.clearFailureWindow() - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window } clusterProxy := &configv1.Proxy{} if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !errors.IsNotFound(err) { klog.Errorf("Unable to retrive Proxy object: %v", err) - return r.handleTransientError(ctx, conditionOverrides, err) + return ctrl.Result{}, err // transient } operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSConfig) if err != nil { klog.Errorf("Unable to build operator config %s", err) - return r.handleDegradeError(ctx, conditionOverrides, err) + return ctrl.Result{}, permanent(err) // permanent: defer calls handleDegradeError } if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { klog.Errorf("Unable to sync operands: %s", err) - return r.handleTransientError(ctx, conditionOverrides, err) + return ctrl.Result{}, err // transient } - // Known limitation: setStatusAvailable and clearCloudControllerOwnerCondition - // failures return raw errors without calling handleTransientError. These run - // after the actual reconcile work has completed successfully, so their errors - // are not counted as "cloud controller sync failures". If the status subresource - // or condition-cleanup becomes persistently broken, OperatorDegraded=True will - // not be set. Controller-runtime requeues on any non-nil error, so recovery - // still happens; it just does not trigger the degraded threshold. if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) return ctrl.Result{}, err @@ -142,9 +147,7 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) return ctrl.Result{}, err } - // successful reconcile, make sure the failure window is cleared. 
- r.clearFailureWindow() - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window } func (r *CloudOperatorReconciler) clearFailureWindow() { @@ -154,6 +157,7 @@ func (r *CloudOperatorReconciler) clearFailureWindow() { // handleTransientError records the start of a failure window and degrades the // operator only after aggregatedTransientDegradedThreshold has elapsed. It always returns // a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { now := r.Clock.Now() if r.consecutiveFailureSince == nil { @@ -176,6 +180,7 @@ func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, cond // handleDegradeError sets OperatorDegraded=True immediately and returns nil so // controller-runtime does NOT requeue. Existing watches on Infrastructure, // ConfigMaps, and Secrets will re-trigger reconciliation when the problem is fixed. +// Called only from the deferred dispatcher in Reconcile. 
func (r *CloudOperatorReconciler) handleDegradeError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { klog.Errorf("CloudOperatorReconciler: persistent error, setting degraded: %v", err) if setErr := r.setStatusDegraded(ctx, err, conditionOverrides); setErr != nil { diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index 60418b5e9..6d73b9dc2 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -53,9 +53,32 @@ func isSpecTrustedCASet(proxyConfig *configv1.ProxySpec) bool { return len(proxyConfig.TrustedCA.Name) > 0 } -func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { klog.V(1).Infof("%s emitted event, syncing %s ConfigMap", req, trustedCAConfigMapName) + // partialRun is set to true on the early-exit path where the event is for + // an unrelated ConfigMap. That path returns available=true but should NOT + // reset an ongoing transient failure window from a previous full reconcile. + partialRun := false + + // Deferred dispatcher: classifies the returned error and calls the right handler. + // Permanent errors (wrapped with permanent()) degrade immediately without requeue. + // Transient errors enter the failure window and only degrade after the threshold. + // Nil-error paths clear the failure window unless partialRun is set. 
+ defer func() { + if retErr == nil { + if !partialRun { + r.clearFailureWindow() + } + return + } + if isPermanent(retErr) { + result, retErr = r.handleDegradeError(ctx, retErr) + } else { + result, retErr = r.handleTransientError(ctx, retErr) + } + }() + proxyConfig := &configv1.Proxy{} if err := r.Get(ctx, types.NamespacedName{Name: proxyResourceName}, proxyConfig); err != nil { if apierrors.IsNotFound(err) { @@ -65,56 +88,51 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - // We tolerate the proxy config not being found. - r.clearFailureWindow() - return reconcile.Result{}, nil + return reconcile.Result{}, nil // defer clears failure window } // Non-NotFound: transient API error. - return r.handleTransientError(ctx, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err)) + return ctrl.Result{}, fmt.Errorf("failed to get proxy '%s': %v", req.Name, err) // transient } // Check if changed config map in 'openshift-config' namespace is proxy trusted ca. - // If not, return early + // If not, return early without resetting the failure window (partialRun=true). if req.Namespace == OpenshiftConfigNamespace && proxyConfig.Spec.TrustedCA.Name != req.Name { + partialRun = true + klog.V(1).Infof("changed config map %s is not a proxy trusted ca, skipping", req) if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - // Note: clearFailureWindow is intentionally NOT called here. This path did not - // exercise the full reconcile logic, so an ongoing transient failure window - // (set by a previous reconcile pass) should not be reset. 
- klog.V(1).Infof("changed config map %s is not a proxy trusted ca, skipping", req) return reconcile.Result{}, nil } systemTrustBundle, err := r.getSystemTrustBundle() if err != nil { // Node cert store may be updating during upgrade: transient. - return r.handleTransientError(ctx, fmt.Errorf("failed to get system trust bundle: %v", err)) + return ctrl.Result{}, fmt.Errorf("failed to get system trust bundle: %v", err) // transient } proxyCABundle, mergedTrustBundle, err := r.addProxyCABundle(ctx, proxyConfig, systemTrustBundle) if err != nil { - // Combined cert bundle is corrupt: persistent. - return r.handleDegradeError(ctx, fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) + // Combined cert bundle is corrupt: permanent. + return ctrl.Result{}, permanent(fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) } _, mergedTrustBundle, err = r.addCloudConfigCABundle(ctx, proxyCABundle, mergedTrustBundle) if err != nil { - // Combined cert bundle is corrupt: persistent. - return r.handleDegradeError(ctx, fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) + // Combined cert bundle is corrupt: permanent. 
+ return ctrl.Result{}, permanent(fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) } ccmTrustedConfigMap := r.makeCABundleConfigMap(mergedTrustBundle) if err := r.createOrUpdateConfigMap(ctx, ccmTrustedConfigMap); err != nil { - return r.handleTransientError(ctx, fmt.Errorf("can not update target trust bundle configmap: %v", err)) + return ctrl.Result{}, fmt.Errorf("can not update target trust bundle configmap: %v", err) // transient } - r.clearFailureWindow() if err := r.setAvailableCondition(ctx); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set conditions for trusted CA bundle controller: %v", err) } - return ctrl.Result{}, nil + return ctrl.Result{}, nil // defer clears failure window } func (r *TrustedCABundleReconciler) clearFailureWindow() { @@ -125,6 +143,7 @@ func (r *TrustedCABundleReconciler) clearFailureWindow() { // handleTransientError records the start of a failure window and degrades the // controller only after transientDegradedThreshold has elapsed. It always // returns a non-nil error so controller-runtime requeues with exponential backoff. +// Called only from the deferred dispatcher in Reconcile. func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { now := r.Clock.Now() @@ -157,6 +176,7 @@ func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, er // handleDegradeError sets TrustedCABundleControllerControllerDegraded=True immediately and // returns nil so controller-runtime does NOT requeue. An existing watch on the // relevant resource will re-trigger reconciliation when the problem is fixed. +// Called only from the deferred dispatcher in Reconcile. 
func (r *TrustedCABundleReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { klog.Errorf("TrustedCABundleReconciler: persistent error, setting degraded: %v", err) if setErr := r.setDegradedCondition(ctx); setErr != nil { From 1780461c7c2d21fa05b7e65b2b8c783f03904deb Mon Sep 17 00:00:00 2001 From: Nolan Brubaker Date: Fri, 10 Apr 2026 13:00:25 -0400 Subject: [PATCH 8/8] Address review feedback - Use controller-runtime's TerminalError instead of our own type - Use errors.Is instead of type casting - Consolidate the failure tracking fields into their own struct Signed-off-by: Nolan Brubaker Co-Authored-By: Claude Sonnet 4.6 --- .../cloud_config_sync_controller.go | 100 ++++++++++-------- pkg/controllers/clusteroperator_controller.go | 32 +++--- .../trusted_ca_bundle_controller.go | 42 +++----- .../trusted_ca_bundle_controller_test.go | 6 +- 4 files changed, 92 insertions(+), 88 deletions(-) diff --git a/pkg/controllers/cloud_config_sync_controller.go b/pkg/controllers/cloud_config_sync_controller.go index 179641fb6..18c246c4a 100644 --- a/pkg/controllers/cloud_config_sync_controller.go +++ b/pkg/controllers/cloud_config_sync_controller.go @@ -2,12 +2,14 @@ package controllers import ( "context" + "errors" "fmt" "reflect" + "sync" "time" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" @@ -15,6 +17,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" configv1 "github.com/openshift/api/config/v1" "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" @@ -40,33 +43,48 @@ const ( type CloudConfigReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - FeatureGateAccess featuregates.FeatureGateAccess - 
consecutiveFailureSince *time.Time // nil when the last reconcile succeeded + Scheme *runtime.Scheme + FeatureGateAccess featuregates.FeatureGateAccess + failures failureWindow } -// permanentError marks an error as a configuration problem that will not resolve -// without operator or user intervention. The deferred dispatcher in Reconcile -// uses isPermanent to route these to handleDegradeError (immediate degrade, no requeue) -// rather than handleTransientError (failure window, requeue). -type permanentError struct{ cause error } - -func (e *permanentError) Error() string { return e.cause.Error() } -func (e *permanentError) Unwrap() error { return e.cause } +// failureWindow tracks consecutive transient failures. All methods are safe for concurrent use. +type failureWindow struct { + mu sync.Mutex + consecutiveFailureSince *time.Time + lastTransientFailureAt *time.Time +} -// permanent wraps err to signal that the condition is not retriable. -func permanent(err error) error { return &permanentError{cause: err} } +// clear resets the failure window. Call this on every successful reconcile. +func (fw *failureWindow) clear() { + fw.mu.Lock() + defer fw.mu.Unlock() + fw.consecutiveFailureSince = nil + fw.lastTransientFailureAt = nil +} -func isPermanent(err error) bool { - _, ok := err.(*permanentError) - return ok +// observe records a transient failure at now and returns the elapsed time since +// the window started plus a boolean indicating whether the window was just opened +// or restarted. staleAfter controls stale-window detection: if the gap since the +// last observed failure exceeds staleAfter, the window restarts. Pass 0 to disable. 
+func (fw *failureWindow) observe(now time.Time, staleAfter time.Duration) (elapsed time.Duration, started bool) { + fw.mu.Lock() + defer fw.mu.Unlock() + stale := staleAfter > 0 && fw.lastTransientFailureAt != nil && now.Sub(*fw.lastTransientFailureAt) > staleAfter + if fw.consecutiveFailureSince == nil || stale { + fw.consecutiveFailureSince = &now + fw.lastTransientFailureAt = &now + return 0, true + } + fw.lastTransientFailureAt = &now + return now.Sub(*fw.consecutiveFailureSince), false } func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { klog.V(1).Infof("Syncing cloud-conf ConfigMap") // Deferred dispatcher: classifies the returned error and calls the right handler. - // Permanent errors (wrapped with permanent()) degrade immediately without requeue. + // Permanent errors (wrapped with reconcile.TerminalError()) degrade immediately without requeue. // Transient errors enter the failure window and only degrade after the threshold. // All nil-error paths clear the failure window. defer func() { @@ -74,15 +92,15 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) r.clearFailureWindow() return } - if isPermanent(retErr) { - result, retErr = r.handleDegradeError(ctx, retErr) + if errors.Is(retErr, reconcile.TerminalError(nil)) { + result, retErr = r.handleTerminalError(ctx, retErr) } else { result, retErr = r.handleTransientError(ctx, retErr) } }() infra := &configv1.Infrastructure{} - if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { + if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) { // No cloud platform: mirror the main controller's behaviour of returning Available. klog.Infof("Infrastructure cluster does not exist. 
Skipping...") if err := r.setAvailableCondition(ctx); err != nil { @@ -100,8 +118,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) if err != nil { - // nil platformStatus is a permanent misconfiguration. - return ctrl.Result{}, permanent(err) + // nil platformStatus is a terminal misconfiguration. + return ctrl.Result{}, reconcile.TerminalError(err) } if !syncNeeded { if err := r.setAvailableCondition(ctx); err != nil { @@ -115,7 +133,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err != nil { // Unsupported platform won't change without a cluster reconfigure. klog.Errorf("unable to get cloud config transformer function; unsupported platform") - return ctrl.Result{}, permanent(err) + return ctrl.Result{}, reconcile.TerminalError(err) } sourceCM := &corev1.ConfigMap{} @@ -136,7 +154,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) } if err := r.Get(ctx, defaultSourceCMObjectKey, sourceCM); err == nil { managedConfigFound = true - } else if errors.IsNotFound(err) { + } else if apierrors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") } else { return ctrl.Result{}, err // transient @@ -149,7 +167,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) Name: infra.Spec.CloudConfig.Name, Namespace: OpenshiftConfigNamespace, } - if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) { + if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); apierrors.IsNotFound(err) { klog.Warningf("managed cloud-config is not found, falling back to default cloud config.") } else if err != nil { return ctrl.Result{}, err // transient @@ -158,13 +176,13 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) sourceCM, err = 
r.prepareSourceConfigMap(sourceCM, infra)
 	if err != nil {
-		// User-supplied key mismatch: permanent until the ConfigMap or Infrastructure changes.
-		return ctrl.Result{}, permanent(err)
+		// User-supplied key mismatch: terminal until the ConfigMap or Infrastructure changes.
+		return ctrl.Result{}, reconcile.TerminalError(err)
 	}
 
 	if r.FeatureGateAccess == nil {
-		// Operator misconfiguration at startup: permanent.
-		return ctrl.Result{}, permanent(fmt.Errorf("FeatureGateAccess is not configured"))
+		// Operator misconfiguration at startup: terminal.
+		return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("FeatureGateAccess is not configured"))
 	}
 
 	features, err := r.FeatureGateAccess.CurrentFeatureGates()
@@ -180,8 +198,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		// we're not expecting users to put their data in the former.
 		output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features)
 		if err != nil {
-			// Platform-specific transform failed on the current config data: permanent.
-			return ctrl.Result{}, permanent(err)
+			// Platform-specific transform failed on the current config data: terminal.
+			return ctrl.Result{}, reconcile.TerminalError(err)
 		}
 		sourceCM.Data[defaultConfigKey] = output
 	}
@@ -193,7 +211,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 	}
 
 	// If the config does not exist, it will be created later, so we can ignore a Not Found error
-	if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !errors.IsNotFound(err) {
+	if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !apierrors.IsNotFound(err) {
 		return ctrl.Result{}, err // transient
 	}
 
@@ -221,7 +239,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 
 // clearFailureWindow resets the transient-error tracking. Called by the deferred
 // dispatcher in Reconcile on every successful (nil-error) return path.
func (r *CloudConfigReconciler) clearFailureWindow() { - r.consecutiveFailureSince = nil + r.failures.clear() } // handleTransientError records the start of a failure window and degrades the @@ -229,13 +247,11 @@ func (r *CloudConfigReconciler) clearFailureWindow() { // returns a non-nil error so controller-runtime requeues with exponential backoff. // Called only from the deferred dispatcher in Reconcile. func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { - now := r.Clock.Now() - if r.consecutiveFailureSince == nil { - r.consecutiveFailureSince = &now + elapsed, started := r.failures.observe(r.Clock.Now(), 0) + if started { klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) return ctrl.Result{}, err } - elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) if elapsed < transientDegradedThreshold { klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) return ctrl.Result{}, err @@ -247,12 +263,12 @@ func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err er return ctrl.Result{}, err } -// handleDegradeError sets CloudConfigControllerDegraded=True immediately and +// handleTerminalError sets CloudConfigControllerDegraded=True immediately and // returns nil so controller-runtime does NOT requeue. An existing watch on the // relevant resource will re-trigger reconciliation when the problem is fixed. // Called only from the deferred dispatcher in Reconcile. 
-func (r *CloudConfigReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { - klog.Errorf("CloudConfigReconciler: permanent error, setting degraded: %v", err) +func (r *CloudConfigReconciler) handleTerminalError(ctx context.Context, err error) (ctrl.Result, error) { + klog.Errorf("CloudConfigReconciler: terminal error, setting degraded: %v", err) if setErr := r.setDegradedCondition(ctx); setErr != nil { return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) } @@ -332,7 +348,7 @@ func (r *CloudConfigReconciler) syncCloudConfigData(ctx context.Context, source // check if target config exists, create if not err := r.Get(ctx, client.ObjectKeyFromObject(target), &corev1.ConfigMap{}) - if err != nil && errors.IsNotFound(err) { + if err != nil && apierrors.IsNotFound(err) { return r.Create(ctx, target) } else if err != nil { return err diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index 9d77a254a..7defae2cf 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -19,6 +19,7 @@ package controllers import ( "context" "crypto/tls" + "errors" "fmt" "time" @@ -27,13 +28,14 @@ import ( "github.com/openshift/library-go/pkg/cloudprovider" "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" "github.com/openshift/cluster-cloud-controller-manager-operator/pkg/cloud" @@ -59,12 +61,12 @@ const ( // CloudOperatorReconciler reconciles a ClusterOperator object type 
CloudOperatorReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - watcher ObjectWatcher - ImagesFile string - FeatureGateAccess featuregates.FeatureGateAccess - TLSConfig func(*tls.Config) - consecutiveFailureSince *time.Time // nil when the last reconcile succeeded + Scheme *runtime.Scheme + watcher ObjectWatcher + ImagesFile string + FeatureGateAccess featuregates.FeatureGateAccess + TLSConfig func(*tls.Config) + failures failureWindow } // +kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list;watch;create;update;patch;delete @@ -85,7 +87,7 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) r.clearFailureWindow() return } - if isPermanent(retErr) { + if errors.Is(retErr, reconcile.TerminalError(nil)) { result, retErr = r.handleDegradeError(ctx, conditionOverrides, retErr) } else { result, retErr = r.handleTransientError(ctx, conditionOverrides, retErr) @@ -93,7 +95,7 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) }() infra := &configv1.Infrastructure{} - if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { + if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) { klog.Infof("Infrastructure cluster does not exist. 
Skipping...") if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) @@ -121,7 +123,7 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) } clusterProxy := &configv1.Proxy{} - if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !errors.IsNotFound(err) { + if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !apierrors.IsNotFound(err) { klog.Errorf("Unable to retrive Proxy object: %v", err) return ctrl.Result{}, err // transient } @@ -129,7 +131,7 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSConfig) if err != nil { klog.Errorf("Unable to build operator config %s", err) - return ctrl.Result{}, permanent(err) // permanent: defer calls handleDegradeError + return ctrl.Result{}, reconcile.TerminalError(err) // permanent: defer calls handleDegradeError } if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { @@ -151,7 +153,7 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) } func (r *CloudOperatorReconciler) clearFailureWindow() { - r.consecutiveFailureSince = nil + r.failures.clear() } // handleTransientError records the start of a failure window and degrades the @@ -159,13 +161,11 @@ func (r *CloudOperatorReconciler) clearFailureWindow() { // a non-nil error so controller-runtime requeues with exponential backoff. // Called only from the deferred dispatcher in Reconcile. 
func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) { - now := r.Clock.Now() - if r.consecutiveFailureSince == nil { - r.consecutiveFailureSince = &now + elapsed, started := r.failures.observe(r.Clock.Now(), 0) + if started { klog.V(4).Infof("CloudOperatorReconciler: transient failure started (%v), will degrade after %s", err, aggregatedTransientDegradedThreshold) return ctrl.Result{}, err } - elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) if elapsed < aggregatedTransientDegradedThreshold { klog.V(4).Infof("CloudOperatorReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, aggregatedTransientDegradedThreshold, err) return ctrl.Result{}, err diff --git a/pkg/controllers/trusted_ca_bundle_controller.go b/pkg/controllers/trusted_ca_bundle_controller.go index 6d73b9dc2..97ca33baa 100644 --- a/pkg/controllers/trusted_ca_bundle_controller.go +++ b/pkg/controllers/trusted_ca_bundle_controller.go @@ -4,9 +4,9 @@ import ( "bytes" "context" "crypto/x509" + "errors" "fmt" "os" - "time" "github.com/openshift/api/annotations" configv1 "github.com/openshift/api/config/v1" @@ -42,10 +42,9 @@ const ( type TrustedCABundleReconciler struct { ClusterOperatorStatusClient - Scheme *runtime.Scheme - trustBundlePath string - consecutiveFailureSince *time.Time // nil when the last reconcile succeeded - lastTransientFailureAt *time.Time // when the most recent transient error was observed + Scheme *runtime.Scheme + trustBundlePath string + failures failureWindow } // isSpecTrustedCASet returns true if spec.trustedCA of proxyConfig is set. @@ -62,7 +61,7 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ partialRun := false // Deferred dispatcher: classifies the returned error and calls the right handler. - // Permanent errors (wrapped with permanent()) degrade immediately without requeue. 
+ // Permanent errors (wrapped with reconcile.TerminalError()) degrade immediately without requeue. // Transient errors enter the failure window and only degrade after the threshold. // Nil-error paths clear the failure window unless partialRun is set. defer func() { @@ -72,7 +71,7 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ } return } - if isPermanent(retErr) { + if errors.Is(retErr, reconcile.TerminalError(nil)) { result, retErr = r.handleDegradeError(ctx, retErr) } else { result, retErr = r.handleTransientError(ctx, retErr) } }() @@ -113,14 +112,14 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ proxyCABundle, mergedTrustBundle, err := r.addProxyCABundle(ctx, proxyConfig, systemTrustBundle) if err != nil { - // Combined cert bundle is corrupt: permanent. - return ctrl.Result{}, permanent(fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) + // Combined cert bundle is corrupt: terminal. + return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add proxy CA to merged bundle: %v", err)) } _, mergedTrustBundle, err = r.addCloudConfigCABundle(ctx, proxyCABundle, mergedTrustBundle) if err != nil { - // Combined cert bundle is corrupt: permanent. - return ctrl.Result{}, permanent(fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) + // Combined cert bundle is corrupt: terminal. 
+ return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("can not check and add cloud-config CA to merged bundle: %v", err)) } ccmTrustedConfigMap := r.makeCABundleConfigMap(mergedTrustBundle) @@ -136,8 +135,7 @@ func (r *TrustedCABundleReconciler) Reconcile(ctx context.Context, req ctrl.Requ } func (r *TrustedCABundleReconciler) clearFailureWindow() { - r.consecutiveFailureSince = nil - r.lastTransientFailureAt = nil + r.failures.clear() } // handleTransientError records the start of a failure window and degrades the @@ -145,23 +143,13 @@ func (r *TrustedCABundleReconciler) clearFailureWindow() { // returns a non-nil error so controller-runtime requeues with exponential backoff. // Called only from the deferred dispatcher in Reconcile. func (r *TrustedCABundleReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { - now := r.Clock.Now() - - // Start or restart the failure window. - // Restart if: (a) no window is open, OR (b) the last observed failure was more than - // transientDegradedThreshold ago (stale window). Case (b) handles the scenario where - // a partialRun reconcile returned nil (no requeue) after the system recovered, leaving - // consecutiveFailureSince set but no subsequent successful full reconcile to clear it. - staleWindow := r.lastTransientFailureAt != nil && now.Sub(*r.lastTransientFailureAt) > transientDegradedThreshold - if r.consecutiveFailureSince == nil || staleWindow { - r.consecutiveFailureSince = &now - r.lastTransientFailureAt = &now + // Pass transientDegradedThreshold as the stale-window threshold to detect gaps + // where no reconcile ran (e.g. a partialRun returned nil, resetting the rate limiter). 
+ elapsed, started := r.failures.observe(r.Clock.Now(), transientDegradedThreshold) + if started { klog.V(4).Infof("TrustedCABundleReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) return ctrl.Result{}, err } - - r.lastTransientFailureAt = &now - elapsed := now.Sub(*r.consecutiveFailureSince) if elapsed < transientDegradedThreshold { klog.V(4).Infof("TrustedCABundleReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) return ctrl.Result{}, err diff --git a/pkg/controllers/trusted_ca_bundle_controller_test.go b/pkg/controllers/trusted_ca_bundle_controller_test.go index 4b7040a03..e9c564a57 100644 --- a/pkg/controllers/trusted_ca_bundle_controller_test.go +++ b/pkg/controllers/trusted_ca_bundle_controller_test.go @@ -368,7 +368,7 @@ var _ = Describe("Trusted CA bundle reconciler unit tests", func() { // Step 1: First transient error; failure window opens at T0. _, err := reconciler.Reconcile(ctx, ctrl.Request{}) Expect(err).To(HaveOccurred()) - Expect(reconciler.consecutiveFailureSince).NotTo(BeNil()) + Expect(reconciler.failures.consecutiveFailureSince).NotTo(BeNil()) // Step 2: Advance clock past the threshold — simulates a gap with no reconciles // (e.g., system recovered, no events fired for a long time). @@ -390,8 +390,8 @@ var _ = Describe("Trusted CA bundle reconciler unit tests", func() { } } // consecutiveFailureSince should now be 'now', not the original T0. - Expect(reconciler.consecutiveFailureSince).NotTo(BeNil()) - Expect(fakeClock.Now().Sub(*reconciler.consecutiveFailureSince)).To(BeNumerically("<", time.Second), + Expect(reconciler.failures.consecutiveFailureSince).NotTo(BeNil()) + Expect(fakeClock.Now().Sub(*reconciler.failures.consecutiveFailureSince)).To(BeNumerically("<", time.Second), "window should have been restarted to ~now, not retained from original T0") })