@@ -2,19 +2,22 @@ package controllers
22
33import (
44 "context"
5+ "errors"
56 "fmt"
67 "reflect"
8+ "sync"
79 "time"
810
911 corev1 "k8s.io/api/core/v1"
10- "k8s.io/apimachinery/pkg/api/errors"
12+ apierrors "k8s.io/apimachinery/pkg/api/errors"
1113 "k8s.io/apimachinery/pkg/runtime"
1214 "k8s.io/klog/v2"
1315 ctrl "sigs.k8s.io/controller-runtime"
1416 "sigs.k8s.io/controller-runtime/pkg/builder"
1517 "sigs.k8s.io/controller-runtime/pkg/client"
1618 "sigs.k8s.io/controller-runtime/pkg/handler"
1719 "sigs.k8s.io/controller-runtime/pkg/predicate"
20+ "sigs.k8s.io/controller-runtime/pkg/reconcile"
1821
1922 configv1 "github.com/openshift/api/config/v1"
2023 "github.com/openshift/library-go/pkg/operator/configobserver/featuregates"
@@ -40,49 +43,64 @@ const (
4043
4144type CloudConfigReconciler struct {
4245 ClusterOperatorStatusClient
43- Scheme * runtime.Scheme
44- FeatureGateAccess featuregates.FeatureGateAccess
45- consecutiveFailureSince * time. Time // nil when the last reconcile succeeded
46+ Scheme * runtime.Scheme
47+ FeatureGateAccess featuregates.FeatureGateAccess
48+ failures failureWindow
4649}
4750
48- // permanentError marks an error as a configuration problem that will not resolve
49- // without operator or user intervention. The deferred dispatcher in Reconcile
50- // uses isPermanent to route these to handleDegradeError (immediate degrade, no requeue)
51- // rather than handleTransientError (failure window, requeue).
52- type permanentError struct { cause error }
53-
54- func (e * permanentError ) Error () string { return e .cause .Error () }
55- func (e * permanentError ) Unwrap () error { return e .cause }
51+ // failureWindow tracks consecutive transient failures. All methods are safe for concurrent use.
52+ type failureWindow struct {
53+ mu sync.Mutex
54+ consecutiveFailureSince * time.Time
55+ lastTransientFailureAt * time.Time
56+ }
5657
57- // permanent wraps err to signal that the condition is not retriable.
58- func permanent (err error ) error { return & permanentError {cause : err } }
58+ // clear resets the failure window. Call this on every successful reconcile.
59+ func (fw * failureWindow ) clear () {
60+ fw .mu .Lock ()
61+ defer fw .mu .Unlock ()
62+ fw .consecutiveFailureSince = nil
63+ fw .lastTransientFailureAt = nil
64+ }
5965
60- func isPermanent (err error ) bool {
61- _ , ok := err .(* permanentError )
62- return ok
66+ // observe records a transient failure at now and returns the elapsed time since
67+ // the window started plus a boolean indicating whether the window was just opened
68+ // or restarted. staleAfter controls stale-window detection: if the gap since the
69+ // last observed failure exceeds staleAfter, the window restarts. Pass 0 to disable.
70+ func (fw * failureWindow ) observe (now time.Time , staleAfter time.Duration ) (elapsed time.Duration , started bool ) {
71+ fw .mu .Lock ()
72+ defer fw .mu .Unlock ()
73+ stale := staleAfter > 0 && fw .lastTransientFailureAt != nil && now .Sub (* fw .lastTransientFailureAt ) > staleAfter
74+ if fw .consecutiveFailureSince == nil || stale {
75+ fw .consecutiveFailureSince = & now
76+ fw .lastTransientFailureAt = & now
77+ return 0 , true
78+ }
79+ fw .lastTransientFailureAt = & now
80+ return now .Sub (* fw .consecutiveFailureSince ), false
6381}
6482
6583func (r * CloudConfigReconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (result ctrl.Result , retErr error ) {
6684 klog .V (1 ).Infof ("Syncing cloud-conf ConfigMap" )
6785
6886 // Deferred dispatcher: classifies the returned error and calls the right handler.
69- // Permanent errors (wrapped with permanent ()) degrade immediately without requeue.
87+ // Permanent errors (wrapped with reconcile.TerminalError ()) degrade immediately without requeue.
7088 // Transient errors enter the failure window and only degrade after the threshold.
7189 // All nil-error paths clear the failure window.
7290 defer func () {
7391 if retErr == nil {
7492 r .clearFailureWindow ()
7593 return
7694 }
77- if isPermanent (retErr ) {
78- result , retErr = r .handleDegradeError (ctx , retErr )
95+ if errors . Is (retErr , reconcile . TerminalError ( nil ) ) {
96+ result , retErr = r .handleTerminalError (ctx , retErr )
7997 } else {
8098 result , retErr = r .handleTransientError (ctx , retErr )
8199 }
82100 }()
83101
84102 infra := & configv1.Infrastructure {}
85- if err := r .Get (ctx , client.ObjectKey {Name : infrastructureResourceName }, infra ); errors .IsNotFound (err ) {
103+ if err := r .Get (ctx , client.ObjectKey {Name : infrastructureResourceName }, infra ); apierrors .IsNotFound (err ) {
86104 // No cloud platform: mirror the main controller's behaviour of returning Available.
87105 klog .Infof ("Infrastructure cluster does not exist. Skipping..." )
88106 if err := r .setAvailableCondition (ctx ); err != nil {
@@ -100,8 +118,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
100118
101119 syncNeeded , err := r .isCloudConfigSyncNeeded (infra .Status .PlatformStatus , infra .Spec .CloudConfig )
102120 if err != nil {
103- // nil platformStatus is a permanent misconfiguration.
104- return ctrl.Result {}, permanent (err )
121+ // nil platformStatus is a terminal misconfiguration.
122+ return ctrl.Result {}, reconcile . TerminalError (err )
105123 }
106124 if ! syncNeeded {
107125 if err := r .setAvailableCondition (ctx ); err != nil {
@@ -115,7 +133,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
115133 if err != nil {
116134 // Unsupported platform won't change without a cluster reconfigure.
117135 klog .Errorf ("unable to get cloud config transformer function; unsupported platform" )
118- return ctrl.Result {}, permanent (err )
136+ return ctrl.Result {}, reconcile . TerminalError (err )
119137 }
120138
121139 sourceCM := & corev1.ConfigMap {}
@@ -136,7 +154,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
136154 }
137155 if err := r .Get (ctx , defaultSourceCMObjectKey , sourceCM ); err == nil {
138156 managedConfigFound = true
139- } else if errors .IsNotFound (err ) {
157+ } else if apierrors .IsNotFound (err ) {
140158 klog .Warningf ("managed cloud-config is not found, falling back to infrastructure config" )
141159 } else {
142160 return ctrl.Result {}, err // transient
@@ -149,7 +167,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
149167 Name : infra .Spec .CloudConfig .Name ,
150168 Namespace : OpenshiftConfigNamespace ,
151169 }
152- if err := r .Get (ctx , openshiftUnmanagedCMKey , sourceCM ); errors .IsNotFound (err ) {
170+ if err := r .Get (ctx , openshiftUnmanagedCMKey , sourceCM ); apierrors .IsNotFound (err ) {
153171 klog .Warningf ("managed cloud-config is not found, falling back to default cloud config." )
154172 } else if err != nil {
155173 return ctrl.Result {}, err // transient
@@ -158,13 +176,13 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
158176
159177 sourceCM , err = r .prepareSourceConfigMap (sourceCM , infra )
160178 if err != nil {
161- // User-supplied key mismatch: permanent until the ConfigMap or Infrastructure changes.
162- return ctrl.Result {}, permanent (err )
179+ // User-supplied key mismatch: terminal until the ConfigMap or Infrastructure changes.
180+ return ctrl.Result {}, reconcile . TerminalError (err )
163181 }
164182
165183 if r .FeatureGateAccess == nil {
166- // Operator misconfiguration at startup: permanent .
167- return ctrl.Result {}, permanent (fmt .Errorf ("FeatureGateAccess is not configured" ))
184+ // Operator misconfiguration at startup: ermanent .
185+ return ctrl.Result {}, reconcile . TerminalError (fmt .Errorf ("FeatureGateAccess is not configured" ))
168186 }
169187
170188 features , err := r .FeatureGateAccess .CurrentFeatureGates ()
@@ -180,8 +198,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
180198 // we're not expecting users to put their data in the former.
181199 output , err := cloudConfigTransformerFn (sourceCM .Data [defaultConfigKey ], infra , network , features )
182200 if err != nil {
183- // Platform-specific transform failed on the current config data: permanent .
184- return ctrl.Result {}, permanent (err )
201+ // Platform-specific transform failed on the current config data: terminal .
202+ return ctrl.Result {}, reconcile . TerminalError (err )
185203 }
186204 sourceCM .Data [defaultConfigKey ] = output
187205 }
@@ -193,7 +211,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
193211 }
194212
195213 // If the config does not exist, it will be created later, so we can ignore a Not Found error
196- if err := r .Get (ctx , targetConfigMapKey , targetCM ); err != nil && ! errors .IsNotFound (err ) {
214+ if err := r .Get (ctx , targetConfigMapKey , targetCM ); err != nil && ! apierrors .IsNotFound (err ) {
197215 return ctrl.Result {}, err // transient
198216 }
199217
@@ -221,21 +239,19 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
221239// clearFailureWindow resets the transient-error tracking. Called by the deferred
222240// dispatcher in Reconcile on every successful (nil-error) return path.
223241func (r * CloudConfigReconciler ) clearFailureWindow () {
224- r .consecutiveFailureSince = nil
242+ r .failures . clear ()
225243}
226244
227245// handleTransientError records the start of a failure window and degrades the
228246// controller only after transientDegradedThreshold has elapsed. It always
229247// returns a non-nil error so controller-runtime requeues with exponential backoff.
230248// Called only from the deferred dispatcher in Reconcile.
231249func (r * CloudConfigReconciler ) handleTransientError (ctx context.Context , err error ) (ctrl.Result , error ) {
232- now := r .Clock .Now ()
233- if r .consecutiveFailureSince == nil {
234- r .consecutiveFailureSince = & now
250+ elapsed , started := r .failures .observe (r .Clock .Now (), 0 )
251+ if started {
235252 klog .V (4 ).Infof ("CloudConfigReconciler: transient failure started (%v), will degrade after %s" , err , transientDegradedThreshold )
236253 return ctrl.Result {}, err
237254 }
238- elapsed := r .Clock .Now ().Sub (* r .consecutiveFailureSince )
239255 if elapsed < transientDegradedThreshold {
240256 klog .V (4 ).Infof ("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v" , elapsed , transientDegradedThreshold , err )
241257 return ctrl.Result {}, err
@@ -247,12 +263,12 @@ func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err er
247263 return ctrl.Result {}, err
248264}
249265
250- // handleDegradeError sets CloudConfigControllerDegraded=True immediately and
266+ // handleTerminalError sets CloudConfigControllerDegraded=True immediately and
251267// returns nil so controller-runtime does NOT requeue. An existing watch on the
252268// relevant resource will re-trigger reconciliation when the problem is fixed.
253269// Called only from the deferred dispatcher in Reconcile.
254- func (r * CloudConfigReconciler ) handleDegradeError (ctx context.Context , err error ) (ctrl.Result , error ) {
255- klog .Errorf ("CloudConfigReconciler: permanent error, setting degraded: %v" , err )
270+ func (r * CloudConfigReconciler ) handleTerminalError (ctx context.Context , err error ) (ctrl.Result , error ) {
271+ klog .Errorf ("CloudConfigReconciler: terminal error, setting degraded: %v" , err )
256272 if setErr := r .setDegradedCondition (ctx ); setErr != nil {
257273 return ctrl.Result {}, fmt .Errorf ("failed to set degraded condition: %v" , setErr )
258274 }
@@ -332,7 +348,7 @@ func (r *CloudConfigReconciler) syncCloudConfigData(ctx context.Context, source
332348 // check if target config exists, create if not
333349 err := r .Get (ctx , client .ObjectKeyFromObject (target ), & corev1.ConfigMap {})
334350
335- if err != nil && errors .IsNotFound (err ) {
351+ if err != nil && apierrors .IsNotFound (err ) {
336352 return r .Create (ctx , target )
337353 } else if err != nil {
338354 return err
0 commit comments