Skip to content

Commit af628de

Browse files
nrbclaude
andcommitted
Address review feedback
- Use controller-runtime's TerminalError instead of our own type - Use errors.Is instead of type casting - Consolidate the failure tracking fields into their own struct Signed-off-by: Nolan Brubaker <nolan@nbrubaker.com> Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 6dfb25e commit af628de

4 files changed

Lines changed: 92 additions & 88 deletions

File tree

pkg/controllers/cloud_config_sync_controller.go

Lines changed: 58 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,22 @@ package controllers
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"reflect"
8+
"sync"
79
"time"
810

911
corev1 "k8s.io/api/core/v1"
10-
"k8s.io/apimachinery/pkg/api/errors"
12+
apierrors "k8s.io/apimachinery/pkg/api/errors"
1113
"k8s.io/apimachinery/pkg/runtime"
1214
"k8s.io/klog/v2"
1315
ctrl "sigs.k8s.io/controller-runtime"
1416
"sigs.k8s.io/controller-runtime/pkg/builder"
1517
"sigs.k8s.io/controller-runtime/pkg/client"
1618
"sigs.k8s.io/controller-runtime/pkg/handler"
1719
"sigs.k8s.io/controller-runtime/pkg/predicate"
20+
"sigs.k8s.io/controller-runtime/pkg/reconcile"
1821

1922
configv1 "github.com/openshift/api/config/v1"
2023
"github.com/openshift/library-go/pkg/operator/configobserver/featuregates"
@@ -40,49 +43,64 @@ const (
4043

4144
type CloudConfigReconciler struct {
4245
ClusterOperatorStatusClient
43-
Scheme *runtime.Scheme
44-
FeatureGateAccess featuregates.FeatureGateAccess
45-
consecutiveFailureSince *time.Time // nil when the last reconcile succeeded
46+
Scheme *runtime.Scheme
47+
FeatureGateAccess featuregates.FeatureGateAccess
48+
failures failureWindow
4649
}
4750

48-
// permanentError marks an error as a configuration problem that will not resolve
49-
// without operator or user intervention. The deferred dispatcher in Reconcile
50-
// uses isPermanent to route these to handleDegradeError (immediate degrade, no requeue)
51-
// rather than handleTransientError (failure window, requeue).
52-
type permanentError struct{ cause error }
53-
54-
func (e *permanentError) Error() string { return e.cause.Error() }
55-
func (e *permanentError) Unwrap() error { return e.cause }
51+
// failureWindow tracks consecutive transient failures. All methods are safe for concurrent use.
52+
type failureWindow struct {
53+
mu sync.Mutex
54+
consecutiveFailureSince *time.Time
55+
lastTransientFailureAt *time.Time
56+
}
5657

57-
// permanent wraps err to signal that the condition is not retriable.
58-
func permanent(err error) error { return &permanentError{cause: err} }
58+
// clear resets the failure window. Call this on every successful reconcile.
59+
func (fw *failureWindow) clear() {
60+
fw.mu.Lock()
61+
defer fw.mu.Unlock()
62+
fw.consecutiveFailureSince = nil
63+
fw.lastTransientFailureAt = nil
64+
}
5965

60-
func isPermanent(err error) bool {
61-
_, ok := err.(*permanentError)
62-
return ok
66+
// observe records a transient failure at now and returns the elapsed time since
67+
// the window started plus a boolean indicating whether the window was just opened
68+
// or restarted. staleAfter controls stale-window detection: if the gap since the
69+
// last observed failure exceeds staleAfter, the window restarts. Pass 0 to disable.
70+
func (fw *failureWindow) observe(now time.Time, staleAfter time.Duration) (elapsed time.Duration, started bool) {
71+
fw.mu.Lock()
72+
defer fw.mu.Unlock()
73+
stale := staleAfter > 0 && fw.lastTransientFailureAt != nil && now.Sub(*fw.lastTransientFailureAt) > staleAfter
74+
if fw.consecutiveFailureSince == nil || stale {
75+
fw.consecutiveFailureSince = &now
76+
fw.lastTransientFailureAt = &now
77+
return 0, true
78+
}
79+
fw.lastTransientFailureAt = &now
80+
return now.Sub(*fw.consecutiveFailureSince), false
6381
}
6482

6583
func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) {
6684
klog.V(1).Infof("Syncing cloud-conf ConfigMap")
6785

6886
// Deferred dispatcher: classifies the returned error and calls the right handler.
69-
// Permanent errors (wrapped with permanent()) degrade immediately without requeue.
87+
// Permanent errors (wrapped with reconcile.TerminalError()) degrade immediately without requeue.
7088
// Transient errors enter the failure window and only degrade after the threshold.
7189
// All nil-error paths clear the failure window.
7290
defer func() {
7391
if retErr == nil {
7492
r.clearFailureWindow()
7593
return
7694
}
77-
if isPermanent(retErr) {
78-
result, retErr = r.handleDegradeError(ctx, retErr)
95+
if errors.Is(retErr, reconcile.TerminalError(nil)) {
96+
result, retErr = r.handleTerminalError(ctx, retErr)
7997
} else {
8098
result, retErr = r.handleTransientError(ctx, retErr)
8199
}
82100
}()
83101

84102
infra := &configv1.Infrastructure{}
85-
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) {
103+
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) {
86104
// No cloud platform: mirror the main controller's behaviour of returning Available.
87105
klog.Infof("Infrastructure cluster does not exist. Skipping...")
88106
if err := r.setAvailableCondition(ctx); err != nil {
@@ -100,8 +118,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
100118

101119
syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig)
102120
if err != nil {
103-
// nil platformStatus is a permanent misconfiguration.
104-
return ctrl.Result{}, permanent(err)
121+
// nil platformStatus is a terminal misconfiguration.
122+
return ctrl.Result{}, reconcile.TerminalError(err)
105123
}
106124
if !syncNeeded {
107125
if err := r.setAvailableCondition(ctx); err != nil {
@@ -115,7 +133,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
115133
if err != nil {
116134
// Unsupported platform won't change without a cluster reconfigure.
117135
klog.Errorf("unable to get cloud config transformer function; unsupported platform")
118-
return ctrl.Result{}, permanent(err)
136+
return ctrl.Result{}, reconcile.TerminalError(err)
119137
}
120138

121139
sourceCM := &corev1.ConfigMap{}
@@ -136,7 +154,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
136154
}
137155
if err := r.Get(ctx, defaultSourceCMObjectKey, sourceCM); err == nil {
138156
managedConfigFound = true
139-
} else if errors.IsNotFound(err) {
157+
} else if apierrors.IsNotFound(err) {
140158
klog.Warningf("managed cloud-config is not found, falling back to infrastructure config")
141159
} else {
142160
return ctrl.Result{}, err // transient
@@ -149,7 +167,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
149167
Name: infra.Spec.CloudConfig.Name,
150168
Namespace: OpenshiftConfigNamespace,
151169
}
152-
if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) {
170+
if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); apierrors.IsNotFound(err) {
153171
klog.Warningf("managed cloud-config is not found, falling back to default cloud config.")
154172
} else if err != nil {
155173
return ctrl.Result{}, err // transient
@@ -158,13 +176,13 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
158176

159177
sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra)
160178
if err != nil {
161-
// User-supplied key mismatch: permanent until the ConfigMap or Infrastructure changes.
162-
return ctrl.Result{}, permanent(err)
179+
// User-supplied key mismatch: terminal until the ConfigMap or Infrastructure changes.
180+
return ctrl.Result{}, reconcile.TerminalError(err)
163181
}
164182

165183
if r.FeatureGateAccess == nil {
166-
// Operator misconfiguration at startup: permanent.
167-
return ctrl.Result{}, permanent(fmt.Errorf("FeatureGateAccess is not configured"))
184+
// Operator misconfiguration at startup: ermanent.
185+
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("FeatureGateAccess is not configured"))
168186
}
169187

170188
features, err := r.FeatureGateAccess.CurrentFeatureGates()
@@ -180,8 +198,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
180198
// we're not expecting users to put their data in the former.
181199
output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features)
182200
if err != nil {
183-
// Platform-specific transform failed on the current config data: permanent.
184-
return ctrl.Result{}, permanent(err)
201+
// Platform-specific transform failed on the current config data: terminal.
202+
return ctrl.Result{}, reconcile.TerminalError(err)
185203
}
186204
sourceCM.Data[defaultConfigKey] = output
187205
}
@@ -193,7 +211,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
193211
}
194212

195213
// If the config does not exist, it will be created later, so we can ignore a Not Found error
196-
if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !errors.IsNotFound(err) {
214+
if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !apierrors.IsNotFound(err) {
197215
return ctrl.Result{}, err // transient
198216
}
199217

@@ -221,21 +239,19 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
221239
// clearFailureWindow resets the transient-error tracking. Called by the deferred
222240
// dispatcher in Reconcile on every successful (nil-error) return path.
223241
func (r *CloudConfigReconciler) clearFailureWindow() {
224-
r.consecutiveFailureSince = nil
242+
r.failures.clear()
225243
}
226244

227245
// handleTransientError records the start of a failure window and degrades the
228246
// controller only after transientDegradedThreshold has elapsed. It always
229247
// returns a non-nil error so controller-runtime requeues with exponential backoff.
230248
// Called only from the deferred dispatcher in Reconcile.
231249
func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) {
232-
now := r.Clock.Now()
233-
if r.consecutiveFailureSince == nil {
234-
r.consecutiveFailureSince = &now
250+
elapsed, started := r.failures.observe(r.Clock.Now(), 0)
251+
if started {
235252
klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold)
236253
return ctrl.Result{}, err
237254
}
238-
elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince)
239255
if elapsed < transientDegradedThreshold {
240256
klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err)
241257
return ctrl.Result{}, err
@@ -247,12 +263,12 @@ func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err er
247263
return ctrl.Result{}, err
248264
}
249265

250-
// handleDegradeError sets CloudConfigControllerDegraded=True immediately and
266+
// handleTerminalError sets CloudConfigControllerDegraded=True immediately and
251267
// returns nil so controller-runtime does NOT requeue. An existing watch on the
252268
// relevant resource will re-trigger reconciliation when the problem is fixed.
253269
// Called only from the deferred dispatcher in Reconcile.
254-
func (r *CloudConfigReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) {
255-
klog.Errorf("CloudConfigReconciler: permanent error, setting degraded: %v", err)
270+
func (r *CloudConfigReconciler) handleTerminalError(ctx context.Context, err error) (ctrl.Result, error) {
271+
klog.Errorf("CloudConfigReconciler: terminal error, setting degraded: %v", err)
256272
if setErr := r.setDegradedCondition(ctx); setErr != nil {
257273
return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr)
258274
}
@@ -332,7 +348,7 @@ func (r *CloudConfigReconciler) syncCloudConfigData(ctx context.Context, source
332348
// check if target config exists, create if not
333349
err := r.Get(ctx, client.ObjectKeyFromObject(target), &corev1.ConfigMap{})
334350

335-
if err != nil && errors.IsNotFound(err) {
351+
if err != nil && apierrors.IsNotFound(err) {
336352
return r.Create(ctx, target)
337353
} else if err != nil {
338354
return err

pkg/controllers/clusteroperator_controller.go

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package controllers
1818

1919
import (
2020
"context"
21+
"errors"
2122
"fmt"
2223
"time"
2324

@@ -26,13 +27,14 @@ import (
2627
"github.com/openshift/library-go/pkg/cloudprovider"
2728
"github.com/openshift/library-go/pkg/operator/configobserver/featuregates"
2829
corev1 "k8s.io/api/core/v1"
29-
"k8s.io/apimachinery/pkg/api/errors"
30+
apierrors "k8s.io/apimachinery/pkg/api/errors"
3031
"k8s.io/apimachinery/pkg/runtime"
3132
"k8s.io/klog/v2"
3233
ctrl "sigs.k8s.io/controller-runtime"
3334
"sigs.k8s.io/controller-runtime/pkg/builder"
3435
"sigs.k8s.io/controller-runtime/pkg/client"
3536
"sigs.k8s.io/controller-runtime/pkg/handler"
37+
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3638
"sigs.k8s.io/controller-runtime/pkg/source"
3739

3840
"github.com/openshift/cluster-cloud-controller-manager-operator/pkg/cloud"
@@ -58,12 +60,12 @@ const (
5860
// CloudOperatorReconciler reconciles a ClusterOperator object
5961
type CloudOperatorReconciler struct {
6062
ClusterOperatorStatusClient
61-
Scheme *runtime.Scheme
62-
watcher ObjectWatcher
63-
ImagesFile string
64-
FeatureGateAccess featuregates.FeatureGateAccess
65-
TLSProfileSpec configv1.TLSProfileSpec
66-
consecutiveFailureSince *time.Time // nil when the last reconcile succeeded
63+
Scheme *runtime.Scheme
64+
watcher ObjectWatcher
65+
ImagesFile string
66+
FeatureGateAccess featuregates.FeatureGateAccess
67+
TLSProfileSpec configv1.TLSProfileSpec
68+
failures failureWindow
6769
}
6870

6971
// +kubebuilder:rbac:groups=config.openshift.io,resources=clusteroperators,verbs=get;list;watch;create;update;patch;delete
@@ -84,15 +86,15 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request)
8486
r.clearFailureWindow()
8587
return
8688
}
87-
if isPermanent(retErr) {
89+
if errors.Is(retErr, reconcile.TerminalError(nil)) {
8890
result, retErr = r.handleDegradeError(ctx, conditionOverrides, retErr)
8991
} else {
9092
result, retErr = r.handleTransientError(ctx, conditionOverrides, retErr)
9193
}
9294
}()
9395

9496
infra := &configv1.Infrastructure{}
95-
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) {
97+
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); apierrors.IsNotFound(err) {
9698
klog.Infof("Infrastructure cluster does not exist. Skipping...")
9799
if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil {
98100
klog.Errorf("Unable to sync cluster operator status: %s", err)
@@ -120,15 +122,15 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request)
120122
}
121123

122124
clusterProxy := &configv1.Proxy{}
123-
if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !errors.IsNotFound(err) {
125+
if err := r.Get(ctx, client.ObjectKey{Name: proxyResourceName}, clusterProxy); err != nil && !apierrors.IsNotFound(err) {
124126
klog.Errorf("Unable to retrive Proxy object: %v", err)
125127
return ctrl.Result{}, err // transient
126128
}
127129

128130
operatorConfig, err := config.ComposeConfig(infra, clusterProxy, r.ImagesFile, r.ManagedNamespace, r.FeatureGateAccess, r.TLSProfileSpec)
129131
if err != nil {
130132
klog.Errorf("Unable to build operator config %s", err)
131-
return ctrl.Result{}, permanent(err) // permanent: defer calls handleDegradeError
133+
return ctrl.Result{}, reconcile.TerminalError(err) // permanent: defer calls handleDegradeError
132134
}
133135

134136
if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil {
@@ -150,21 +152,19 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request)
150152
}
151153

152154
func (r *CloudOperatorReconciler) clearFailureWindow() {
153-
r.consecutiveFailureSince = nil
155+
r.failures.clear()
154156
}
155157

156158
// handleTransientError records the start of a failure window and degrades the
157159
// operator only after aggregatedTransientDegradedThreshold has elapsed. It always returns
158160
// a non-nil error so controller-runtime requeues with exponential backoff.
159161
// Called only from the deferred dispatcher in Reconcile.
160162
func (r *CloudOperatorReconciler) handleTransientError(ctx context.Context, conditionOverrides []configv1.ClusterOperatorStatusCondition, err error) (ctrl.Result, error) {
161-
now := r.Clock.Now()
162-
if r.consecutiveFailureSince == nil {
163-
r.consecutiveFailureSince = &now
163+
elapsed, started := r.failures.observe(r.Clock.Now(), 0)
164+
if started {
164165
klog.V(4).Infof("CloudOperatorReconciler: transient failure started (%v), will degrade after %s", err, aggregatedTransientDegradedThreshold)
165166
return ctrl.Result{}, err
166167
}
167-
elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince)
168168
if elapsed < aggregatedTransientDegradedThreshold {
169169
klog.V(4).Infof("CloudOperatorReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, aggregatedTransientDegradedThreshold, err)
170170
return ctrl.Result{}, err

0 commit comments

Comments
 (0)