From 2721bba42b669a5c938973d4fc771d15db5e0db0 Mon Sep 17 00:00:00 2001 From: Theo Barber-Bany Date: Fri, 29 May 2026 17:19:13 +0100 Subject: [PATCH 1/3] Fix Progressing=True being immediately overwritten by Available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sync() returned nil on success, so Reconcile always called setStatusAvailable immediately after setStatusProgressing. Change sync to return (bool, error) so Reconcile can skip setStatusAvailable when resources were just updated. Also fix applyResources overwriting the updated flag each iteration — only the last resource's status was returned. --- pkg/controllers/clusteroperator_controller.go | 29 +++++---- .../clusteroperator_controller_test.go | 65 +++++++++++++++++++ 2 files changed, 83 insertions(+), 11 deletions(-) diff --git a/pkg/controllers/clusteroperator_controller.go b/pkg/controllers/clusteroperator_controller.go index a5fefa1f3..94f7f1950 100644 --- a/pkg/controllers/clusteroperator_controller.go +++ b/pkg/controllers/clusteroperator_controller.go @@ -121,11 +121,16 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request) return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to build operator config: %w", err)) } - if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil { + progressing, err := r.sync(ctx, operatorConfig, conditionOverrides) + if err != nil { klog.Errorf("Unable to sync operands: %s", err) return ctrl.Result{}, fmt.Errorf("failed to sync operands: %w", err) } + if progressing { + return ctrl.Result{}, nil + } + if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil { klog.Errorf("Unable to sync cluster operator status: %s", err) return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %w", err) @@ -143,33 +148,35 @@ func (r *CloudOperatorReconciler) clearFailureWindow() { r.failures.clear() } -func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) error { - // Deploy resources for platform +func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) (bool, error) { resources, err := cloud.GetResources(config) if err != nil { - return err + return false, err } updated, err := r.applyResources(ctx, resources) if err != nil { - return err + return false, err } if updated { - return r.setStatusProgressing(ctx, conditionOverrides) + if err := r.setStatusProgressing(ctx, conditionOverrides); err != nil { + return false, err + } + return true, nil } - return nil + return false, nil } // applyResources will apply all resources as-is to the cluster, allowing adding of custom annotations and lables func (r *CloudOperatorReconciler) applyResources(ctx context.Context, resources []client.Object) (bool, error) { - updated := false - var err error + anyUpdated := false for _, resource := range resources { - updated, err = resourceapply.ApplyResource(ctx, r.Client, r.Recorder, resource) + updated, err := resourceapply.ApplyResource(ctx, r.Client, r.Recorder, resource) if err != nil { return false, err } + anyUpdated = anyUpdated || updated if err := r.watcher.Watch(ctx, resource); err != nil { klog.Errorf("Unable to establish watch on object %s '%s': %+v", resource.GetObjectKind().GroupVersionKind(), resource.GetName(), err) @@ -182,7 +189,7 @@ func (r *CloudOperatorReconciler) applyResources(ctx context.Context, resources klog.V(2).Info("Resources applied successfully.") } - return updated, nil + return anyUpdated, nil } // SetupWithManager sets up the controller with the Manager. diff --git a/pkg/controllers/clusteroperator_controller_test.go b/pkg/controllers/clusteroperator_controller_test.go index ae93f1513..c47d8c458 100644 --- a/pkg/controllers/clusteroperator_controller_test.go +++ b/pkg/controllers/clusteroperator_controller_test.go @@ -605,6 +605,71 @@ var _ = Describe("Apply resources should", func() { Expect(dep.Labels[common.CloudControllerManagerProviderLabel]).To(Equal("AWS")) }) + It("reports updated=true when only a non-final resource changed", func() { + operatorConfig := getConfigForPlatform(&configv1.PlatformStatus{Type: configv1.AWSPlatformType}) + awsResources, err := cloud.GetResources(operatorConfig) + Expect(err).To(Succeed()) + Expect(len(awsResources)).To(BeNumerically(">=", 2)) + + resources = append(resources, awsResources...) + + updated, err := reconciler.applyResources(context.TODO(), resources) + Expect(err).ShouldNot(HaveOccurred()) + Expect(updated).To(BeTrue()) + + // Drain recorder events from initial creation + for len(recorder.Events) > 0 { + <-recorder.Events + } + + // Modify only the deployment so it gets updated, while later resources remain unchanged + for i, res := range resources { + if dep, ok := res.(*appsv1.Deployment); ok { + dep.Spec.Replicas = ptr.To[int32](99) + resources[i] = dep + break + } + } + + updated, err = reconciler.applyResources(context.TODO(), resources) + Expect(err).ShouldNot(HaveOccurred()) + Expect(updated).To(BeTrue(), "should report updated even when only a non-final resource changed") + }) + + It("should set Progressing=True on first sync and not signal progressing when resources are stable", func() { + reconciler.ManagedNamespace = DefaultManagedNamespace + + co := &configv1.ClusterOperator{} + co.SetName(clusterOperatorName) + Expect(cl.Create(context.TODO(), co)).To(Succeed()) + + operatorConfig := getConfigForPlatform(&configv1.PlatformStatus{Type: configv1.AWSPlatformType}) + awsResources, err := cloud.GetResources(operatorConfig) + Expect(err).To(Succeed()) + resources = append(resources, awsResources...) + + // First sync: resources do not yet exist, so applyResources reports updated=true. + progressing, err := reconciler.sync(context.TODO(), operatorConfig, nil) + Expect(err).To(Succeed()) + Expect(progressing).To(BeTrue(), "sync should report progressing when resources are newly applied") + + Expect(cl.Get(context.TODO(), client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing).Status).To( + Equal(configv1.ConditionTrue), "Progressing should be True after resources are first applied", + ) + + // Second sync: resources exist and are unchanged, so applyResources reports updated=false. + progressing, err = reconciler.sync(context.TODO(), operatorConfig, nil) + Expect(err).To(Succeed()) + Expect(progressing).To(BeFalse(), "sync should not report progressing when resources are already up to date") + + // Progressing remains True because sync() does not clear it; only setStatusAvailable() does. + Expect(cl.Get(context.TODO(), client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing).Status).To( + Equal(configv1.ConditionTrue), "Progressing should remain True until setStatusAvailable is called", + ) + }) + AfterEach(func() { co := &configv1.ClusterOperator{} err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) From dc9a2b912cb16e069ab74473605f9be4e4b63d44 Mon Sep 17 00:00:00 2001 From: Theo Barber-Bany Date: Wed, 3 Jun 2026 15:05:51 +0100 Subject: [PATCH 2/3] Progressing=true test --- .../clusteroperator_controller_test.go | 122 +++++++++++++++++- 1 file changed, 121 insertions(+), 1 deletion(-) diff --git a/pkg/controllers/clusteroperator_controller_test.go b/pkg/controllers/clusteroperator_controller_test.go index c47d8c458..bd8dbe978 100644 --- a/pkg/controllers/clusteroperator_controller_test.go +++ b/pkg/controllers/clusteroperator_controller_test.go @@ -699,7 +699,6 @@ var _ = Describe("Apply resources should", func() { }, timeout).Should(Succeed()) } }) - }) var _ = Describe("CloudOperatorReconciler error handling", func() { @@ -864,3 +863,124 @@ var _ = Describe("CloudOperatorReconciler error handling", func() { "should not be degraded because the failure window was reset by the successful reconcile") }) }) + +var _ = Describe("Reconcile progressing flow", func() { + ctx := context.Background() + var reconciler *CloudOperatorReconciler + var operandResources []client.Object + + BeforeEach(func() { + c, err := cache.New(cfg, cache.Options{}) + Expect(err).To(Succeed()) + + w, err := NewObjectWatcher(WatcherOptions{Cache: c}) + Expect(err).To(Succeed()) + + reconciler = &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: cl, + Clock: clocktesting.NewFakePassiveClock(time.Now()), + ManagedNamespace: DefaultManagedNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + watcher: w, + ImagesFile: testImagesFilePath, + } + + infra := &configv1.Infrastructure{ + ObjectMeta: metav1.ObjectMeta{Name: infrastructureResourceName}, + } + Expect(cl.Create(ctx, infra)).To(Succeed()) + + infra.Status = configv1.InfrastructureStatus{ + PlatformStatus: &configv1.PlatformStatus{Type: configv1.AWSPlatformType}, + Platform: configv1.AWSPlatformType, + InfrastructureTopology: configv1.HighlyAvailableTopologyMode, + ControlPlaneTopology: configv1.HighlyAvailableTopologyMode, + } + Expect(cl.Status().Update(ctx, infra)).To(Succeed()) + + co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: clusterOperatorName}} + Expect(cl.Create(ctx, co)).To(Succeed()) + + co.Status.Conditions = []configv1.ClusterOperatorStatusCondition{ + {Type: cloudConfigControllerAvailableCondition, Status: configv1.ConditionTrue, LastTransitionTime: metav1.Now()}, + {Type: cloudConfigControllerDegradedCondition, Status: configv1.ConditionFalse, LastTransitionTime: metav1.Now()}, + {Type: trustedCABundleControllerAvailableCondition, Status: configv1.ConditionTrue, LastTransitionTime: metav1.Now()}, + {Type: trustedCABundleControllerDegradedCondition, Status: configv1.ConditionFalse, LastTransitionTime: metav1.Now()}, + } + Expect(cl.Status().Update(ctx, co)).To(Succeed()) + }) + + AfterEach(func() { + co := &configv1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Eventually(func() error { + err := cl.Delete(ctx, co) + if err == nil || apierrors.IsNotFound(err) { + return nil + } + return err + }).Should(Succeed()) + } + + infra := &configv1.Infrastructure{ObjectMeta: metav1.ObjectMeta{Name: infrastructureResourceName}} + cl.Delete(ctx, infra) //nolint:errcheck + + for _, res := range operandResources { + cl.Delete(ctx, res) //nolint:errcheck + Eventually(func() error { + err := cl.Get(ctx, client.ObjectKeyFromObject(res), res) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return err + } + return fmt.Errorf("expected operand %s to be deleted", res.GetName()) + }, timeout).Should(Succeed()) + } + }) + + It("sets Progressing=True on first reconcile, then Available on second", func() { + // First Reconcile: resources are created, Progressing=True, Available not set. + _, err := reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).NotTo(HaveOccurred()) + + // Track created resources for cleanup + operatorConfig := config.OperatorConfig{ + ManagedNamespace: DefaultManagedNamespace, + ImagesReference: config.ImagesReference{ + CloudControllerManagerOperator: "registry.ci.openshift.org/openshift:cluster-cloud-controller-manager-operator", + CloudControllerManagerAWS: "registry.ci.openshift.org/openshift:aws-cloud-controller-manager", + }, + PlatformStatus: &configv1.PlatformStatus{Type: configv1.AWSPlatformType}, + } + operandResources, err = cloud.GetResources(operatorConfig) + Expect(err).NotTo(HaveOccurred()) + + co := &configv1.ClusterOperator{} + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing).Status).To( + Equal(configv1.ConditionTrue), "Progressing should be True after first reconcile creates resources", + ) + availCond := v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorAvailable) + if availCond != nil { + Expect(availCond.Status).NotTo(Equal(configv1.ConditionTrue), + "Available should not be True while progressing") + } + + // Second Reconcile: nothing changed, Progressing=False, Available=True. + _, err = reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).NotTo(HaveOccurred()) + + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing).Status).To( + Equal(configv1.ConditionFalse), "Progressing should be False after second reconcile with no changes", + ) + Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorAvailable).Status).To( + Equal(configv1.ConditionTrue), "Available should be True after second reconcile", + ) + }) +}) From 0ebf17c75e60b1e2f1f3070bf786434f08a4361f Mon Sep 17 00:00:00 2001 From: Theo Barber-Bany Date: Wed, 3 Jun 2026 16:57:10 +0100 Subject: [PATCH 3/3] Transient Degraded=true test --- .../cloud_config_sync_controller_test.go | 145 +++++++++++--- .../clusteroperator_controller_test.go | 177 +++++++++++++----- pkg/controllers/reconcile_dispatch_test.go | 44 +++++ .../trusted_ca_bundle_controller_test.go | 30 +-- 4 files changed, 291 insertions(+), 105 deletions(-) create mode 100644 pkg/controllers/reconcile_dispatch_test.go diff --git a/pkg/controllers/cloud_config_sync_controller_test.go b/pkg/controllers/cloud_config_sync_controller_test.go index ba217895f..a9e370ef9 100644 --- a/pkg/controllers/cloud_config_sync_controller_test.go +++ b/pkg/controllers/cloud_config_sync_controller_test.go @@ -2,6 +2,7 @@ package controllers import ( "context" + "fmt" "time" . "github.com/onsi/ginkgo/v2" @@ -23,6 +24,7 @@ import ( metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers" "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" ) @@ -259,19 +261,7 @@ var _ = Describe("Cloud config sync controller", func() { mgrCtxCancel() Eventually(mgrStopped, timeout).Should(BeClosed()) - co := &configv1.ClusterOperator{} - err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) - if err == nil || !apierrors.IsNotFound(err) { - Eventually(func() error { - return cl.Delete(context.Background(), co) - }).Should(SatisfyAny( - Not(HaveOccurred()), - MatchError(apierrors.IsNotFound, "IsNotFound"), - )) - } - Eventually(func() error { - return cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) - }).Should(MatchError(apierrors.IsNotFound, "IsNotFound")) + deleteClusterOperator(context.Background(), cl) By("Cleanup resources") deleteOptions := &client.DeleteOptions{ @@ -457,19 +447,7 @@ var _ = Describe("Cloud config sync reconciler", func() { GracePeriodSeconds: ptr.To[int64](0), } - co := &configv1.ClusterOperator{} - err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) - if err == nil || !apierrors.IsNotFound(err) { - Eventually(func() error { - return cl.Delete(context.Background(), co) - }).Should(SatisfyAny( - Not(HaveOccurred()), - MatchError(apierrors.IsNotFound, "IsNotFound"), - )) - } - Eventually(func() error { - return cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) - }).Should(MatchError(apierrors.IsNotFound, "IsNotFound")) + deleteClusterOperator(context.Background(), cl) infra := &configv1.Infrastructure{ ObjectMeta: metav1.ObjectMeta{ @@ -867,3 +845,118 @@ var _ = Describe("vSphere managed config sync", func() { }) }) }) + +var _ = Describe("CloudConfigReconciler error handling", func() { + ctx := context.Background() + + AfterEach(func() { + deleteClusterOperator(ctx, cl) + }) + + It("transient error should not set cloudConfigControllerDegraded before threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + getErr := fmt.Errorf("connection refused") + faultyClient := errorInjectingClient{Client: cl, getErr: &getErr, failType: &configv1.Infrastructure{}} + + co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: clusterOperatorName}} + Expect(cl.Create(ctx, co)).To(Succeed()) + + reconciler := &CloudConfigReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: &faultyClient, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + Scheme: scheme.Scheme, + FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting(nil, nil, nil, nil), + } + + stepSize := transientDegradedThreshold / 5 + for range 4 { + _, err := reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred(), "transient error should be returned for controller-runtime to requeue") + fakeClock.Step(stepSize) + } + + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, cloudConfigControllerDegradedCondition)).To(BeFalse(), + "cloudConfigControllerDegraded should not be True before the transient threshold elapses") + }) + + It("transient error should set cloudConfigControllerDegraded after threshold is crossed", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + getErr := fmt.Errorf("connection refused") + faultyClient := errorInjectingClient{Client: cl, getErr: &getErr, failType: &configv1.Infrastructure{}} + + co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: clusterOperatorName}} + Expect(cl.Create(ctx, co)).To(Succeed()) + + reconciler := &CloudConfigReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: &faultyClient, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + Scheme: scheme.Scheme, + FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting(nil, nil, nil, nil), + } + + _, err := reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred()) + + fakeClock.Step(transientDegradedThreshold + time.Second) + + _, err = reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred()) + + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + var degradedCond *configv1.ClusterOperatorStatusCondition + for i := range co.Status.Conditions { + if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { + degradedCond = &co.Status.Conditions[i] + break + } + } + Expect(degradedCond).NotTo(BeNil(), "cloudConfigControllerDegraded condition should exist after threshold is crossed") + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue), + "cloudConfigControllerDegraded should be True after threshold is crossed") + }) + + It("successful reconcile resets the failure window so transient errors must start fresh", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + getErr := fmt.Errorf("connection refused") + faultyClient := errorInjectingClient{Client: cl, getErr: &getErr, failType: &configv1.Infrastructure{}} + + co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: clusterOperatorName}} + Expect(cl.Create(ctx, co)).To(Succeed()) + + reconciler := &CloudConfigReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: &faultyClient, + Clock: fakeClock, + ManagedNamespace: testManagedNamespace, + }, + Scheme: scheme.Scheme, + FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting(nil, nil, nil, nil), + } + + By("opening the failure window with a transient error") + _, err := reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred()) + + By("clearing the fault so Reconcile succeeds via the Infrastructure-not-found path") + getErr = nil + _, err = reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).NotTo(HaveOccurred()) + + By("advancing past the original threshold and re-injecting the fault") + fakeClock.Step(transientDegradedThreshold + time.Second) + getErr = fmt.Errorf("connection refused") + _, err = reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred()) + + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, cloudConfigControllerDegradedCondition)).To(BeFalse(), + "cloudConfigControllerDegraded should not be True because the failure window was reset by the successful reconcile") + }) +}) diff --git a/pkg/controllers/clusteroperator_controller_test.go b/pkg/controllers/clusteroperator_controller_test.go index bd8dbe978..a062971bb 100644 --- a/pkg/controllers/clusteroperator_controller_test.go +++ b/pkg/controllers/clusteroperator_controller_test.go @@ -52,18 +52,7 @@ var _ = Describe("Cluster Operator status controller", func() { }) AfterEach(func() { - co := &configv1.ClusterOperator{} - err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) - if err == nil || !apierrors.IsNotFound(err) { - Eventually(func() error { - err := cl.Delete(context.Background(), operator) - if err == nil || apierrors.IsNotFound(err) { - return nil - } - return err - }).Should(Succeed()) - } - Expect(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)).To(MatchError(apierrors.IsNotFound, "ClusterOperator should have been deleted")) + deleteClusterOperator(context.Background(), cl) }) type testCase struct { @@ -654,7 +643,9 @@ var _ = Describe("Apply resources should", func() { Expect(progressing).To(BeTrue(), "sync should report progressing when resources are newly applied") Expect(cl.Get(context.TODO(), client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) - Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing).Status).To( + progressingCond := v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing) + Expect(progressingCond).NotTo(BeNil(), "Progressing condition should exist after first sync") + Expect(progressingCond.Status).To( Equal(configv1.ConditionTrue), "Progressing should be True after resources are first applied", ) @@ -665,24 +656,15 @@ var _ = Describe("Apply resources should", func() { // Progressing remains True because sync() does not clear it; only setStatusAvailable() does. Expect(cl.Get(context.TODO(), client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) - Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing).Status).To( + progressingCond = v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing) + Expect(progressingCond).NotTo(BeNil(), "Progressing condition should still exist after second sync") + Expect(progressingCond.Status).To( Equal(configv1.ConditionTrue), "Progressing should remain True until setStatusAvailable is called", ) }) AfterEach(func() { - co := &configv1.ClusterOperator{} - err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) - if err == nil || !apierrors.IsNotFound(err) { - Eventually(func() error { - err := cl.Delete(context.Background(), co) - if err == nil || apierrors.IsNotFound(err) { - return nil - } - return err - }, timeout).Should(Succeed()) - } - Expect(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)).To(MatchError(apierrors.IsNotFound, "ClusterOperator should have been deleted")) + deleteClusterOperator(context.Background(), cl) for _, operand := range resources { Expect(cl.Delete(context.Background(), operand)).To(Succeed()) @@ -705,17 +687,7 @@ var _ = Describe("CloudOperatorReconciler error handling", func() { ctx := context.Background() AfterEach(func() { - co := &configv1.ClusterOperator{} - if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { - Eventually(func() error { - err := cl.Delete(ctx, co) - if err == nil || apierrors.IsNotFound(err) { - return nil - } - return err - }).Should(Succeed()) - } - Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(MatchError(apierrors.IsNotFound, "ClusterOperator should have been deleted")) + deleteClusterOperator(ctx, cl) infra := &configv1.Infrastructure{ObjectMeta: metav1.ObjectMeta{Name: infrastructureResourceName}} cl.Delete(ctx, infra) //nolint:errcheck @@ -812,7 +784,113 @@ var _ = Describe("CloudOperatorReconciler error handling", func() { Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorDegraded).Status).To(Equal(configv1.ConditionTrue)) }) - It("successful reconcile clears the failure window so subsequent transient errors start fresh", func() { + It("transient error should not set OperatorDegraded before threshold", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + getErr := fmt.Errorf("connection refused") + faultyClient := errorInjectingClient{Client: cl, getErr: &getErr, failType: &configv1.Infrastructure{}} + + co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: clusterOperatorName}} + Expect(cl.Create(ctx, co)).To(Succeed()) + + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: &faultyClient, + Clock: fakeClock, + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + // Reconcile several times, advancing the clock each iteration but + // staying under the threshold. OperatorDegraded must never be set. + stepSize := aggregatedTransientDegradedThreshold / 5 + for range 4 { + _, err := reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred(), "transient error should be returned for controller-runtime to requeue") + fakeClock.Step(stepSize) + } + + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeFalse(), + "OperatorDegraded should not be True before the transient threshold elapses") + }) + + It("transient error should set OperatorDegraded after threshold is crossed", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + getErr := fmt.Errorf("connection refused") + faultyClient := errorInjectingClient{Client: cl, getErr: &getErr, failType: &configv1.Infrastructure{}} + + co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: clusterOperatorName}} + Expect(cl.Create(ctx, co)).To(Succeed()) + + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: &faultyClient, + Clock: fakeClock, + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + // First Reconcile opens the failure window. + _, err := reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred()) + + // Advance past the threshold. + fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second) + + // Next Reconcile crosses the threshold and sets OperatorDegraded. + _, err = reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred()) + + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + degradedCond := v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorDegraded) + Expect(degradedCond).NotTo(BeNil(), "OperatorDegraded condition should exist after threshold is crossed") + Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue), + "OperatorDegraded should be True after threshold is crossed") + }) + + It("successful reconcile resets the failure window so transient errors must start fresh", func() { + fakeClock := clocktesting.NewFakeClock(time.Now()) + getErr := fmt.Errorf("connection refused") + faultyClient := errorInjectingClient{Client: cl, getErr: &getErr, failType: &configv1.Infrastructure{}} + + co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: clusterOperatorName}} + Expect(cl.Create(ctx, co)).To(Succeed()) + + reconciler := &CloudOperatorReconciler{ + ClusterOperatorStatusClient: ClusterOperatorStatusClient{ + Client: &faultyClient, + Clock: fakeClock, + ManagedNamespace: defaultManagementNamespace, + Recorder: record.NewFakeRecorder(32), + }, + Scheme: scheme.Scheme, + } + + By("opening the failure window with a transient error") + _, err := reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred()) + + By("clearing the fault so Reconcile succeeds via the Infrastructure-not-found path") + getErr = nil + _, err = reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).NotTo(HaveOccurred()) + + By("advancing past the original threshold and re-injecting the fault") + fakeClock.Step(aggregatedTransientDegradedThreshold + time.Second) + getErr = fmt.Errorf("connection refused") + _, err = reconciler.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(HaveOccurred()) + + Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) + Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, configv1.OperatorDegraded)).To(BeFalse(), + "OperatorDegraded should not be True because the failure window was reset by the successful reconcile") + }) + + It("successful finalizeReconcile clears the failure window so subsequent transient errors start fresh", func() { fakeClock := clocktesting.NewFakeClock(time.Now()) reconciler := &CloudOperatorReconciler{ ClusterOperatorStatusClient: ClusterOperatorStatusClient{ @@ -914,16 +992,7 @@ var _ = Describe("Reconcile progressing flow", func() { }) AfterEach(func() { - co := &configv1.ClusterOperator{} - if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { - Eventually(func() error { - err := cl.Delete(ctx, co) - if err == nil || apierrors.IsNotFound(err) { - return nil - } - return err - }).Should(Succeed()) - } + deleteClusterOperator(ctx, cl) infra := &configv1.Infrastructure{ObjectMeta: metav1.ObjectMeta{Name: infrastructureResourceName}} cl.Delete(ctx, infra) //nolint:errcheck @@ -962,7 +1031,9 @@ var _ = Describe("Reconcile progressing flow", func() { co := &configv1.ClusterOperator{} Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) - Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing).Status).To( + progressingCond := v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing) + Expect(progressingCond).NotTo(BeNil(), "Progressing condition should exist after first reconcile") + Expect(progressingCond.Status).To( Equal(configv1.ConditionTrue), "Progressing should be True after first reconcile creates resources", ) availCond := v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorAvailable) @@ -976,10 +1047,14 @@ var _ = Describe("Reconcile progressing flow", func() { Expect(err).NotTo(HaveOccurred()) Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) - Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing).Status).To( + progressingCond = v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorProgressing) + Expect(progressingCond).NotTo(BeNil(), "Progressing condition should exist after second reconcile") + Expect(progressingCond.Status).To( Equal(configv1.ConditionFalse), "Progressing should be False after second reconcile with no changes", ) - Expect(v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorAvailable).Status).To( + availCond = v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorAvailable) + Expect(availCond).NotTo(BeNil(), "Available condition should exist after second reconcile") + Expect(availCond.Status).To( Equal(configv1.ConditionTrue), "Available should be True after second reconcile", ) }) diff --git a/pkg/controllers/reconcile_dispatch_test.go b/pkg/controllers/reconcile_dispatch_test.go new file mode 100644 index 000000000..0c257c20b --- /dev/null +++ b/pkg/controllers/reconcile_dispatch_test.go @@ -0,0 +1,44 @@ +package controllers + +import ( + "context" + "reflect" + + . "github.com/onsi/gomega" + configv1 "github.com/openshift/api/config/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// errorInjectingClient wraps a real client and injects errors on Get calls +// for a specific object type. This simulates transient API server failures +// (e.g. network blips) without depending on any internal controller code path. +// The getErr pointer allows tests to toggle faults between reconcile calls. +type errorInjectingClient struct { + client.Client + getErr *error + failType client.Object +} + +func (c *errorInjectingClient) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error { + if c.getErr != nil && *c.getErr != nil && reflect.TypeOf(obj) == reflect.TypeOf(c.failType) { + return *c.getErr + } + return c.Client.Get(ctx, key, obj, opts...) +} + +func deleteClusterOperator(ctx context.Context, cl client.Client) { + co := &configv1.ClusterOperator{} + if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { + Eventually(func() error { + err := cl.Delete(ctx, co) + if err == nil || apierrors.IsNotFound(err) { + return nil + } + return err + }).Should(Succeed()) + } + Eventually(func() error { + return cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co) + }).Should(MatchError(apierrors.IsNotFound, "IsNotFound")) +} diff --git a/pkg/controllers/trusted_ca_bundle_controller_test.go b/pkg/controllers/trusted_ca_bundle_controller_test.go index f3dd6cbed..2aefc2d27 100644 --- a/pkg/controllers/trusted_ca_bundle_controller_test.go +++ b/pkg/controllers/trusted_ca_bundle_controller_test.go @@ -162,20 +162,7 @@ var _ = Describe("Trusted CA bundle sync controller", func() { GracePeriodSeconds: ptr.To[int64](0), } - co := &v1.ClusterOperator{} - err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co) - if err == nil || !apierrors.IsNotFound(err) { - Eventually(func() error { - err := cl.Delete(context.Background(), co) - if err == nil || apierrors.IsNotFound(err) { - return nil - } - return err - }).Should(Succeed()) - } - Eventually(func() bool { - return apierrors.IsNotFound(cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)) - }).Should(BeTrue()) + deleteClusterOperator(context.Background(), cl) if proxyResource != nil { Expect(cl.Delete(ctx, proxyResource, deleteOptions)).To(Succeed()) @@ -326,20 +313,7 @@ var _ = Describe("Trusted CA bundle reconciler unit tests", func() { ctx := context.Background() AfterEach(func() { - co := &v1.ClusterOperator{} - if err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err == nil { - Expect(cl.Delete(ctx, co)).To(Succeed()) - Eventually(func() error { - err := cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co) - if apierrors.IsNotFound(err) { - return nil - } - if err != nil { - return err - } - return fmt.Errorf("expected ClusterOperator to be deleted") - }).Should(Succeed()) - } + deleteClusterOperator(ctx, cl) }) It("reconcile should succeed and be available if no proxy resource found", func() {