Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 119 additions & 26 deletions pkg/controllers/cloud_config_sync_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package controllers

import (
"context"
"fmt"
"time"

. "github.com/onsi/ginkgo/v2"
Expand All @@ -23,6 +24,7 @@ import (
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers"
"github.com/openshift/library-go/pkg/operator/configobserver/featuregates"
)

Expand Down Expand Up @@ -259,19 +261,7 @@ var _ = Describe("Cloud config sync controller", func() {
mgrCtxCancel()
Eventually(mgrStopped, timeout).Should(BeClosed())

co := &configv1.ClusterOperator{}
err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)
if err == nil || !apierrors.IsNotFound(err) {
Eventually(func() error {
return cl.Delete(context.Background(), co)
}).Should(SatisfyAny(
Not(HaveOccurred()),
MatchError(apierrors.IsNotFound, "IsNotFound"),
))
}
Eventually(func() error {
return cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)
}).Should(MatchError(apierrors.IsNotFound, "IsNotFound"))
deleteClusterOperator(context.Background(), cl)

By("Cleanup resources")
deleteOptions := &client.DeleteOptions{
Expand Down Expand Up @@ -457,19 +447,7 @@ var _ = Describe("Cloud config sync reconciler", func() {
GracePeriodSeconds: ptr.To[int64](0),
}

co := &configv1.ClusterOperator{}
err := cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)
if err == nil || !apierrors.IsNotFound(err) {
Eventually(func() error {
return cl.Delete(context.Background(), co)
}).Should(SatisfyAny(
Not(HaveOccurred()),
MatchError(apierrors.IsNotFound, "IsNotFound"),
))
}
Eventually(func() error {
return cl.Get(context.Background(), client.ObjectKey{Name: clusterOperatorName}, co)
}).Should(MatchError(apierrors.IsNotFound, "IsNotFound"))
deleteClusterOperator(context.Background(), cl)

infra := &configv1.Infrastructure{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -867,3 +845,118 @@ var _ = Describe("vSphere managed config sync", func() {
})
})
})

var _ = Describe("CloudConfigReconciler error handling", func() {
ctx := context.Background()

AfterEach(func() {
deleteClusterOperator(ctx, cl)
})

It("transient error should not set cloudConfigControllerDegraded before threshold", func() {
fakeClock := clocktesting.NewFakeClock(time.Now())
getErr := fmt.Errorf("connection refused")
faultyClient := errorInjectingClient{Client: cl, getErr: &getErr, failType: &configv1.Infrastructure{}}

co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: clusterOperatorName}}
Expect(cl.Create(ctx, co)).To(Succeed())

reconciler := &CloudConfigReconciler{
ClusterOperatorStatusClient: ClusterOperatorStatusClient{
Client: &faultyClient,
Clock: fakeClock,
ManagedNamespace: testManagedNamespace,
},
Scheme: scheme.Scheme,
FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting(nil, nil, nil, nil),
}

stepSize := transientDegradedThreshold / 5
for range 4 {
_, err := reconciler.Reconcile(ctx, reconcile.Request{})
Expect(err).To(HaveOccurred(), "transient error should be returned for controller-runtime to requeue")
fakeClock.Step(stepSize)
}

Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, cloudConfigControllerDegradedCondition)).To(BeFalse(),
"cloudConfigControllerDegraded should not be True before the transient threshold elapses")
})

It("transient error should set cloudConfigControllerDegraded after threshold is crossed", func() {
fakeClock := clocktesting.NewFakeClock(time.Now())
getErr := fmt.Errorf("connection refused")
faultyClient := errorInjectingClient{Client: cl, getErr: &getErr, failType: &configv1.Infrastructure{}}

co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: clusterOperatorName}}
Expect(cl.Create(ctx, co)).To(Succeed())

reconciler := &CloudConfigReconciler{
ClusterOperatorStatusClient: ClusterOperatorStatusClient{
Client: &faultyClient,
Clock: fakeClock,
ManagedNamespace: testManagedNamespace,
},
Scheme: scheme.Scheme,
FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting(nil, nil, nil, nil),
}

_, err := reconciler.Reconcile(ctx, reconcile.Request{})
Expect(err).To(HaveOccurred())

fakeClock.Step(transientDegradedThreshold + time.Second)

_, err = reconciler.Reconcile(ctx, reconcile.Request{})
Expect(err).To(HaveOccurred())

Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
var degradedCond *configv1.ClusterOperatorStatusCondition
for i := range co.Status.Conditions {
if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition {
degradedCond = &co.Status.Conditions[i]
break
}
}
Expect(degradedCond).NotTo(BeNil(), "cloudConfigControllerDegraded condition should exist after threshold is crossed")
Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue),
"cloudConfigControllerDegraded should be True after threshold is crossed")
})

It("successful reconcile resets the failure window so transient errors must start fresh", func() {
fakeClock := clocktesting.NewFakeClock(time.Now())
getErr := fmt.Errorf("connection refused")
faultyClient := errorInjectingClient{Client: cl, getErr: &getErr, failType: &configv1.Infrastructure{}}

co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: clusterOperatorName}}
Expect(cl.Create(ctx, co)).To(Succeed())

reconciler := &CloudConfigReconciler{
ClusterOperatorStatusClient: ClusterOperatorStatusClient{
Client: &faultyClient,
Clock: fakeClock,
ManagedNamespace: testManagedNamespace,
},
Scheme: scheme.Scheme,
FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting(nil, nil, nil, nil),
}

By("opening the failure window with a transient error")
_, err := reconciler.Reconcile(ctx, reconcile.Request{})
Expect(err).To(HaveOccurred())

By("clearing the fault so Reconcile succeeds via the Infrastructure-not-found path")
getErr = nil
_, err = reconciler.Reconcile(ctx, reconcile.Request{})
Expect(err).NotTo(HaveOccurred())

By("advancing past the original threshold and re-injecting the fault")
fakeClock.Step(transientDegradedThreshold + time.Second)
getErr = fmt.Errorf("connection refused")
_, err = reconciler.Reconcile(ctx, reconcile.Request{})
Expect(err).To(HaveOccurred())

Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
Expect(v1helpers.IsStatusConditionTrue(co.Status.Conditions, cloudConfigControllerDegradedCondition)).To(BeFalse(),
"cloudConfigControllerDegraded should not be True because the failure window was reset by the successful reconcile")
})
})
29 changes: 18 additions & 11 deletions pkg/controllers/clusteroperator_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,16 @@ func (r *CloudOperatorReconciler) Reconcile(ctx context.Context, _ ctrl.Request)
return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("failed to build operator config: %w", err))
}

if err := r.sync(ctx, operatorConfig, conditionOverrides); err != nil {
progressing, err := r.sync(ctx, operatorConfig, conditionOverrides)
if err != nil {
klog.Errorf("Unable to sync operands: %s", err)
return ctrl.Result{}, fmt.Errorf("failed to sync operands: %w", err)
}

if progressing {
return ctrl.Result{}, nil
}

if err := r.setStatusAvailable(ctx, conditionOverrides); err != nil {
klog.Errorf("Unable to sync cluster operator status: %s", err)
return ctrl.Result{}, fmt.Errorf("error syncing ClusterOperatorStatus: %w", err)
Expand All @@ -143,33 +148,35 @@ func (r *CloudOperatorReconciler) clearFailureWindow() {
r.failures.clear()
}

func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) error {
// Deploy resources for platform
func (r *CloudOperatorReconciler) sync(ctx context.Context, config config.OperatorConfig, conditionOverrides []configv1.ClusterOperatorStatusCondition) (bool, error) {
resources, err := cloud.GetResources(config)
if err != nil {
return err
return false, err
}
updated, err := r.applyResources(ctx, resources)
if err != nil {
return err
return false, err
}
if updated {
return r.setStatusProgressing(ctx, conditionOverrides)
if err := r.setStatusProgressing(ctx, conditionOverrides); err != nil {
return false, err
}
return true, nil
}

return nil
return false, nil
}

// applyResources will apply all resources as-is to the cluster, allowing adding of custom annotations and lables
func (r *CloudOperatorReconciler) applyResources(ctx context.Context, resources []client.Object) (bool, error) {
updated := false
var err error
anyUpdated := false

for _, resource := range resources {
updated, err = resourceapply.ApplyResource(ctx, r.Client, r.Recorder, resource)
updated, err := resourceapply.ApplyResource(ctx, r.Client, r.Recorder, resource)
if err != nil {
return false, err
}
anyUpdated = anyUpdated || updated

if err := r.watcher.Watch(ctx, resource); err != nil {
klog.Errorf("Unable to establish watch on object %s '%s': %+v", resource.GetObjectKind().GroupVersionKind(), resource.GetName(), err)
Expand All @@ -182,7 +189,7 @@ func (r *CloudOperatorReconciler) applyResources(ctx context.Context, resources
klog.V(2).Info("Resources applied successfully.")
}

return updated, nil
return anyUpdated, nil
}

// SetupWithManager sets up the controller with the Manager.
Expand Down
Loading