From 44f5686ac302aaf2fa7734ea7180f605c06a31c6 Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Sun, 11 Jan 2026 05:17:30 +0000 Subject: [PATCH 1/2] test: add e2e tests for workload resilience when catalog is deleted --- .../catalog-deletion-resilience.feature | 330 ++++++++++++++++++ test/e2e/steps/steps.go | 33 ++ 2 files changed, 363 insertions(+) create mode 100644 test/e2e/features/catalog-deletion-resilience.feature diff --git a/test/e2e/features/catalog-deletion-resilience.feature b/test/e2e/features/catalog-deletion-resilience.feature new file mode 100644 index 0000000000..959ba7a4f0 --- /dev/null +++ b/test/e2e/features/catalog-deletion-resilience.feature @@ -0,0 +1,330 @@ +Feature: Workload resilience when catalog is deleted + + As an OLM user, I want my installed extensions to continue working + even if the catalog they were installed from is deleted. + + Background: + Given OLM is available + And ClusterCatalog "test" serves bundles + And ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + + # STANDARD RUNTIME TESTS + + Scenario: Extension continues running after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And resource "deployment/test-operator" is available + And resource "configmap/test-configmap" is available + When ClusterCatalog "test" is deleted + Then resource "deployment/test-operator" is available + And resource "configmap/test-configmap" is available + And ClusterExtension reports Installed as True + + Scenario: Resources are restored after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And resource "configmap/test-configmap" exists + When ClusterCatalog "test" is deleted + And resource "configmap/test-configmap" is removed + Then resource "configmap/test-configmap" is eventually restored + + Scenario: Config changes work without catalog + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + When ClusterCatalog "test" is deleted + And ClusterExtension is updated to add preflight config + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + install: + preflight: + crdUpgradeSafety: + enforcement: None + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + 
"olm.operatorframework.io/metadata.name": test-catalog + """ + Then ClusterExtension reports Installed as True + + Scenario: Version upgrade blocked without catalog + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.0.0" + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And bundle "test-operator.1.0.0" is installed in version "1.0.0" + When ClusterCatalog "test" is deleted + And ClusterExtension is updated to change version + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.0.1" + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + Then ClusterExtension reports Progressing as True with Reason Retrying + And bundle "test-operator.1.0.0" is installed in version "1.0.0" + + # BOXCUTTER RUNTIME (EXPERIMENTAL) TESTS + + @BoxcutterRuntime + Scenario: Extension with revisions continues running after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + And resource "deployment/test-operator" is available + When ClusterCatalog "test" is deleted + Then resource "deployment/test-operator" is available + And ClusterExtension reports Installed as True + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + + @BoxcutterRuntime + Scenario: Revision resources are restored after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + And resource "configmap/test-configmap" exists + When ClusterCatalog "test" is deleted + And resource "configmap/test-configmap" is removed + Then resource "configmap/test-configmap" is eventually restored + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + + @BoxcutterRuntime + Scenario: Revision remains available when workload recovers after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: 
"1.2.0" + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + When ClusterCatalog "test" is deleted + And resource "deployment/test-operator" reports as not ready + Then ClusterExtensionRevision "${NAME}-1" reports Available as False with Reason ProbeFailure + When resource "deployment/test-operator" reports as ready + Then ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + + @BoxcutterRuntime + Scenario: Version upgrade with revisions blocked without catalog + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.0.0" + upgradeConstraintPolicy: SelfCertified + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + And bundle "test-operator.1.0.0" is installed in version "1.0.0" + When ClusterCatalog "test" is deleted + And ClusterExtension is updated to change version + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.2.0" + upgradeConstraintPolicy: SelfCertified + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + Then ClusterExtension reports Progressing as True with Reason Retrying + And bundle "test-operator.1.0.0" is installed in version "1.0.0" + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + + @BoxcutterRuntime + Scenario: Multiple revisions remain stable after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.0.0" + upgradeConstraintPolicy: SelfCertified + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + When ClusterExtension is updated to version "1.0.2" + Then ClusterExtension reports "${NAME}-1, ${NAME}-2" as active revisions + When ClusterCatalog "test" is deleted + Then ClusterExtension reports "${NAME}-1, ${NAME}-2" as active revisions + And ClusterExtensionRevision "${NAME}-2" reports Progressing as True with Reason Succeeded + And resource "deployment/test-operator" is available diff --git a/test/e2e/steps/steps.go b/test/e2e/steps/steps.go index e1a55934dd..eda348ed1d 100644 --- a/test/e2e/steps/steps.go +++ b/test/e2e/steps/steps.go @@ -85,6 +85,7 @@ func RegisterSteps(sc *godog.ScenarioContext) { sc.Step(`^(?i)ClusterCatalog "([^"]+)" serves bundles$`, CatalogServesBundles) sc.Step(`^"([^"]+)" catalog image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage) sc.Step(`^(?i)ClusterCatalog "([^"]+)" image 
version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage) + sc.Step(`^(?i)ClusterCatalog "([^"]+)" is deleted$`, CatalogIsDeleted) sc.Step(`^(?i)operator "([^"]+)" target namespace is "([^"]+)"$`, OperatorTargetNamespace) sc.Step(`^(?i)Prometheus metrics are returned in the response$`, PrometheusMetricsAreReturned) @@ -664,6 +665,38 @@ func TagCatalogImage(name, oldTag, newTag string) error { return crane.Tag(imageRef, newTag, crane.Insecure) } +func CatalogIsDeleted(ctx context.Context, catalogName string) error { + catalogFullName := fmt.Sprintf("%s-catalog", catalogName) + _, err := k8sClient("delete", "clustercatalog", catalogFullName, "--ignore-not-found=true") + if err != nil { + return fmt.Errorf("failed to delete catalog: %v", err) + } + + // Wait for catalog to be fully deleted, respecting context cancellation. + ticker := time.NewTicker(tick) + defer ticker.Stop() + + deadline := time.Now().Add(timeout) + for { + select { + case <-ctx.Done(): + // Return the context error if we timed out or were cancelled while waiting. + return ctx.Err() + case <-ticker.C: + // Check if we've exceeded the timeout + if time.Now().After(deadline) { + return fmt.Errorf("timeout waiting for catalog %s to be deleted", catalogFullName) + } + + _, err := k8sClient("get", "clustercatalog", catalogFullName) + if err != nil { + // Catalog is deleted - verification succeeded + return nil + } + } + } +} + func PrometheusMetricsAreReturned(ctx context.Context) error { sc := scenarioCtx(ctx) for podName, mr := range sc.metricsResponse { From e59f51751f67193367121c842d6be1f38605b6bb Mon Sep 17 00:00:00 2001 From: Camila Macedo <7708031+camilamacedo86@users.noreply.github.com> Date: Sun, 11 Jan 2026 05:39:18 +0000 Subject: [PATCH 2/2] (fix) catalog deletion resilience support Enables installed extensions to continue working when their source catalog becomes unavailable or is deleted. When resolution fails due to catalog unavailability, the operator now continues reconciling with the currently installed bundle instead of failing. Changes: - Resolution falls back to installed bundle when catalog unavailable - Unpacking skipped when maintaining current installed state - Helm and Boxcutter appliers handle nil contentFS gracefully - Version upgrades properly blocked without catalog access This ensures workloads remain stable and operational even when the catalog they were installed from is temporarily unavailable or deleted, while appropriately preventing version changes that require catalog access. 
--- cmd/operator-controller/main.go | 4 +- .../operator-controller/applier/boxcutter.go | 25 +- internal/operator-controller/applier/helm.go | 49 +++ .../clusterextension_controller.go | 2 + .../clusterextension_controller_test.go | 301 ++++++++++++++++++ .../clusterextension_reconcile_steps.go | 138 +++++++- .../controllers/suite_test.go | 2 +- 7 files changed, 506 insertions(+), 15 deletions(-) diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go index a1d03feee4..2c9f59992c 100644 --- a/cmd/operator-controller/main.go +++ b/cmd/operator-controller/main.go @@ -626,7 +626,7 @@ func (c *boxcutterReconcilerConfigurator) Configure(ceReconciler *controllers.Cl controllers.HandleFinalizers(c.finalizers), controllers.MigrateStorage(storageMigrator), controllers.RetrieveRevisionStates(revisionStatesGetter), - controllers.ResolveBundle(c.resolver), + controllers.ResolveBundle(c.resolver, c.mgr.GetClient()), controllers.UnpackBundle(c.imagePuller, c.imageCache), controllers.ApplyBundleWithBoxcutter(appl), } @@ -737,7 +737,7 @@ func (c *helmReconcilerConfigurator) Configure(ceReconciler *controllers.Cluster ceReconciler.ReconcileSteps = []controllers.ReconcileStepFunc{ controllers.HandleFinalizers(c.finalizers), controllers.RetrieveRevisionStates(revisionStatesGetter), - controllers.ResolveBundle(c.resolver), + controllers.ResolveBundle(c.resolver, c.mgr.GetClient()), controllers.UnpackBundle(c.imagePuller, c.imageCache), controllers.ApplyBundle(appl), } diff --git a/internal/operator-controller/applier/boxcutter.go b/internal/operator-controller/applier/boxcutter.go index c69ab5a1a0..5d5eb44847 100644 --- a/internal/operator-controller/applier/boxcutter.go +++ b/internal/operator-controller/applier/boxcutter.go @@ -303,22 +303,35 @@ func (bc *Boxcutter) createOrUpdate(ctx context.Context, obj client.Object) erro } func (bc *Boxcutter) apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error) { - // Generate desired revision - desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations) + // List all existing revisions + existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName()) if err != nil { return false, "", err } - if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil { - return false, "", fmt.Errorf("set ownerref: %w", err) + // If contentFS is nil, we're maintaining the current state without catalog access. + // In this case, we should use the existing installed revision without generating a new one. + if contentFS == nil { + if len(existingRevisions) == 0 { + return false, "", fmt.Errorf("cannot maintain workload: no catalog content available and no previously installed revision found") + } + // Returning true here signals that the rollout has succeeded using the current revision. The + // ClusterExtensionRevision controller will continue to reconcile, apply, and maintain the + // resources defined in that revision via Server-Side Apply, ensuring the workload keeps running + // even when catalog access (and thus new revision content) is unavailable. 
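+		// Note: no new revision object is generated or applied on this path; the
+		// most recent existing revision remains active and keeps being reconciled.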
+ return true, "", nil } - // List all existing revisions - existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName()) + // Generate desired revision + desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations) if err != nil { return false, "", err } + if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil { + return false, "", fmt.Errorf("set ownerref: %w", err) + } + currentRevision := &ocv1.ClusterExtensionRevision{} state := StateNeedsInstall // check if we can update the current revision. diff --git a/internal/operator-controller/applier/helm.go b/internal/operator-controller/applier/helm.go index 22ed096fe1..93c4cdf9b7 100644 --- a/internal/operator-controller/applier/helm.go +++ b/internal/operator-controller/applier/helm.go @@ -103,6 +103,16 @@ func (h *Helm) runPreAuthorizationChecks(ctx context.Context, ext *ocv1.ClusterE } func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels map[string]string, storageLabels map[string]string) (bool, string, error) { + // If contentFS is nil, we're maintaining the current state without catalog access. + // In this case, reconcile the existing Helm release if it exists. + if contentFS == nil { + ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext) + if err != nil { + return false, "", err + } + return h.reconcileExistingRelease(ctx, ac, ext) + } + chrt, err := h.buildHelmChart(contentFS, ext) if err != nil { return false, "", err @@ -197,6 +207,45 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte return true, "", nil } +// reconcileExistingRelease reconciles an existing Helm release without catalog access. +// This is used when the catalog is unavailable but we need to maintain the current installation. +// It reconciles the release and sets up watchers to ensure resources are maintained. +func (h *Helm) reconcileExistingRelease(ctx context.Context, ac helmclient.ActionInterface, ext *ocv1.ClusterExtension) (bool, string, error) { + rel, err := ac.Get(ext.GetName()) + if errors.Is(err, driver.ErrReleaseNotFound) { + return false, "", fmt.Errorf("cannot maintain workload: no catalog content available and no previously installed Helm release found") + } + if err != nil { + return false, "", fmt.Errorf("getting current release: %w", err) + } + + // Reconcile the existing release to ensure resources are maintained + if err := ac.Reconcile(rel); err != nil { + // Reconcile failed - resources NOT maintained + // Return false (rollout failed) with error + return false, "", err + } + + // At this point: Reconcile succeeded - resources ARE maintained + // The operations below are for setting up monitoring (watches). + // If they fail, the resources are still successfully reconciled and maintained, + // so we return true (rollout succeeded) even though monitoring setup failed. 
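+	// Parse the stored release manifest back into objects so the managed-object
+	// cache below can watch them.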
+ relObjects, err := util.ManifestObjects(strings.NewReader(rel.Manifest), fmt.Sprintf("%s-release-manifest", rel.Name)) + if err != nil { + return true, "", err + } + klog.FromContext(ctx).Info("watching managed objects") + cache, err := h.Manager.Get(ctx, ext) + if err != nil { + return true, "", err + } + if err := cache.Watch(ctx, h.Watcher, relObjects...); err != nil { + return true, "", err + } + + return true, "", nil +} + func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*chart.Chart, error) { if h.HelmChartProvider == nil { return nil, errors.New("HelmChartProvider is nil") diff --git a/internal/operator-controller/controllers/clusterextension_controller.go b/internal/operator-controller/controllers/clusterextension_controller.go index 7f3192b0fd..f381152e6c 100644 --- a/internal/operator-controller/controllers/clusterextension_controller.go +++ b/internal/operator-controller/controllers/clusterextension_controller.go @@ -168,6 +168,8 @@ func (r *ClusterExtensionReconciler) Reconcile(ctx context.Context, req ctrl.Req // ensureAllConditionsWithReason checks that all defined condition types exist in the given ClusterExtension, // and assigns a specified reason and custom message to any missing condition. +// +//nolint:unparam // reason parameter is designed to be flexible, even if current callers use the same value func ensureAllConditionsWithReason(ext *ocv1.ClusterExtension, reason v1alpha1.ConditionReason, message string) { for _, condType := range conditionsets.ConditionTypes { cond := apimeta.FindStatusCondition(ext.Status.Conditions, condType) diff --git a/internal/operator-controller/controllers/clusterextension_controller_test.go b/internal/operator-controller/controllers/clusterextension_controller_test.go index 1b09a43adf..01de355ac2 100644 --- a/internal/operator-controller/controllers/clusterextension_controller_test.go +++ b/internal/operator-controller/controllers/clusterextension_controller_test.go @@ -1611,3 +1611,304 @@ func TestGetInstalledBundleHistory(t *testing.T) { } } } + +// TestResolutionFallbackToInstalledBundle tests the catalog deletion resilience fallback logic +func TestResolutionFallbackToInstalledBundle(t *testing.T) { + t.Run("falls back when catalog unavailable and no version change", func(t *testing.T) { + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // Resolver fails (simulating catalog unavailable) + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + return nil, nil, nil, fmt.Errorf("catalog unavailable") + }) + // Applier succeeds (resources maintained) + d.Applier = &MockApplier{ + installCompleted: true, + installStatus: "", + err: nil, + } + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create ClusterExtension with no version specified + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version - should fall back + }, + }, + Namespace: 
"default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // Reconcile should succeed (fallback to installed, then apply succeeds) + // Catalog watch will trigger reconciliation when catalog becomes available again + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows successful reconciliation + require.NoError(t, cl.Get(ctx, extKey, ext)) + + // Progressing should be Succeeded (apply completed successfully) + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Installed should be True (maintaining current version) + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + }) + + t.Run("fails when version upgrade requested without catalog", func(t *testing.T) { + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + return nil, nil, nil, fmt.Errorf("catalog unavailable") + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create ClusterExtension requesting version upgrade + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Version: "1.0.1", // Requesting upgrade + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // Reconcile should fail (can't upgrade without catalog) + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.Error(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows Retrying + require.NoError(t, cl.Get(ctx, extKey, ext)) + cond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionTrue, cond.Status) + require.Equal(t, ocv1.ReasonRetrying, cond.Reason) + }) + + t.Run("auto-updates when catalog becomes available after fallback", func(t *testing.T) { + resolveAttempt := 0 + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // First attempt: catalog unavailable, then becomes available + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + resolveAttempt++ + if resolveAttempt == 1 { + // First reconcile: catalog unavailable + return nil, nil, nil, fmt.Errorf("catalog temporarily unavailable") + } + // Second reconcile (triggered by catalog watch): catalog available with new version + v := 
bundle.VersionRelease{Version: bsemver.MustParse("2.0.0")} + return &declcfg.Bundle{ + Name: "test.2.0.0", + Package: "test-pkg", + Image: "test-image:2.0.0", + }, &v, nil, nil + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}} + d.Applier = &MockApplier{installCompleted: true} + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version - auto-update to latest + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // First reconcile: catalog unavailable, falls back to v1.0.0 + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "1.0.0", ext.Status.Install.Bundle.Version) + + // Second reconcile: simulating catalog watch trigger, catalog now available with v2.0.0 + res, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Should have upgraded to v2.0.0 + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "2.0.0", ext.Status.Install.Bundle.Version) + + // Verify resolution was attempted twice (fallback, then success) + require.Equal(t, 2, resolveAttempt) + }) +} + +func TestCheckCatalogsExist(t *testing.T) { + t.Run("returns false when no catalogs exist", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err) + require.False(t, exists, "should return false when no catalogs exist") + }) + + t.Run("returns false when CRD doesn't exist", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + }, + }, + }, + } + + // ClusterCatalog CRD is not installed in test env + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err, "should not return error when CRD doesn't exist") + require.False(t, exists, "should treat missing CRD as no catalogs exist") + }) + + t.Run("returns false when no selector provided", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: nil, // No selector + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err) + require.False(t, 
exists, "should return false when no catalogs exist (no selector)") + }) + + t.Run("returns false when empty selector provided", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: &metav1.LabelSelector{}, // Empty selector (matches everything) + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err, "empty selector should not cause error") + require.False(t, exists, "should return false when no catalogs exist (empty selector)") + }) + + t.Run("returns error for invalid selector", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "invalid", + Operator: "InvalidOperator", // Invalid operator + Values: []string{"value"}, + }, + }, + }, + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.Error(t, err, "should return error for invalid selector") + require.Contains(t, err.Error(), "invalid catalog selector") + require.False(t, exists) + }) +} diff --git a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go index 09dd2ce7d2..fe037b60a7 100644 --- a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go +++ b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go @@ -19,10 +19,12 @@ package controllers import ( "context" "errors" + "fmt" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/log" @@ -83,7 +85,14 @@ func RetrieveRevisionStates(r RevisionStatesGetter) ReconcileStepFunc { } } -func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { +// ResolveBundle resolves the bundle to install or roll out for a ClusterExtension. +// It requires a controller-runtime client (in addition to the resolve.Resolver) to enable +// intelligent error handling when resolution fails. The client is used to check if catalogs +// matching the extension's selector still exist in the cluster, allowing the controller to +// distinguish between "catalog deleted" (fall back to installed bundle) and "transient failure" +// (retry resolution). This ensures workload resilience during catalog outages while maintaining +// responsiveness during catalog updates. 
+func ResolveBundle(r resolve.Resolver, c client.Client) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) var resolvedRevisionMetadata *RevisionMetadata @@ -95,11 +104,7 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { } resolvedBundle, resolvedBundleVersion, resolvedDeprecation, err := r.Resolve(ctx, ext, bm) if err != nil { - // Note: We don't distinguish between resolution-specific errors and generic errors - setStatusProgressing(ext, err) - setInstalledStatusFromRevisionStates(ext, state.revisionStates) - ensureAllConditionsWithReason(ext, ocv1.ReasonFailed, err.Error()) - return nil, err + return handleResolutionError(ctx, c, state, ext, err) } // set deprecation status after _successful_ resolution @@ -134,9 +139,130 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { } } +// handleResolutionError handles the case when bundle resolution fails. +// If a bundle is already installed and the spec isn't requesting a version change, +// we check if the catalogs have been deleted. If so, we fall back to using the +// installed bundle to maintain the current state (catalog deletion resilience). +// However, if catalogs still exist, we retry to allow for transient failures or catalog updates. +// If the spec explicitly requests a different version, we must fail and retry regardless. +func handleResolutionError(ctx context.Context, c client.Client, state *reconcileState, ext *ocv1.ClusterExtension, err error) (*ctrl.Result, error) { + l := log.FromContext(ctx) + + // If we have an installed bundle, check if we can fall back to it + if state.revisionStates.Installed != nil { + // Check if the spec is requesting a specific version that differs from installed + specVersion := "" + if ext.Spec.Source.Catalog != nil { + specVersion = ext.Spec.Source.Catalog.Version + } + installedVersion := state.revisionStates.Installed.Version + + // If spec requests a different version, we cannot fall back - must fail and retry + if specVersion != "" && specVersion != installedVersion { + l.Info("resolution failed and spec requests version change - cannot fall back", + "error", err, + "requestedVersion", specVersion, + "installedVersion", installedVersion) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, err.Error()) + return nil, err + } + + // No version change requested - check if catalogs exist + // Only fall back if catalogs have been deleted + catalogsExist, catalogCheckErr := CheckCatalogsExist(ctx, c, ext) + if catalogCheckErr != nil { + l.Error(catalogCheckErr, "error checking if catalogs exist, will retry resolution") + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, err.Error()) + return nil, err + } + + if catalogsExist { + // Catalogs exist but resolution failed - likely a transient issue (catalog updating, cache stale, etc.) + // Retry resolution instead of falling back + l.Info("resolution failed but catalogs exist - retrying instead of falling back", + "error", err) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, err.Error()) + return nil, err + } + + // Catalogs don't exist (deleted) - fall back to installed bundle to maintain current state. 
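+		// Downstream, UnpackBundle skips pulling content and the appliers
+		// reconcile the existing installation from a nil contentFS.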
+ // The controller watches ClusterCatalog resources, so when catalogs become available again, + // a reconcile will be triggered automatically, allowing the extension to upgrade. + l.Info("resolution failed and catalogs deleted - continuing with installed bundle", "error", err, "installedBundle", state.revisionStates.Installed.BundleMetadata) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + state.resolvedRevisionMetadata = state.revisionStates.Installed + // Return no error to allow Apply step to run and maintain resources. + return nil, nil + } + + // No installed bundle and resolution failed - cannot proceed + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, err.Error()) + return nil, err +} + +// CheckCatalogsExist checks if any ClusterCatalogs matching the extension's selector exist. +// Returns true if at least one matching catalog exists, false if none exist. +// Treats "CRD doesn't exist" errors as "no catalogs exist" (returns false, nil). +// Returns an error only if the check itself fails unexpectedly. +func CheckCatalogsExist(ctx context.Context, c client.Client, ext *ocv1.ClusterExtension) (bool, error) { + var catalogList *ocv1.ClusterCatalogList + var listErr error + + if ext.Spec.Source.Catalog == nil || ext.Spec.Source.Catalog.Selector == nil { + // No selector means all catalogs match - check if any catalogs exist at all + catalogList = &ocv1.ClusterCatalogList{} + listErr = c.List(ctx, catalogList, client.Limit(1)) + } else { + // Convert label selector to k8slabels.Selector + // Note: An empty LabelSelector matches everything by default + selector, err := metav1.LabelSelectorAsSelector(ext.Spec.Source.Catalog.Selector) + if err != nil { + return false, fmt.Errorf("invalid catalog selector: %w", err) + } + + // List catalogs matching the selector (limit to 1 since we only care if any exist) + catalogList = &ocv1.ClusterCatalogList{} + listErr = c.List(ctx, catalogList, client.MatchingLabelsSelector{Selector: selector}, client.Limit(1)) + } + + if listErr != nil { + // Check if the error is because the ClusterCatalog CRD doesn't exist + // This can happen if catalogd is not installed, which means no catalogs exist + if apimeta.IsNoMatchError(listErr) { + return false, nil + } + return false, fmt.Errorf("failed to list catalogs: %w", listErr) + } + + return len(catalogList.Items) > 0, nil +} + func UnpackBundle(i imageutil.Puller, cache imageutil.Cache) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) + + // Skip unpacking if we're using an already-installed bundle + // (e.g., when catalog is unavailable but we're maintaining current state) + if state.revisionStates != nil && + state.revisionStates.Installed != nil && + state.resolvedRevisionMetadata.Name == state.revisionStates.Installed.Name && + state.resolvedRevisionMetadata.Version == state.revisionStates.Installed.Version { + l.Info("skipping unpack - using installed bundle content") + // imageFS will remain nil; applier implementations MUST handle nil imageFS by using + // existing installed content. See Helm.reconcileExistingRelease() and Boxcutter.apply() + // for nil contentFS handling. 
+ return nil, nil + } + l.Info("unpacking resolved bundle") imageFS, _, _, err := i.Pull(ctx, ext.GetName(), state.resolvedRevisionMetadata.Image, cache) if err != nil { diff --git a/internal/operator-controller/controllers/suite_test.go b/internal/operator-controller/controllers/suite_test.go index 6258a5d307..57305a75fd 100644 --- a/internal/operator-controller/controllers/suite_test.go +++ b/internal/operator-controller/controllers/suite_test.go @@ -106,7 +106,7 @@ func newClientAndReconciler(t *testing.T, opts ...reconcilerOption) (client.Clie } reconciler.ReconcileSteps = []controllers.ReconcileStepFunc{controllers.HandleFinalizers(d.Finalizers), controllers.RetrieveRevisionStates(d.RevisionStatesGetter)} if r := d.Resolver; r != nil { - reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.ResolveBundle(r)) + reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.ResolveBundle(r, cl)) } if i := d.ImagePuller; i != nil { reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.UnpackBundle(i, d.ImageCache))