diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go index a1d03feee4..2c9f59992c 100644 --- a/cmd/operator-controller/main.go +++ b/cmd/operator-controller/main.go @@ -626,7 +626,7 @@ func (c *boxcutterReconcilerConfigurator) Configure(ceReconciler *controllers.Cl controllers.HandleFinalizers(c.finalizers), controllers.MigrateStorage(storageMigrator), controllers.RetrieveRevisionStates(revisionStatesGetter), - controllers.ResolveBundle(c.resolver), + controllers.ResolveBundle(c.resolver, c.mgr.GetClient()), controllers.UnpackBundle(c.imagePuller, c.imageCache), controllers.ApplyBundleWithBoxcutter(appl), } @@ -737,7 +737,7 @@ func (c *helmReconcilerConfigurator) Configure(ceReconciler *controllers.Cluster ceReconciler.ReconcileSteps = []controllers.ReconcileStepFunc{ controllers.HandleFinalizers(c.finalizers), controllers.RetrieveRevisionStates(revisionStatesGetter), - controllers.ResolveBundle(c.resolver), + controllers.ResolveBundle(c.resolver, c.mgr.GetClient()), controllers.UnpackBundle(c.imagePuller, c.imageCache), controllers.ApplyBundle(appl), } diff --git a/internal/operator-controller/applier/boxcutter.go b/internal/operator-controller/applier/boxcutter.go index c69ab5a1a0..5d5eb44847 100644 --- a/internal/operator-controller/applier/boxcutter.go +++ b/internal/operator-controller/applier/boxcutter.go @@ -303,22 +303,35 @@ func (bc *Boxcutter) createOrUpdate(ctx context.Context, obj client.Object) erro } func (bc *Boxcutter) apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels, revisionAnnotations map[string]string) (bool, string, error) { - // Generate desired revision - desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations) + // List all existing revisions + existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName()) if err != nil { return false, "", err } - if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil { - return false, "", fmt.Errorf("set ownerref: %w", err) + // If contentFS is nil, we're maintaining the current state without catalog access. + // In this case, we should use the existing installed revision without generating a new one. + if contentFS == nil { + if len(existingRevisions) == 0 { + return false, "", fmt.Errorf("cannot maintain workload: no catalog content available and no previously installed revision found") + } + // Returning true here signals that the rollout has succeeded using the current revision. The + // ClusterExtensionRevision controller will continue to reconcile, apply, and maintain the + // resources defined in that revision via Server-Side Apply, ensuring the workload keeps running + // even when catalog access (and thus new revision content) is unavailable. + return true, "", nil } - // List all existing revisions - existingRevisions, err := bc.getExistingRevisions(ctx, ext.GetName()) + // Generate desired revision + desiredRevision, err := bc.RevisionGenerator.GenerateRevision(ctx, contentFS, ext, objectLabels, revisionAnnotations) if err != nil { return false, "", err } + if err := controllerutil.SetControllerReference(ext, desiredRevision, bc.Scheme); err != nil { + return false, "", fmt.Errorf("set ownerref: %w", err) + } + currentRevision := &ocv1.ClusterExtensionRevision{} state := StateNeedsInstall // check if we can update the current revision. 
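Reviewer note, not part of the patch: the new nil-contentFS branch gives the appliers a shared "maintain-only" contract: no catalog content means the applier may only keep an already-installed revision running, never perform a first install. A minimal Go sketch of that contract, using a hypothetical maintainOnly helper rather than anything this PR adds:

package main

import (
	"errors"
	"fmt"
)

// maintainOnly distills the nil-contentFS branch added to Boxcutter.apply:
// without catalog content the applier can only keep an already-installed
// revision running; it can never perform a first install.
func maintainOnly(existingRevisions int) (rolledOut bool, err error) {
	if existingRevisions == 0 {
		return false, errors.New("cannot maintain workload: no catalog content available and no previously installed revision found")
	}
	// The current revision keeps being applied via Server-Side Apply by the
	// ClusterExtensionRevision controller, so reporting success is safe here.
	return true, nil
}

func main() {
	fmt.Println(maintainOnly(0)) // false cannot maintain workload: ...
	fmt.Println(maintainOnly(1)) // true <nil>
}

The Helm applier below implements the same contract via reconcileExistingRelease.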
diff --git a/internal/operator-controller/applier/helm.go b/internal/operator-controller/applier/helm.go index 22ed096fe1..93c4cdf9b7 100644 --- a/internal/operator-controller/applier/helm.go +++ b/internal/operator-controller/applier/helm.go @@ -103,6 +103,16 @@ func (h *Helm) runPreAuthorizationChecks(ctx context.Context, ext *ocv1.ClusterE } func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExtension, objectLabels map[string]string, storageLabels map[string]string) (bool, string, error) { + // If contentFS is nil, we're maintaining the current state without catalog access. + // In this case, reconcile the existing Helm release if it exists. + if contentFS == nil { + ac, err := h.ActionClientGetter.ActionClientFor(ctx, ext) + if err != nil { + return false, "", err + } + return h.reconcileExistingRelease(ctx, ac, ext) + } + chrt, err := h.buildHelmChart(contentFS, ext) if err != nil { return false, "", err @@ -197,6 +207,45 @@ func (h *Helm) Apply(ctx context.Context, contentFS fs.FS, ext *ocv1.ClusterExte return true, "", nil } +// reconcileExistingRelease reconciles an existing Helm release without catalog access. +// This is used when the catalog is unavailable but we need to maintain the current installation. +// It reconciles the release and sets up watchers to ensure resources are maintained. +func (h *Helm) reconcileExistingRelease(ctx context.Context, ac helmclient.ActionInterface, ext *ocv1.ClusterExtension) (bool, string, error) { + rel, err := ac.Get(ext.GetName()) + if errors.Is(err, driver.ErrReleaseNotFound) { + return false, "", fmt.Errorf("cannot maintain workload: no catalog content available and no previously installed Helm release found") + } + if err != nil { + return false, "", fmt.Errorf("getting current release: %w", err) + } + + // Reconcile the existing release to ensure resources are maintained + if err := ac.Reconcile(rel); err != nil { + // Reconcile failed - resources NOT maintained + // Return false (rollout failed) with error + return false, "", err + } + + // At this point: Reconcile succeeded - resources ARE maintained + // The operations below are for setting up monitoring (watches). + // If they fail, the resources are still successfully reconciled and maintained, + // so we return true (rollout succeeded) even though monitoring setup failed. 
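+	// Recover the release's object list from the stored manifest; these are the resources to watch.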
+ relObjects, err := util.ManifestObjects(strings.NewReader(rel.Manifest), fmt.Sprintf("%s-release-manifest", rel.Name)) + if err != nil { + return true, "", err + } + klog.FromContext(ctx).Info("watching managed objects") + cache, err := h.Manager.Get(ctx, ext) + if err != nil { + return true, "", err + } + if err := cache.Watch(ctx, h.Watcher, relObjects...); err != nil { + return true, "", err + } + + return true, "", nil +} + func (h *Helm) buildHelmChart(bundleFS fs.FS, ext *ocv1.ClusterExtension) (*chart.Chart, error) { if h.HelmChartProvider == nil { return nil, errors.New("HelmChartProvider is nil") diff --git a/internal/operator-controller/controllers/clusterextension_controller.go b/internal/operator-controller/controllers/clusterextension_controller.go index 7f3192b0fd..f381152e6c 100644 --- a/internal/operator-controller/controllers/clusterextension_controller.go +++ b/internal/operator-controller/controllers/clusterextension_controller.go @@ -168,6 +168,8 @@ func (r *ClusterExtensionReconciler) Reconcile(ctx context.Context, req ctrl.Req // ensureAllConditionsWithReason checks that all defined condition types exist in the given ClusterExtension, // and assigns a specified reason and custom message to any missing condition. +// +//nolint:unparam // reason parameter is designed to be flexible, even if current callers use the same value func ensureAllConditionsWithReason(ext *ocv1.ClusterExtension, reason v1alpha1.ConditionReason, message string) { for _, condType := range conditionsets.ConditionTypes { cond := apimeta.FindStatusCondition(ext.Status.Conditions, condType) diff --git a/internal/operator-controller/controllers/clusterextension_controller_test.go b/internal/operator-controller/controllers/clusterextension_controller_test.go index 1b09a43adf..01de355ac2 100644 --- a/internal/operator-controller/controllers/clusterextension_controller_test.go +++ b/internal/operator-controller/controllers/clusterextension_controller_test.go @@ -1611,3 +1611,304 @@ func TestGetInstalledBundleHistory(t *testing.T) { } } } + +// TestResolutionFallbackToInstalledBundle tests the catalog deletion resilience fallback logic +func TestResolutionFallbackToInstalledBundle(t *testing.T) { + t.Run("falls back when catalog unavailable and no version change", func(t *testing.T) { + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // Resolver fails (simulating catalog unavailable) + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + return nil, nil, nil, fmt.Errorf("catalog unavailable") + }) + // Applier succeeds (resources maintained) + d.Applier = &MockApplier{ + installCompleted: true, + installStatus: "", + err: nil, + } + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create ClusterExtension with no version specified + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version - should fall back + }, + }, + Namespace: 
"default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // Reconcile should succeed (fallback to installed, then apply succeeds) + // Catalog watch will trigger reconciliation when catalog becomes available again + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows successful reconciliation + require.NoError(t, cl.Get(ctx, extKey, ext)) + + // Progressing should be Succeeded (apply completed successfully) + progCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, progCond) + require.Equal(t, metav1.ConditionTrue, progCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, progCond.Reason) + + // Installed should be True (maintaining current version) + instCond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeInstalled) + require.NotNil(t, instCond) + require.Equal(t, metav1.ConditionTrue, instCond.Status) + require.Equal(t, ocv1.ReasonSucceeded, instCond.Reason) + }) + + t.Run("fails when version upgrade requested without catalog", func(t *testing.T) { + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + return nil, nil, nil, fmt.Errorf("catalog unavailable") + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + }, + }, + } + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + // Create ClusterExtension requesting version upgrade + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Version: "1.0.1", // Requesting upgrade + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // Reconcile should fail (can't upgrade without catalog) + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.Error(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Verify status shows Retrying + require.NoError(t, cl.Get(ctx, extKey, ext)) + cond := apimeta.FindStatusCondition(ext.Status.Conditions, ocv1.TypeProgressing) + require.NotNil(t, cond) + require.Equal(t, metav1.ConditionTrue, cond.Status) + require.Equal(t, ocv1.ReasonRetrying, cond.Reason) + }) + + t.Run("auto-updates when catalog becomes available after fallback", func(t *testing.T) { + resolveAttempt := 0 + cl, reconciler := newClientAndReconciler(t, func(d *deps) { + // First attempt: catalog unavailable, then becomes available + d.Resolver = resolve.Func(func(_ context.Context, _ *ocv1.ClusterExtension, _ *ocv1.BundleMetadata) (*declcfg.Bundle, *bundle.VersionRelease, *declcfg.Deprecation, error) { + resolveAttempt++ + if resolveAttempt == 1 { + // First reconcile: catalog unavailable + return nil, nil, nil, fmt.Errorf("catalog temporarily unavailable") + } + // Second reconcile (triggered by catalog watch): catalog available with new version + v := 
bundle.VersionRelease{Version: bsemver.MustParse("2.0.0")} + return &declcfg.Bundle{ + Name: "test.2.0.0", + Package: "test-pkg", + Image: "test-image:2.0.0", + }, &v, nil, nil + }) + d.RevisionStatesGetter = &MockRevisionStatesGetter{ + RevisionStates: &controllers.RevisionStates{ + Installed: &controllers.RevisionMetadata{ + Package: "test-pkg", + BundleMetadata: ocv1.BundleMetadata{Name: "test.1.0.0", Version: "1.0.0"}, + Image: "test-image:1.0.0", + }, + }, + } + d.ImagePuller = &imageutil.MockPuller{ImageFS: fstest.MapFS{}} + d.Applier = &MockApplier{installCompleted: true} + }) + + ctx := context.Background() + extKey := types.NamespacedName{Name: fmt.Sprintf("test-%s", rand.String(8))} + + ext := &ocv1.ClusterExtension{ + ObjectMeta: metav1.ObjectMeta{Name: extKey.Name}, + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + // No version - auto-update to latest + }, + }, + Namespace: "default", + ServiceAccount: ocv1.ServiceAccountReference{Name: "default"}, + }, + } + require.NoError(t, cl.Create(ctx, ext)) + + // First reconcile: catalog unavailable, falls back to v1.0.0 + res, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "1.0.0", ext.Status.Install.Bundle.Version) + + // Second reconcile: simulating catalog watch trigger, catalog now available with v2.0.0 + res, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: extKey}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, res) + + // Should have upgraded to v2.0.0 + require.NoError(t, cl.Get(ctx, extKey, ext)) + require.Equal(t, "2.0.0", ext.Status.Install.Bundle.Version) + + // Verify resolution was attempted twice (fallback, then success) + require.Equal(t, 2, resolveAttempt) + }) +} + +func TestCheckCatalogsExist(t *testing.T) { + t.Run("returns false when no catalogs exist", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err) + require.False(t, exists, "should return false when no catalogs exist") + }) + + t.Run("returns false when CRD doesn't exist", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + }, + }, + }, + } + + // ClusterCatalog CRD is not installed in test env + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err, "should not return error when CRD doesn't exist") + require.False(t, exists, "should treat missing CRD as no catalogs exist") + }) + + t.Run("returns false when no selector provided", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: nil, // No selector + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err) + require.False(t, 
exists, "should return false when no catalogs exist (no selector)") + }) + + t.Run("returns false when empty selector provided", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: &metav1.LabelSelector{}, // Empty selector (matches everything) + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.NoError(t, err, "empty selector should not cause error") + require.False(t, exists, "should return false when no catalogs exist (empty selector)") + }) + + t.Run("returns error for invalid selector", func(t *testing.T) { + cl := newClient(t) + ctx := context.Background() + + ext := &ocv1.ClusterExtension{ + Spec: ocv1.ClusterExtensionSpec{ + Source: ocv1.SourceConfig{ + SourceType: "Catalog", + Catalog: &ocv1.CatalogFilter{ + PackageName: "test-pkg", + Selector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "invalid", + Operator: "InvalidOperator", // Invalid operator + Values: []string{"value"}, + }, + }, + }, + }, + }, + }, + } + + exists, err := controllers.CheckCatalogsExist(ctx, cl, ext) + require.Error(t, err, "should return error for invalid selector") + require.Contains(t, err.Error(), "invalid catalog selector") + require.False(t, exists) + }) +} diff --git a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go index 09dd2ce7d2..fe037b60a7 100644 --- a/internal/operator-controller/controllers/clusterextension_reconcile_steps.go +++ b/internal/operator-controller/controllers/clusterextension_reconcile_steps.go @@ -19,10 +19,12 @@ package controllers import ( "context" "errors" + "fmt" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/finalizer" "sigs.k8s.io/controller-runtime/pkg/log" @@ -83,7 +85,14 @@ func RetrieveRevisionStates(r RevisionStatesGetter) ReconcileStepFunc { } } -func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { +// ResolveBundle resolves the bundle to install or roll out for a ClusterExtension. +// It requires a controller-runtime client (in addition to the resolve.Resolver) to enable +// intelligent error handling when resolution fails. The client is used to check if catalogs +// matching the extension's selector still exist in the cluster, allowing the controller to +// distinguish between "catalog deleted" (fall back to installed bundle) and "transient failure" +// (retry resolution). This ensures workload resilience during catalog outages while maintaining +// responsiveness during catalog updates. 
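+//
+// In short: no installed bundle, an explicit version change, catalogs that still
+// exist, or a failed catalog-existence check all surface the resolution error and
+// retry; only confirmed catalog deletion falls back to the installed bundle (see
+// handleResolutionError).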
+func ResolveBundle(r resolve.Resolver, c client.Client) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) var resolvedRevisionMetadata *RevisionMetadata @@ -95,11 +104,7 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { } resolvedBundle, resolvedBundleVersion, resolvedDeprecation, err := r.Resolve(ctx, ext, bm) if err != nil { - // Note: We don't distinguish between resolution-specific errors and generic errors - setStatusProgressing(ext, err) - setInstalledStatusFromRevisionStates(ext, state.revisionStates) - ensureAllConditionsWithReason(ext, ocv1.ReasonFailed, err.Error()) - return nil, err + return handleResolutionError(ctx, c, state, ext, err) } // set deprecation status after _successful_ resolution @@ -134,9 +139,130 @@ func ResolveBundle(r resolve.Resolver) ReconcileStepFunc { } } +// handleResolutionError handles the case when bundle resolution fails. +// If a bundle is already installed and the spec isn't requesting a version change, +// we check if the catalogs have been deleted. If so, we fall back to using the +// installed bundle to maintain the current state (catalog deletion resilience). +// However, if catalogs still exist, we retry to allow for transient failures or catalog updates. +// If the spec explicitly requests a different version, we must fail and retry regardless. +func handleResolutionError(ctx context.Context, c client.Client, state *reconcileState, ext *ocv1.ClusterExtension, err error) (*ctrl.Result, error) { + l := log.FromContext(ctx) + + // If we have an installed bundle, check if we can fall back to it + if state.revisionStates.Installed != nil { + // Check if the spec is requesting a specific version that differs from installed + specVersion := "" + if ext.Spec.Source.Catalog != nil { + specVersion = ext.Spec.Source.Catalog.Version + } + installedVersion := state.revisionStates.Installed.Version + + // If spec requests a different version, we cannot fall back - must fail and retry + if specVersion != "" && specVersion != installedVersion { + l.Info("resolution failed and spec requests version change - cannot fall back", + "error", err, + "requestedVersion", specVersion, + "installedVersion", installedVersion) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, err.Error()) + return nil, err + } + + // No version change requested - check if catalogs exist + // Only fall back if catalogs have been deleted + catalogsExist, catalogCheckErr := CheckCatalogsExist(ctx, c, ext) + if catalogCheckErr != nil { + l.Error(catalogCheckErr, "error checking if catalogs exist, will retry resolution") + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, err.Error()) + return nil, err + } + + if catalogsExist { + // Catalogs exist but resolution failed - likely a transient issue (catalog updating, cache stale, etc.) + // Retry resolution instead of falling back + l.Info("resolution failed but catalogs exist - retrying instead of falling back", + "error", err) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, err.Error()) + return nil, err + } + + // Catalogs don't exist (deleted) - fall back to installed bundle to maintain current state. 
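+	// Deleting a catalog must never take down workloads that were installed from it.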
+ // The controller watches ClusterCatalog resources, so when catalogs become available again, + // a reconcile will be triggered automatically, allowing the extension to upgrade. + l.Info("resolution failed and catalogs deleted - continuing with installed bundle", "error", err, "installedBundle", state.revisionStates.Installed.BundleMetadata) + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + state.resolvedRevisionMetadata = state.revisionStates.Installed + // Return no error to allow Apply step to run and maintain resources. + return nil, nil + } + + // No installed bundle and resolution failed - cannot proceed + setStatusProgressing(ext, err) + setInstalledStatusFromRevisionStates(ext, state.revisionStates) + ensureAllConditionsWithReason(ext, ocv1.ReasonRetrying, err.Error()) + return nil, err +} + +// CheckCatalogsExist checks if any ClusterCatalogs matching the extension's selector exist. +// Returns true if at least one matching catalog exists, false if none exist. +// Treats "CRD doesn't exist" errors as "no catalogs exist" (returns false, nil). +// Returns an error only if the check itself fails unexpectedly. +func CheckCatalogsExist(ctx context.Context, c client.Client, ext *ocv1.ClusterExtension) (bool, error) { + var catalogList *ocv1.ClusterCatalogList + var listErr error + + if ext.Spec.Source.Catalog == nil || ext.Spec.Source.Catalog.Selector == nil { + // No selector means all catalogs match - check if any catalogs exist at all + catalogList = &ocv1.ClusterCatalogList{} + listErr = c.List(ctx, catalogList, client.Limit(1)) + } else { + // Convert label selector to k8slabels.Selector + // Note: An empty LabelSelector matches everything by default + selector, err := metav1.LabelSelectorAsSelector(ext.Spec.Source.Catalog.Selector) + if err != nil { + return false, fmt.Errorf("invalid catalog selector: %w", err) + } + + // List catalogs matching the selector (limit to 1 since we only care if any exist) + catalogList = &ocv1.ClusterCatalogList{} + listErr = c.List(ctx, catalogList, client.MatchingLabelsSelector{Selector: selector}, client.Limit(1)) + } + + if listErr != nil { + // Check if the error is because the ClusterCatalog CRD doesn't exist + // This can happen if catalogd is not installed, which means no catalogs exist + if apimeta.IsNoMatchError(listErr) { + return false, nil + } + return false, fmt.Errorf("failed to list catalogs: %w", listErr) + } + + return len(catalogList.Items) > 0, nil +} + func UnpackBundle(i imageutil.Puller, cache imageutil.Cache) ReconcileStepFunc { return func(ctx context.Context, state *reconcileState, ext *ocv1.ClusterExtension) (*ctrl.Result, error) { l := log.FromContext(ctx) + + // Skip unpacking if we're using an already-installed bundle + // (e.g., when catalog is unavailable but we're maintaining current state) + if state.revisionStates != nil && + state.revisionStates.Installed != nil && + state.resolvedRevisionMetadata.Name == state.revisionStates.Installed.Name && + state.resolvedRevisionMetadata.Version == state.revisionStates.Installed.Version { + l.Info("skipping unpack - using installed bundle content") + // imageFS will remain nil; applier implementations MUST handle nil imageFS by using + // existing installed content. See Helm.reconcileExistingRelease() and Boxcutter.apply() + // for nil contentFS handling. 
+ return nil, nil + } + l.Info("unpacking resolved bundle") imageFS, _, _, err := i.Pull(ctx, ext.GetName(), state.resolvedRevisionMetadata.Image, cache) if err != nil { diff --git a/internal/operator-controller/controllers/suite_test.go b/internal/operator-controller/controllers/suite_test.go index 6258a5d307..57305a75fd 100644 --- a/internal/operator-controller/controllers/suite_test.go +++ b/internal/operator-controller/controllers/suite_test.go @@ -106,7 +106,7 @@ func newClientAndReconciler(t *testing.T, opts ...reconcilerOption) (client.Clie } reconciler.ReconcileSteps = []controllers.ReconcileStepFunc{controllers.HandleFinalizers(d.Finalizers), controllers.RetrieveRevisionStates(d.RevisionStatesGetter)} if r := d.Resolver; r != nil { - reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.ResolveBundle(r)) + reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.ResolveBundle(r, cl)) } if i := d.ImagePuller; i != nil { reconciler.ReconcileSteps = append(reconciler.ReconcileSteps, controllers.UnpackBundle(i, d.ImageCache)) diff --git a/test/e2e/features/catalog-deletion-resilience.feature b/test/e2e/features/catalog-deletion-resilience.feature new file mode 100644 index 0000000000..959ba7a4f0 --- /dev/null +++ b/test/e2e/features/catalog-deletion-resilience.feature @@ -0,0 +1,330 @@ +Feature: Workload resilience when catalog is deleted + + As an OLM user, I want my installed extensions to continue working + even if the catalog they were installed from is deleted. + + Background: + Given OLM is available + And ClusterCatalog "test" serves bundles + And ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE} + + # STANDARD RUNTIME TESTS + + Scenario: Extension continues running after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And resource "deployment/test-operator" is available + And resource "configmap/test-configmap" is available + When ClusterCatalog "test" is deleted + Then resource "deployment/test-operator" is available + And resource "configmap/test-configmap" is available + And ClusterExtension reports Installed as True + + Scenario: Resources are restored after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And resource "configmap/test-configmap" exists + When ClusterCatalog "test" is deleted + And resource "configmap/test-configmap" is removed + Then resource "configmap/test-configmap" is eventually restored + + Scenario: Config changes work without catalog + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + 
packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + When ClusterCatalog "test" is deleted + And ClusterExtension is updated to add preflight config + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + install: + preflight: + crdUpgradeSafety: + enforcement: None + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + Then ClusterExtension reports Installed as True + + Scenario: Version upgrade blocked without catalog + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.0.0" + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And bundle "test-operator.1.0.0" is installed in version "1.0.0" + When ClusterCatalog "test" is deleted + And ClusterExtension is updated to change version + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.0.1" + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + Then ClusterExtension reports Progressing as True with Reason Retrying + And bundle "test-operator.1.0.0" is installed in version "1.0.0" + + # BOXCUTTER RUNTIME (EXPERIMENTAL) TESTS + + @BoxcutterRuntime + Scenario: Extension with revisions continues running after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + And resource "deployment/test-operator" is available + When ClusterCatalog "test" is deleted + Then resource "deployment/test-operator" is available + And ClusterExtension reports Installed as True + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + + @BoxcutterRuntime + Scenario: Revision resources are restored after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + And resource "configmap/test-configmap" exists + When ClusterCatalog "test" is deleted 
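+      # With the catalog gone, removing a managed resource exercises the revision's Server-Side Apply restore path.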
+ And resource "configmap/test-configmap" is removed + Then resource "configmap/test-configmap" is eventually restored + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + + @BoxcutterRuntime + Scenario: Revision remains available when workload recovers after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.2.0" + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + When ClusterCatalog "test" is deleted + And resource "deployment/test-operator" reports as not ready + Then ClusterExtensionRevision "${NAME}-1" reports Available as False with Reason ProbeFailure + When resource "deployment/test-operator" reports as ready + Then ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + + @BoxcutterRuntime + Scenario: Version upgrade with revisions blocked without catalog + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.0.0" + upgradeConstraintPolicy: SelfCertified + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + And bundle "test-operator.1.0.0" is installed in version "1.0.0" + When ClusterCatalog "test" is deleted + And ClusterExtension is updated to change version + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.2.0" + upgradeConstraintPolicy: SelfCertified + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + Then ClusterExtension reports Progressing as True with Reason Retrying + And bundle "test-operator.1.0.0" is installed in version "1.0.0" + And ClusterExtensionRevision "${NAME}-1" reports Available as True with Reason ProbesSucceeded + + @BoxcutterRuntime + Scenario: Multiple revisions remain stable after catalog deletion + Given ClusterExtension is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterExtension + metadata: + name: ${NAME} + spec: + namespace: ${TEST_NAMESPACE} + serviceAccount: + name: olm-sa + source: + sourceType: Catalog + catalog: + packageName: test + version: "1.0.0" + upgradeConstraintPolicy: SelfCertified + selector: + matchLabels: + "olm.operatorframework.io/metadata.name": test-catalog + """ + And ClusterExtension is rolled out + And ClusterExtension is available + When ClusterExtension is updated to version "1.0.2" + Then ClusterExtension reports "${NAME}-1, ${NAME}-2" as active revisions + When ClusterCatalog "test" is deleted + Then ClusterExtension reports "${NAME}-1, ${NAME}-2" as active 
revisions
+    And ClusterExtensionRevision "${NAME}-2" reports Progressing as True with Reason Succeeded
+    And resource "deployment/test-operator" is available
diff --git a/test/e2e/steps/steps.go b/test/e2e/steps/steps.go
index e1a55934dd..eda348ed1d 100644
--- a/test/e2e/steps/steps.go
+++ b/test/e2e/steps/steps.go
@@ -85,6 +85,7 @@ func RegisterSteps(sc *godog.ScenarioContext) {
 	sc.Step(`^(?i)ClusterCatalog "([^"]+)" serves bundles$`, CatalogServesBundles)
 	sc.Step(`^"([^"]+)" catalog image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage)
 	sc.Step(`^(?i)ClusterCatalog "([^"]+)" image version "([^"]+)" is also tagged as "([^"]+)"$`, TagCatalogImage)
+	sc.Step(`^(?i)ClusterCatalog "([^"]+)" is deleted$`, CatalogIsDeleted)
 	sc.Step(`^(?i)operator "([^"]+)" target namespace is "([^"]+)"$`, OperatorTargetNamespace)
 	sc.Step(`^(?i)Prometheus metrics are returned in the response$`, PrometheusMetricsAreReturned)
@@ -664,6 +665,38 @@ func TagCatalogImage(name, oldTag, newTag string) error {
 	return crane.Tag(imageRef, newTag, crane.Insecure)
 }
 
+func CatalogIsDeleted(ctx context.Context, catalogName string) error {
+	catalogFullName := fmt.Sprintf("%s-catalog", catalogName)
+	_, err := k8sClient("delete", "clustercatalog", catalogFullName, "--ignore-not-found=true")
+	if err != nil {
+		return fmt.Errorf("failed to delete catalog: %w", err)
+	}
+
+	// Wait for the catalog to be fully deleted, respecting context cancellation.
+	ticker := time.NewTicker(tick)
+	defer ticker.Stop()
+
+	deadline := time.Now().Add(timeout)
+	for {
+		select {
+		case <-ctx.Done():
+			// Return the context error if we timed out or were cancelled while waiting.
+			return ctx.Err()
+		case <-ticker.C:
+			// Check if we've exceeded the timeout
+			if time.Now().After(deadline) {
+				return fmt.Errorf("timeout waiting for catalog %s to be deleted", catalogFullName)
+			}
+
+			_, err := k8sClient("get", "clustercatalog", catalogFullName)
+			if err != nil {
+				// Catalog is deleted - verification succeeded
+				return nil
+			}
+		}
+	}
+}
+
 func PrometheusMetricsAreReturned(ctx context.Context) error {
 	sc := scenarioCtx(ctx)
 	for podName, mr := range sc.metricsResponse {
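Editor's sketch, not part of the diff: the hand-rolled ticker/deadline loop in CatalogIsDeleted could also be expressed with apimachinery's polling helper. This assumes the suite's existing tick, timeout, and k8sClient identifiers and is illustrative only:

// waitForCatalogDeleted is a hypothetical replacement for the wait loop above,
// using wait "k8s.io/apimachinery/pkg/util/wait". It polls immediately and then
// every tick until the get fails (treated as "deleted", matching the current
// step) or the timeout elapses; context cancellation is honored by the helper.
func waitForCatalogDeleted(ctx context.Context, catalogFullName string) error {
	return wait.PollUntilContextTimeout(ctx, tick, timeout, true, func(ctx context.Context) (bool, error) {
		if _, err := k8sClient("get", "clustercatalog", catalogFullName); err != nil {
			return true, nil // catalog no longer retrievable - consider it deleted
		}
		return false, nil
	})
}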