From 87d2a41b433ff81559557c0d256a926211def9cf Mon Sep 17 00:00:00 2001 From: Gianluca Mardente Date: Sun, 5 Apr 2026 16:46:02 +0200 Subject: [PATCH] (bug) Fix HelmReleaseSummary.FailureMessage never persisted after chart deployment failure When a Helm chart deployment fails, setHelmFailureMessageOnHelmChartSummary correctly sets FailureMessage on clusterSummary.Status.HelmReleaseSummaries[i] in memory, but this value was never written back to the API server. The root cause is a sequencing issue: updateStatusForReferencedHelmReleases (which persists HelmReleaseSummaries) runs before walkChartsAndDeploy. After walkChartsAndDeploy fails and sets the FailureMessage in memory, updateStatusForNonReferencedHelmReleases then re-fetches a fresh ClusterSummary from the API and overwrites the status, so discarding the in-memory change entirely. As a result, HelmReleaseSummaries[].FailureMessage always remained empty even after repeated failures, while featureSummaries[].failureMessage (set through a separate path) correctly reflected the error. A secondary bug: setHelmFailureMessageOnHelmChartSummary was passed currentChart (the raw, pre-template spec) instead of instantiatedChart. The lookup compares ReleaseName/ReleaseNamespace against values stored in HelmReleaseSummaries, which are the instantiated values. When those fields contain Go templates, the lookup would silently find no match and the FailureMessage would not be set at all. Fix consist in: 1. Pass instantiatedChart (post-template) instead of currentChart to setHelmFailureMessageOnHelmChartSummary so the lookup always matches what is stored in HelmReleaseSummaries. 2. In updateStatusForNonReferencedHelmReleases, before overwriting the status with the freshly fetched ClusterSummary, build an index of the in-memory FailureMessage values set by walkChartsAndDeploy and merge them into the entries being written. This ensures the failure from the current reconciliation round is persisted to the API. --- Makefile | 3 + controllers/handlers_helm.go | 16 ++- test/fv/helm_failure_message_test.go | 143 +++++++++++++++++++++++++++ 3 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 test/fv/helm_failure_message_test.go diff --git a/Makefile b/Makefile index 50d6bd5f..ecbb7c94 100644 --- a/Makefile +++ b/Makefile @@ -445,6 +445,9 @@ deploy-projectsveltos: $(KUSTOMIZE) $(ENVSUBST) $(KUBECTL) # Install sveltoscluster-manager $(KUBECTL) apply -f https://raw.githubusercontent.com/projectsveltos/sveltoscluster-manager/$(TAG)/manifest/manifest.yaml + # Install register-mgmt-cluster + $(KUBECTL) apply -f https://raw.githubusercontent.com/projectsveltos/register-mgmt-cluster/$(TAG)/manifest/manifest.yaml + @echo "Waiting for projectsveltos addon-controller to be available..." $(KUBECTL) wait --for=condition=Available deployment/addon-controller -n projectsveltos --timeout=$(TIMEOUT) diff --git a/controllers/handlers_helm.go b/controllers/handlers_helm.go index 2d5db80c..3a1e3a37 100644 --- a/controllers/handlers_helm.go +++ b/controllers/handlers_helm.go @@ -1009,7 +1009,7 @@ func walkChartsAndDeploy(ctx context.Context, c client.Client, clusterSummary *c var currentRelease *releaseInfo currentRelease, report, err = handleChart(ctx, clusterSummary, mgmtResources, instantiatedChart, kubeconfig, isPullMode, logger) - setHelmFailureMessageOnHelmChartSummary(clusterSummary, currentChart, err) + setHelmFailureMessageOnHelmChartSummary(clusterSummary, instantiatedChart, err) if err != nil { if clusterSummary.Spec.ClusterProfileSpec.ContinueOnError { errorMsg += fmt.Sprintf("chart: %s, release: %s, %v\n", @@ -2729,11 +2729,23 @@ func updateStatusForNonReferencedHelmReleases(ctx context.Context, c client.Clie return clusterSummary, err } + // Index the in-memory FailureMessages written by walkChartsAndDeploy so they are + // not lost when we overwrite the status from the freshly fetched currentClusterSummary. + inMemoryFailure := make(map[string]*string, len(clusterSummary.Status.HelmReleaseSummaries)) + for i := range clusterSummary.Status.HelmReleaseSummaries { + s := &clusterSummary.Status.HelmReleaseSummaries[i] + inMemoryFailure[helmInfo(s.ReleaseNamespace, s.ReleaseName)] = s.FailureMessage + } + helmReleaseSummaries := make([]configv1beta1.HelmChartSummary, 0, len(currentClusterSummary.Status.HelmReleaseSummaries)) for i := range currentClusterSummary.Status.HelmReleaseSummaries { summary := ¤tClusterSummary.Status.HelmReleaseSummaries[i] if _, ok := currentlyReferenced[helmInfo(summary.ReleaseNamespace, summary.ReleaseName)]; ok { - helmReleaseSummaries = append(helmReleaseSummaries, *summary) + entry := *summary + if msg, exists := inMemoryFailure[helmInfo(summary.ReleaseNamespace, summary.ReleaseName)]; exists { + entry.FailureMessage = msg + } + helmReleaseSummaries = append(helmReleaseSummaries, entry) } } diff --git a/test/fv/helm_failure_message_test.go b/test/fv/helm_failure_message_test.go new file mode 100644 index 00000000..1251eb74 --- /dev/null +++ b/test/fv/helm_failure_message_test.go @@ -0,0 +1,143 @@ +/* +Copyright 2026. projectsveltos.io. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fv_test + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" + + configv1beta1 "github.com/projectsveltos/addon-controller/api/v1beta1" + "github.com/projectsveltos/addon-controller/lib/clusterops" + libsveltosv1beta1 "github.com/projectsveltos/libsveltos/api/v1beta1" +) + +var _ = Describe("Helm with conflicts", func() { + const ( + namePrefix = "helm-failures-" + mgmt = "mgmt" + ) + + It("ClusterProfile deploying helm charts with failures", Label("FV", "PULLMODE", "EXTENDED"), func() { + Byf("Create a ClusterProfile matching mgmt Cluster") + + clusterProfile := &configv1beta1.ClusterProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: namePrefix + randomString(), + }, + Spec: configv1beta1.Spec{ + ClusterRefs: []corev1.ObjectReference{ + { + APIVersion: libsveltosv1beta1.GroupVersion.String(), + Kind: libsveltosv1beta1.SveltosClusterKind, + Namespace: mgmt, + Name: mgmt, + }, + }, + }, + } + clusterProfile.Spec.SyncMode = configv1beta1.SyncModeContinuous + Expect(k8sClient.Create(context.TODO(), clusterProfile)).To(Succeed()) + Byf("Created ClusterProfile %s", clusterProfile.Name) + + verifyClusterSummary(clusterops.ClusterProfileLabelName, + clusterProfile.Name, &clusterProfile.Spec, + mgmt, mgmt, string(libsveltosv1beta1.ClusterTypeSveltos)) + + currentClusterProfile := &configv1beta1.ClusterProfile{} + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + Expect(k8sClient.Get(context.TODO(), + types.NamespacedName{Name: clusterProfile.Name}, currentClusterProfile)).To(Succeed()) + currentClusterProfile.Spec.HelmCharts = []configv1beta1.HelmChart{ + { + RepositoryURL: "oci://ghcr.io/k0rdent/catalog/charts", + RepositoryName: "ingress-nginx", + ChartName: "ingress-nginx", + ChartVersion: "4.13.0", + ReleaseName: "nginx", + ReleaseNamespace: "nginx", + HelmChartAction: configv1beta1.HelmChartActionInstall, + Options: &configv1beta1.HelmOptions{ + InstallOptions: configv1beta1.HelmInstallOptions{ + CreateNamespace: true, + Replace: true, + }, + Timeout: &metav1.Duration{Duration: time.Second}, + }, + RegistryCredentialsConfig: &configv1beta1.RegistryCredentialsConfig{ + PlainHTTP: true, + }, + Values: `ingress-nginx: + global: + image: + registry: this.does.not.exist.com`, + }, + { + RepositoryURL: "oci://ghcr.io/k0rdent/catalog/charts", + RepositoryName: "postgres-operator", + ChartName: "postgres-operator", + ChartVersion: "1.15.1", + ReleaseName: "postgres-operator", + ReleaseNamespace: "postgres-operator", + HelmChartAction: configv1beta1.HelmChartActionInstall, + RegistryCredentialsConfig: &configv1beta1.RegistryCredentialsConfig{ + PlainHTTP: true, + }, + }, + } + return k8sClient.Update(context.TODO(), currentClusterProfile) + }) + Expect(err).To(BeNil()) + + Expect(k8sClient.Get(context.TODO(), + types.NamespacedName{Name: clusterProfile.Name}, currentClusterProfile)).To(Succeed()) + + clusterSummary := verifyClusterSummary(clusterops.ClusterProfileLabelName, + currentClusterProfile.Name, ¤tClusterProfile.Spec, + mgmt, mgmt, string(libsveltosv1beta1.ClusterTypeSveltos)) + + By("Verifying helmReleaseSummaries reports error") + Eventually(func() bool { + currentClusterSummary := &configv1beta1.ClusterSummary{} + err = k8sClient.Get(context.TODO(), + types.NamespacedName{Namespace: clusterSummary.Namespace, Name: clusterSummary.Name}, + currentClusterSummary) + if err != nil { + return false + } + + for i := range currentClusterSummary.Status.HelmReleaseSummaries { + if currentClusterSummary.Status.HelmReleaseSummaries[i].FailureMessage != nil && + *currentClusterSummary.Status.HelmReleaseSummaries[i].FailureMessage == "context deadline exceeded" { + return true + } + } + return false + }, timeout, pollingInterval).Should(BeTrue()) + + Expect(k8sClient.Get(context.TODO(), + types.NamespacedName{Name: clusterProfile.Name}, currentClusterProfile)).To(Succeed()) + + Expect(k8sClient.Delete(context.TODO(), currentClusterProfile)).To(Succeed()) + }) +})