From c440a575b193bcf8c4c27522e9fb4a5b78aa16c9 Mon Sep 17 00:00:00 2001 From: Gianluca Mardente Date: Sun, 7 Jun 2026 14:06:28 +0200 Subject: [PATCH] (bug) fixes various bugs and performance issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Bugs** - Corrected a logging mistake where the cluster name was accidentally populated with the cluster namespace in two places, making log lines useless when debugging by cluster name. - Fixed wrong control flow in the pull-mode agent status handler: one error type was being checked in the wrong branch (it was only reachable when there was no error, so it never fired), and the function could panic with a nil dereference when the agent returned an error without a status payload. - Fixed a data race in the Helm chart manager: one method was reading from a shared map without holding the mutex, while concurrent reconcilers could be writing to it. **Performance** - The dependency manager's background update loop was holding a write lock — which blocks all concurrent reconcilers — across a batch of API calls. It now snapshots what needs updating, releases the lock, makes the API calls, then re-acquires the lock only to clear the completed entries. - The dependency manager's startup rebuild loop was spinning at full speed retrying on any API error with no delay, which could hammer the API server during an outage. It now waits 5 seconds between retries and respects context cancellation. --- controllers/chartmanager/chartmanager.go | 3 ++ controllers/clustersummary_deployer.go | 21 +++++++----- controllers/dependencymanager/manager.go | 42 ++++++++++++++++++++---- test/pullmode-sveltosapplier.yaml | 2 +- 4 files changed, 53 insertions(+), 15 deletions(-) diff --git a/controllers/chartmanager/chartmanager.go b/controllers/chartmanager/chartmanager.go index 3d59d7a8..0a89bb69 100644 --- a/controllers/chartmanager/chartmanager.go +++ b/controllers/chartmanager/chartmanager.go @@ -433,6 +433,9 @@ func (m *instance) GetManagerForChart(clusterNamespace, clusterName string, clusterKey := m.getClusterKey(clusterNamespace, clusterName, clusterType) releaseKey := m.GetReleaseKey(chart.ReleaseNamespace, chart.ReleaseName) + m.chartMux.Lock() + defer m.chartMux.Unlock() + if _, ok := m.perClusterChartMap[clusterKey]; !ok { return "", fmt.Errorf("no ClusterSummary manging helm chart %s (release %s)", chart.ChartName, chart.ReleaseName) diff --git a/controllers/clustersummary_deployer.go b/controllers/clustersummary_deployer.go index f6edfc2d..f8413986 100644 --- a/controllers/clustersummary_deployer.go +++ b/controllers/clustersummary_deployer.go @@ -77,7 +77,7 @@ func (r *ClusterSummaryReconciler) deployFeature(ctx context.Context, clusterSum clusterSummary := clusterSummaryScope.ClusterSummary logger = logger.WithValues("clusternamespace", clusterSummary.Spec.ClusterNamespace, - "clustername", clusterSummary.Spec.ClusterNamespace, + "clustername", clusterSummary.Spec.ClusterName, "applicant", clusterSummary.Name, "feature", string(f.id)) logger.V(logs.LogDebug).Info("request to deploy") @@ -414,7 +414,7 @@ func (r *ClusterSummaryReconciler) undeployFeature(ctx context.Context, clusterS clusterSummary := clusterSummaryScope.ClusterSummary logger = logger.WithValues("clusternamespace", clusterSummary.Spec.ClusterNamespace, - "clustername", clusterSummary.Spec.ClusterNamespace, + "clustername", clusterSummary.Spec.ClusterName, "applicant", clusterSummary.Name, "feature", string(f.id)) logger.V(logs.LogDebug).Info("request to un-deploy") @@ -732,15 +732,20 @@ func (r *ClusterSummaryReconciler) proceesAgentDeploymentStatus(ctx context.Cont provisioning := libsveltosv1beta1.FeatureStatusProvisioning return &provisioning, true } - errorMsg := err.Error() - clusterSummaryScope.SetFailureMessage(f.id, &errorMsg) - } else if pullmode.IsActionNotSetToDeploy(err) { - _ = pullmode.TerminateDeploymentTracking(ctx, r.Client, clusterSummary.Spec.ClusterNamespace, - clusterSummary.Spec.ClusterName, clusterSummary.Kind, clusterSummary.Name, string(f.id), logger) - } else if status.FailureMessage != nil { + if pullmode.IsActionNotSetToDeploy(err) { + _ = pullmode.TerminateDeploymentTracking(ctx, r.Client, clusterSummary.Spec.ClusterNamespace, + clusterSummary.Spec.ClusterName, clusterSummary.Kind, clusterSummary.Name, string(f.id), logger) + } else { + errorMsg := err.Error() + clusterSummaryScope.SetFailureMessage(f.id, &errorMsg) + } + } else if status != nil && status.FailureMessage != nil { clusterSummaryScope.SetFailureMessage(f.id, status.FailureMessage) } + if status == nil { + return nil, false + } return status.DeploymentStatus, false } diff --git a/controllers/dependencymanager/manager.go b/controllers/dependencymanager/manager.go index cf5d5e16..43cd8f04 100644 --- a/controllers/dependencymanager/manager.go +++ b/controllers/dependencymanager/manager.go @@ -489,10 +489,22 @@ func (m *instance) updateClusterProfile(ctx context.Context, c client.Client, cl func (m *instance) updateProfiles(ctx context.Context, c client.Client, logger logr.Logger) { const interval = 30 * time.Second for { + // Snapshot the dirty set and their cluster deployments under the lock. m.chartMux.Lock() + toUpdate := make([]corev1.ObjectReference, 0, len(m.profileToBeUpdated)) + for profile := range m.profileToBeUpdated { + toUpdate = append(toUpdate, profile) + } + clustersByProfile := make(map[corev1.ObjectReference]ClusterDeployments, len(toUpdate)) + for i := range toUpdate { + clustersByProfile[toUpdate[i]] = m.profileClusterRequests.getClusterDeployments(&toUpdate[i]) + } + m.chartMux.Unlock() + // Make API calls without holding the lock. + succeeded := make([]corev1.ObjectReference, 0, len(toUpdate)) canceled := false - for profile := range m.profileToBeUpdated { + for i := range toUpdate { select { case <-ctx.Done(): canceled = true @@ -501,15 +513,22 @@ func (m *instance) updateProfiles(ctx context.Context, c client.Client, logger l if canceled { break } - clusters := m.profileClusterRequests.getClusterDeployments(&profile) + profile := toUpdate[i] + clusters := clustersByProfile[profile] logger.V(logs.LogDebug).Info(fmt.Sprintf("updating prerequestite profile %s/%s", profile.Namespace, profile.Name)) - err := m.updateProfileInstance(ctx, c, &profile, clusters) - if err == nil { - delete(m.profileToBeUpdated, profile) + if err := m.updateProfileInstance(ctx, c, &profile, clusters); err == nil { + succeeded = append(succeeded, profile) } } - m.chartMux.Unlock() + // Remove successfully updated entries under the lock. + if len(succeeded) > 0 { + m.chartMux.Lock() + for i := range succeeded { + delete(m.profileToBeUpdated, succeeded[i]) + } + m.chartMux.Unlock() + } if canceled { return @@ -524,11 +543,22 @@ func (m *instance) updateProfiles(ctx context.Context, c client.Client, logger l } func (m *instance) rebuildState(ctx context.Context, c client.Client, logger logr.Logger) { + const retryInterval = 5 * time.Second for { if err := m.rebuildStateWithProfiles(ctx, c, logger); err != nil { + select { + case <-ctx.Done(): + return + case <-time.After(retryInterval): + } continue } if err := m.rebuildStateWithClusterProfiles(ctx, c, logger); err != nil { + select { + case <-ctx.Done(): + return + case <-time.After(retryInterval): + } continue } break diff --git a/test/pullmode-sveltosapplier.yaml b/test/pullmode-sveltosapplier.yaml index 462d4a0c..a3c952e6 100644 --- a/test/pullmode-sveltosapplier.yaml +++ b/test/pullmode-sveltosapplier.yaml @@ -99,7 +99,7 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace - image: docker.io/projectsveltos/sveltos-applier@sha256:8262f5598d29079afe5d71c221d9db2e89a752b9798fa16ab03a3496abc631c2 + image: docker.io/projectsveltos/sveltos-applier@sha256:d9009aaba4eea6fce0a94c17f9ed93fb495bc1bedee829bad0cf7c7a37725491 livenessProbe: failureThreshold: 3 httpGet: