diff --git a/internal/controller/datadogagent/feature/gpu/feature.go b/internal/controller/datadogagent/feature/gpu/feature.go index 539df8d96..4146e361f 100644 --- a/internal/controller/datadogagent/feature/gpu/feature.go +++ b/internal/controller/datadogagent/feature/gpu/feature.go @@ -14,6 +14,7 @@ import ( "github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent" "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature" "github.com/DataDog/datadog-operator/internal/controller/datadogagent/object/volume" + "github.com/DataDog/datadog-operator/pkg/kubernetes" ) func init() { @@ -27,6 +28,8 @@ func buildFeature(*feature.Options) feature.Feature { } type gpuFeature struct { + // owner is the DatadogAgent or DatadogAgentInternal object that owns this feature + owner metav1.Object // podRuntimeClassName is the value to set in the runtimeClassName // configuration of the agent pod. If this is empty, the runtimeClassName // will not be changed. @@ -42,11 +45,12 @@ func (f *gpuFeature) ID() feature.IDType { } // Configure is used to configure the feature from a v2alpha1.DatadogAgent instance. -func (f *gpuFeature) Configure(_ metav1.Object, ddaSpec *v2alpha1.DatadogAgentSpec, _ *v2alpha1.RemoteConfigConfiguration) (reqComp feature.RequiredComponents) { +func (f *gpuFeature) Configure(dda metav1.Object, ddaSpec *v2alpha1.DatadogAgentSpec, _ *v2alpha1.RemoteConfigConfiguration) (reqComp feature.RequiredComponents) { if ddaSpec.Features == nil || ddaSpec.Features.GPU == nil || !apiutils.BoolValue(ddaSpec.Features.GPU.Enabled) { return reqComp } + f.owner = dda f.isPrivilegedModeEnabled = apiutils.BoolValue(ddaSpec.Features.GPU.PrivilegedMode) f.patchCgroupPermissions = apiutils.BoolValue(ddaSpec.Features.GPU.PatchCgroupPermissions) @@ -77,6 +81,48 @@ func (f *gpuFeature) Configure(_ metav1.Object, ddaSpec *v2alpha1.DatadogAgentSp // ManageDependencies allows a feature to manage its dependencies. // Feature's dependencies should be added in the store. func (f *gpuFeature) ManageDependencies(managers feature.ResourceManagers, provider string) error { + // When GPU privileged mode is enabled, system-probe container is required + // and needs the seccomp ConfigMap. Create it here as a feature dependency. + logger := managers.Store().Logger().WithName("gpu-feature") + logger.V(2).Info("ManageDependencies called", + "privilegedMode", f.isPrivilegedModeEnabled, + "hasOwner", f.owner != nil) + + if f.isPrivilegedModeEnabled && f.owner != nil { + configMapName := common.GetDefaultSeccompConfigMapName(f.owner) + namespace := f.owner.GetNamespace() + logger.V(2).Info("Creating system-probe seccomp ConfigMap", + "name", configMapName, + "namespace", namespace) + + // Create the system-probe seccomp ConfigMap with default profile + seccompConfigData := agent.DefaultSeccompConfigDataForSystemProbe() + if seccompConfigData != nil { + logger.V(2).Info("Got seccomp config data", + "dataKeys", len(seccompConfigData)) + + configMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: namespace, + }, + Data: seccompConfigData, + } + // Use Store().AddOrUpdate like kubernetesstatecore does + if err := managers.Store().AddOrUpdate(kubernetes.ConfigMapKind, configMap); err != nil { + logger.Error(err, "Failed to add ConfigMap to store") + return err + } + logger.Info("Successfully added system-probe seccomp ConfigMap to store", + "name", configMapName) + } else { + logger.V(2).Info("No seccomp config data returned") + } + } else { + logger.V(2).Info("Skipping seccomp ConfigMap creation", + "privilegedMode", f.isPrivilegedModeEnabled, + "hasOwner", f.owner != nil) + } return nil } diff --git a/internal/controller/datadogagent/feature/gpu/feature_test.go b/internal/controller/datadogagent/feature/gpu/feature_test.go index 3370683aa..b1ab52af7 100644 --- a/internal/controller/datadogagent/feature/gpu/feature_test.go +++ b/internal/controller/datadogagent/feature/gpu/feature_test.go @@ -7,6 +7,9 @@ import ( "github.com/google/go-cmp/cmp" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + logf "sigs.k8s.io/controller-runtime/pkg/log" apicommon "github.com/DataDog/datadog-operator/api/datadoghq/common" "github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1" @@ -16,6 +19,8 @@ import ( "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature" "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/fake" "github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/test" + "github.com/DataDog/datadog-operator/internal/controller/datadogagent/store" + "github.com/DataDog/datadog-operator/pkg/kubernetes" ) const alternativeRuntimeClass = "nvidia-like" @@ -408,3 +413,84 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) { tests.Run(t, buildFeature) } + +func Test_GPUFeature_ManageDependencies(t *testing.T) { + logger := logf.Log.WithName("test") + + tests := []struct { + name string + dda *v2alpha1.DatadogAgent + gpuFeature *gpuFeature + wantSeccompConfigMap bool + wantConfigMapName string + }{ + { + name: "GPU privileged mode enabled - should create seccomp ConfigMap", + dda: &v2alpha1.DatadogAgent{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dda", + Namespace: "default", + }, + }, + gpuFeature: &gpuFeature{ + isPrivilegedModeEnabled: true, + }, + wantSeccompConfigMap: true, + wantConfigMapName: "test-dda-system-probe-seccomp", + }, + { + name: "GPU privileged mode disabled - should not create seccomp ConfigMap", + dda: &v2alpha1.DatadogAgent{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dda", + Namespace: "default", + }, + }, + gpuFeature: &gpuFeature{ + isPrivilegedModeEnabled: false, + }, + wantSeccompConfigMap: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Set the owner in the feature + tt.gpuFeature.owner = tt.dda + + // Create a fake store and managers + scheme := runtime.NewScheme() + _ = corev1.AddToScheme(scheme) + _ = v2alpha1.AddToScheme(scheme) + storeOptions := &store.StoreOptions{ + Scheme: scheme, + Logger: logger, + } + depsStore := store.NewStore(tt.dda, storeOptions) + managers := feature.NewResourceManagers(depsStore) + + // Call ManageDependencies + err := tt.gpuFeature.ManageDependencies(managers, "") + assert.NoError(t, err) + + // Check if the ConfigMap was added to the store + if tt.wantSeccompConfigMap { + obj, found := depsStore.Get(kubernetes.ConfigMapKind, tt.dda.GetNamespace(), tt.wantConfigMapName) + assert.True(t, found, "Expected seccomp ConfigMap to be created") + + if found { + cm := obj.(*corev1.ConfigMap) + assert.Equal(t, tt.wantConfigMapName, cm.Name) + assert.Equal(t, tt.dda.GetNamespace(), cm.Namespace) + + // Check that it contains the seccomp profile + _, hasSeccompKey := cm.Data[common.SystemProbeSeccompKey] + assert.True(t, hasSeccompKey, "ConfigMap should contain system-probe-seccomp.json key") + } + } else { + _, found := depsStore.Get(kubernetes.ConfigMapKind, tt.dda.GetNamespace(), tt.wantConfigMapName) + assert.False(t, found, "Expected no seccomp ConfigMap to be created") + } + }) + } +} diff --git a/internal/controller/datadogagentinternal/controller_reconcile_v2.go b/internal/controller/datadogagentinternal/controller_reconcile_v2.go index 3a89c0421..b5c77c227 100644 --- a/internal/controller/datadogagentinternal/controller_reconcile_v2.go +++ b/internal/controller/datadogagentinternal/controller_reconcile_v2.go @@ -61,14 +61,20 @@ func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger depsStore, resourceManagers := r.setupDependencies(instance, logger) var err error - // only manage dependencies for default DDAIs + // only manage global dependencies for default DDAIs (profile DDAIs should share these) if !isDDAILabeledWithProfile(instance) { if err = r.manageGlobalDependencies(logger, instance, resourceManagers, requiredComponents); err != nil { return r.updateStatusIfNeededV2(logger, instance, newStatus, reconcile.Result{}, err, now) } - if err = r.manageFeatureDependencies(logger, enabledFeatures, resourceManagers); err != nil { - return r.updateStatusIfNeededV2(logger, instance, newStatus, reconcile.Result{}, err, now) - } + } + + // Always manage feature dependencies (even for profile DDAIs, as features may differ between profiles) + if err = r.manageFeatureDependencies(logger, enabledFeatures, resourceManagers); err != nil { + return r.updateStatusIfNeededV2(logger, instance, newStatus, reconcile.Result{}, err, now) + } + + // Only manage override dependencies for default DDAIs + if !isDDAILabeledWithProfile(instance) { if err = r.overrideDependencies(logger, resourceManagers, instance); err != nil { return r.updateStatusIfNeededV2(logger, instance, newStatus, reconcile.Result{}, err, now) } @@ -104,12 +110,10 @@ func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger return r.updateStatusIfNeededV2(logger, instance, newStatus, result, err, now) } - // 4. Apply and cleanup dependencies. - // only manage dependencies for default DDAIs - if !isDDAILabeledWithProfile(instance) { - if err = r.applyAndCleanupDependencies(ctx, logger, depsStore); err != nil { - return r.updateStatusIfNeededV2(logger, instance, newStatus, result, err, now) - } + // 4. Apply all dependencies. + // Always apply dependencies, even for profile DDAIs, as they may have feature-specific dependencies + if err = r.applyAndCleanupDependencies(ctx, logger, depsStore); err != nil { + return r.updateStatusIfNeededV2(logger, instance, newStatus, result, err, now) } // Always requeue