Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 47 additions & 1 deletion internal/controller/datadogagent/feature/gpu/feature.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/component/agent"
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature"
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/object/volume"
"github.com/DataDog/datadog-operator/pkg/kubernetes"
)

func init() {
Expand All @@ -27,6 +28,8 @@ func buildFeature(*feature.Options) feature.Feature {
}

type gpuFeature struct {
// owner is the DatadogAgent or DatadogAgentInternal object that owns this feature
owner metav1.Object
// podRuntimeClassName is the value to set in the runtimeClassName
// configuration of the agent pod. If this is empty, the runtimeClassName
// will not be changed.
Expand All @@ -42,11 +45,12 @@ func (f *gpuFeature) ID() feature.IDType {
}

// Configure is used to configure the feature from a v2alpha1.DatadogAgent instance.
func (f *gpuFeature) Configure(_ metav1.Object, ddaSpec *v2alpha1.DatadogAgentSpec, _ *v2alpha1.RemoteConfigConfiguration) (reqComp feature.RequiredComponents) {
func (f *gpuFeature) Configure(dda metav1.Object, ddaSpec *v2alpha1.DatadogAgentSpec, _ *v2alpha1.RemoteConfigConfiguration) (reqComp feature.RequiredComponents) {
if ddaSpec.Features == nil || ddaSpec.Features.GPU == nil || !apiutils.BoolValue(ddaSpec.Features.GPU.Enabled) {
return reqComp
}

f.owner = dda
f.isPrivilegedModeEnabled = apiutils.BoolValue(ddaSpec.Features.GPU.PrivilegedMode)
f.patchCgroupPermissions = apiutils.BoolValue(ddaSpec.Features.GPU.PatchCgroupPermissions)

Expand Down Expand Up @@ -77,6 +81,48 @@ func (f *gpuFeature) Configure(_ metav1.Object, ddaSpec *v2alpha1.DatadogAgentSp
// ManageDependencies allows a feature to manage its dependencies.
// Feature's dependencies should be added in the store.
func (f *gpuFeature) ManageDependencies(managers feature.ResourceManagers, provider string) error {
// When GPU privileged mode is enabled, system-probe container is required
// and needs the seccomp ConfigMap. Create it here as a feature dependency.
logger := managers.Store().Logger().WithName("gpu-feature")
logger.V(2).Info("ManageDependencies called",
"privilegedMode", f.isPrivilegedModeEnabled,
"hasOwner", f.owner != nil)

if f.isPrivilegedModeEnabled && f.owner != nil {
configMapName := common.GetDefaultSeccompConfigMapName(f.owner)
namespace := f.owner.GetNamespace()
logger.V(2).Info("Creating system-probe seccomp ConfigMap",
"name", configMapName,
"namespace", namespace)

// Create the system-probe seccomp ConfigMap with default profile
seccompConfigData := agent.DefaultSeccompConfigDataForSystemProbe()
if seccompConfigData != nil {
logger.V(2).Info("Got seccomp config data",
"dataKeys", len(seccompConfigData))

configMap := &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: configMapName,
Namespace: namespace,
},
Data: seccompConfigData,
}
// Use Store().AddOrUpdate like kubernetesstatecore does
if err := managers.Store().AddOrUpdate(kubernetes.ConfigMapKind, configMap); err != nil {
logger.Error(err, "Failed to add ConfigMap to store")
return err
}
logger.Info("Successfully added system-probe seccomp ConfigMap to store",
"name", configMapName)
} else {
logger.V(2).Info("No seccomp config data returned")
}
} else {
logger.V(2).Info("Skipping seccomp ConfigMap creation",
"privilegedMode", f.isPrivilegedModeEnabled,
"hasOwner", f.owner != nil)
}
return nil
}

Expand Down
86 changes: 86 additions & 0 deletions internal/controller/datadogagent/feature/gpu/feature_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import (
"github.com/google/go-cmp/cmp"
"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
logf "sigs.k8s.io/controller-runtime/pkg/log"

apicommon "github.com/DataDog/datadog-operator/api/datadoghq/common"
"github.com/DataDog/datadog-operator/api/datadoghq/v2alpha1"
Expand All @@ -16,6 +19,8 @@ import (
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature"
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/fake"
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/feature/test"
"github.com/DataDog/datadog-operator/internal/controller/datadogagent/store"
"github.com/DataDog/datadog-operator/pkg/kubernetes"
)

const alternativeRuntimeClass = "nvidia-like"
Expand Down Expand Up @@ -408,3 +413,84 @@ func Test_GPUMonitoringFeature_Configure(t *testing.T) {

tests.Run(t, buildFeature)
}

func Test_GPUFeature_ManageDependencies(t *testing.T) {
logger := logf.Log.WithName("test")

tests := []struct {
name string
dda *v2alpha1.DatadogAgent
gpuFeature *gpuFeature
wantSeccompConfigMap bool
wantConfigMapName string
}{
{
name: "GPU privileged mode enabled - should create seccomp ConfigMap",
dda: &v2alpha1.DatadogAgent{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dda",
Namespace: "default",
},
},
gpuFeature: &gpuFeature{
isPrivilegedModeEnabled: true,
},
wantSeccompConfigMap: true,
wantConfigMapName: "test-dda-system-probe-seccomp",
},
{
name: "GPU privileged mode disabled - should not create seccomp ConfigMap",
dda: &v2alpha1.DatadogAgent{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dda",
Namespace: "default",
},
},
gpuFeature: &gpuFeature{
isPrivilegedModeEnabled: false,
},
wantSeccompConfigMap: false,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Set the owner in the feature
tt.gpuFeature.owner = tt.dda

// Create a fake store and managers
scheme := runtime.NewScheme()
_ = corev1.AddToScheme(scheme)
_ = v2alpha1.AddToScheme(scheme)
storeOptions := &store.StoreOptions{
Scheme: scheme,
Logger: logger,
}
depsStore := store.NewStore(tt.dda, storeOptions)
managers := feature.NewResourceManagers(depsStore)

// Call ManageDependencies
err := tt.gpuFeature.ManageDependencies(managers, "")
assert.NoError(t, err)

// Check if the ConfigMap was added to the store
if tt.wantSeccompConfigMap {
obj, found := depsStore.Get(kubernetes.ConfigMapKind, tt.dda.GetNamespace(), tt.wantConfigMapName)
assert.True(t, found, "Expected seccomp ConfigMap to be created")

if found {
cm := obj.(*corev1.ConfigMap)
assert.Equal(t, tt.wantConfigMapName, cm.Name)
assert.Equal(t, tt.dda.GetNamespace(), cm.Namespace)

// Check that it contains the seccomp profile
_, hasSeccompKey := cm.Data[common.SystemProbeSeccompKey]
assert.True(t, hasSeccompKey, "ConfigMap should contain system-probe-seccomp.json key")
}
} else {
_, found := depsStore.Get(kubernetes.ConfigMapKind, tt.dda.GetNamespace(), tt.wantConfigMapName)
assert.False(t, found, "Expected no seccomp ConfigMap to be created")
}
})
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,20 @@ func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger
depsStore, resourceManagers := r.setupDependencies(instance, logger)

var err error
// only manage dependencies for default DDAIs
// only manage global dependencies for default DDAIs (profile DDAIs should share these)
if !isDDAILabeledWithProfile(instance) {
if err = r.manageGlobalDependencies(logger, instance, resourceManagers, requiredComponents); err != nil {
return r.updateStatusIfNeededV2(logger, instance, newStatus, reconcile.Result{}, err, now)
}
if err = r.manageFeatureDependencies(logger, enabledFeatures, resourceManagers); err != nil {
return r.updateStatusIfNeededV2(logger, instance, newStatus, reconcile.Result{}, err, now)
}
}

// Always manage feature dependencies (even for profile DDAIs, as features may differ between profiles)
if err = r.manageFeatureDependencies(logger, enabledFeatures, resourceManagers); err != nil {
return r.updateStatusIfNeededV2(logger, instance, newStatus, reconcile.Result{}, err, now)
}

// Only manage override dependencies for default DDAIs
if !isDDAILabeledWithProfile(instance) {
if err = r.overrideDependencies(logger, resourceManagers, instance); err != nil {
return r.updateStatusIfNeededV2(logger, instance, newStatus, reconcile.Result{}, err, now)
}
Expand Down Expand Up @@ -104,12 +110,10 @@ func (r *Reconciler) reconcileInstanceV2(ctx context.Context, logger logr.Logger
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, err, now)
}

// 4. Apply and cleanup dependencies.
// only manage dependencies for default DDAIs
if !isDDAILabeledWithProfile(instance) {
if err = r.applyAndCleanupDependencies(ctx, logger, depsStore); err != nil {
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, err, now)
}
// 4. Apply all dependencies.
// Always apply dependencies, even for profile DDAIs, as they may have feature-specific dependencies
if err = r.applyAndCleanupDependencies(ctx, logger, depsStore); err != nil {
return r.updateStatusIfNeededV2(logger, instance, newStatus, result, err, now)
}

// Always requeue
Expand Down
Loading