From 304da047e484185d980121655b69b61372b55d1b Mon Sep 17 00:00:00 2001 From: dberkerdem Date: Sun, 1 Feb 2026 22:49:54 +0300 Subject: [PATCH 1/3] feat: add --disable-gpu-monitoring flag Add a new command-line flag to disable GPU monitoring (NVML) at startup. ## Motivation On GPU nodes, the NVML library holds device handles that can interfere with GPU management operations. Specifically: - **GPU reset operations fail**: When `nvidia-smi --gpu-reset` is executed, it cannot reset GPUs that have open handles from other processes. The coroot-node-agent's GPU monitoring holds these handles, blocking resets. - **Driver version mismatches**: Some NVML API functions (e.g., `nvmlDeviceSetMemClkVfOffset`) require specific driver versions. On nodes with older drivers, the agent may crash or behave unexpectedly. - **Redundant collection**: Many environments already collect GPU metrics via dedicated exporters like dcgm-exporter, making coroot's GPU metrics redundant. ## Usage The flag can be set via: - CLI: `--disable-gpu-monitoring` - Environment: `DISABLE_GPU_MONITORING=true` When the flag is set, the agent skips NVML initialization entirely and produces no GPU metrics (`node_gpu_*`, `container_gpu_*`). 
## Changes - `flags/flags.go`: Add `DisableGPUMonitoring` flag - `gpu/gpu.go`: Add `NewDisabledCollector()`, modify `NewCollector()` to accept disable parameter, add nil check in `Close()` - `main.go`: Pass flag value to GPU collector constructor - `install.sh`: Add `DISABLE_GPU_MONITORING` to allowed env vars --- flags/flags.go | 11 ++++++----- gpu/gpu.go | 16 +++++++++++++++- install.sh | 2 +- main.go | 8 ++++---- 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/flags/flags.go b/flags/flags.go index 23a7dd8..c80323f 100644 --- a/flags/flags.go +++ b/flags/flags.go @@ -9,11 +9,12 @@ import ( ) var ( - ListenAddress = kingpin.Flag("listen", "Listen address - ip:port or :port").Default("0.0.0.0:80").Envar("LISTEN").String() - CgroupRoot = kingpin.Flag("cgroupfs-root", "The mount point of the host cgroupfs root").Default("/sys/fs/cgroup").Envar("CGROUPFS_ROOT").String() - DisableLogParsing = kingpin.Flag("disable-log-parsing", "Disable container log parsing").Default("false").Envar("DISABLE_LOG_PARSING").Bool() - DisablePinger = kingpin.Flag("disable-pinger", "Don't ping upstreams").Default("false").Envar("DISABLE_PINGER").Bool() - DisableL7Tracing = kingpin.Flag("disable-l7-tracing", "Disable L7 tracing").Default("false").Envar("DISABLE_L7_TRACING").Bool() + ListenAddress = kingpin.Flag("listen", "Listen address - ip:port or :port").Default("0.0.0.0:80").Envar("LISTEN").String() + CgroupRoot = kingpin.Flag("cgroupfs-root", "The mount point of the host cgroupfs root").Default("/sys/fs/cgroup").Envar("CGROUPFS_ROOT").String() + DisableLogParsing = kingpin.Flag("disable-log-parsing", "Disable container log parsing").Default("false").Envar("DISABLE_LOG_PARSING").Bool() + DisablePinger = kingpin.Flag("disable-pinger", "Don't ping upstreams").Default("false").Envar("DISABLE_PINGER").Bool() + DisableL7Tracing = kingpin.Flag("disable-l7-tracing", "Disable L7 tracing").Default("false").Envar("DISABLE_L7_TRACING").Bool() + DisableGPUMonitoring = 
kingpin.Flag("disable-gpu-monitoring", "Disable GPU monitoring (NVML)").Default("false").Envar("DISABLE_GPU_MONITORING").Bool() ContainerAllowlist = kingpin.Flag("container-allowlist", "List of allowed containers (regex patterns)").Envar("CONTAINER_ALLOWLIST").Strings() ContainerDenylist = kingpin.Flag("container-denylist", "List of denied containers (regex patterns)").Envar("CONTAINER_DENYLIST").Strings() diff --git a/gpu/gpu.go b/gpu/gpu.go index f91d690..ce7bd90 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -87,7 +87,18 @@ type ProcessUsageSample struct { MemoryPercent uint32 } -func NewCollector() (*Collector, error) { +// NewDisabledCollector returns a collector that does not initialize NVML +// and collects no GPU metrics. Use this when GPU monitoring is disabled. +func NewDisabledCollector() *Collector { + return &Collector{ + ProcessUsageSampleCh: nil, + } +} + +func NewCollector(isDisabled bool) (*Collector, error) { + if isDisabled { + return NewDisabledCollector(), nil + } c := &Collector{ ProcessUsageSampleCh: make(chan ProcessUsageSample, 100), } @@ -227,6 +238,9 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) { } func (c *Collector) Close() { + if c.iface == nil { + return + } c.iface.Shutdown() } diff --git a/install.sh b/install.sh index 1b420b2..81735d1 100644 --- a/install.sh +++ b/install.sh @@ -16,7 +16,7 @@ SYSTEMD_SERVICE=${SYSTEM_NAME}.service UNINSTALL_SH=${BIN_DIR}/${SYSTEM_NAME}-uninstall.sh FILE_SERVICE=${SYSTEMD_DIR}/${SYSTEMD_SERVICE} FILE_ENV=${SYSTEMD_DIR}/${SYSTEMD_SERVICE}.env -ENV_VARS="^(LISTEN|CGROUPFS_ROOT|DISABLE_LOG_PARSING|DISABLE_PINGER|DISABLE_L7_TRACING|TRACK_PUBLIC_NETWORK|EPHEMERAL_PORT_RANGE|PROVIDER|REGION|AVAILABILITY_ZONE|INSTANCE_TYPE|INSTANCE_LIFE_CYCLE|LOG_PER_SECOND|LOG_BURST|COLLECTOR_ENDPOINT|API_KEY|METRICS_ENDPOINT|TRACES_ENDPOINT|LOGS_ENDPOINT|PROFILES_ENDPOINT|SCRAPE_INTERVAL|WAL_DIR)" 
+ENV_VARS="^(LISTEN|CGROUPFS_ROOT|DISABLE_LOG_PARSING|DISABLE_PINGER|DISABLE_L7_TRACING|DISABLE_GPU_MONITORING|TRACK_PUBLIC_NETWORK|EPHEMERAL_PORT_RANGE|PROVIDER|REGION|AVAILABILITY_ZONE|INSTANCE_TYPE|INSTANCE_LIFE_CYCLE|LOG_PER_SECOND|LOG_BURST|COLLECTOR_ENDPOINT|API_KEY|METRICS_ENDPOINT|TRACES_ENDPOINT|LOGS_ENDPOINT|PROFILES_ENDPOINT|SCRAPE_INTERVAL|WAL_DIR)" info() { diff --git a/main.go b/main.go index 9e7ca4e..e192a8b 100644 --- a/main.go +++ b/main.go @@ -143,10 +143,10 @@ func main() { klog.Exitln(err) } - gpuCollector, err := gpu.NewCollector() - if err != nil { - klog.Warningln("failed to initialize GPU collector:", err) - } + gpuCollector, err := gpu.NewCollector(*flags.DisableGPUMonitoring) + if err != nil { + klog.Warningln("failed to initialize GPU collector:", err) + } if err := registerer.Register(gpuCollector); err != nil { klog.Exitln(err) } From 207fc749985d5e498ac4b54d65db898f4d5aa996 Mon Sep 17 00:00:00 2001 From: dberkerdem Date: Sun, 1 Feb 2026 22:54:09 +0300 Subject: [PATCH 2/3] chore: fix indent --- main.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.go b/main.go index e192a8b..c575e38 100644 --- a/main.go +++ b/main.go @@ -144,9 +144,9 @@ func main() { } gpuCollector, err := gpu.NewCollector(*flags.DisableGPUMonitoring) - if err != nil { - klog.Warningln("failed to initialize GPU collector:", err) - } + if err != nil { + klog.Warningln("failed to initialize GPU collector:", err) + } if err := registerer.Register(gpuCollector); err != nil { klog.Exitln(err) } From 9a2454e33cb043cbb162f285e6b690bf9d87aff4 Mon Sep 17 00:00:00 2001 From: Nikolay Sivko Date: Mon, 2 Feb 2026 09:26:22 -0300 Subject: [PATCH 3/3] simplify code during review --- gpu/gpu.go | 18 +++++------------- main.go | 2 +- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index ce7bd90..232718b 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -12,6 +12,7 @@ import ( "time" "github.com/NVIDIA/go-nvml/pkg/nvml" + 
"github.com/coroot/coroot-node-agent/flags" "github.com/coroot/coroot-node-agent/proc" "github.com/prometheus/client_golang/prometheus" "k8s.io/klog/v2" @@ -87,22 +88,13 @@ type ProcessUsageSample struct { MemoryPercent uint32 } -// NewDisabledCollector returns a collector that does not initialize NVML -// and collects no GPU metrics. Use this when GPU monitoring is disabled. -func NewDisabledCollector() *Collector { - return &Collector{ - ProcessUsageSampleCh: nil, - } -} - -func NewCollector(isDisabled bool) (*Collector, error) { - if isDisabled { - return NewDisabledCollector(), nil - } +func NewCollector() (*Collector, error) { c := &Collector{ ProcessUsageSampleCh: make(chan ProcessUsageSample, 100), } - + if *flags.DisableGPUMonitoring { + return c, nil + } libPath, err := findNvidiaMLLib() if err != nil { klog.Infoln(err) diff --git a/main.go b/main.go index c575e38..9e7ca4e 100644 --- a/main.go +++ b/main.go @@ -143,7 +143,7 @@ func main() { klog.Exitln(err) } - gpuCollector, err := gpu.NewCollector(*flags.DisableGPUMonitoring) + gpuCollector, err := gpu.NewCollector() if err != nil { klog.Warningln("failed to initialize GPU collector:", err) }