From 9fe1aca7b42e2dffe746258d8f63f49f56c7cee1 Mon Sep 17 00:00:00 2001 From: Lars Erik Wik Date: Mon, 18 May 2026 15:16:49 +0200 Subject: [PATCH 1/2] Fixed daemon hang on SIGTERM during child process wait ShellCommandReturnsZero retried waitpid() unconditionally on EINTR, so daemons (cf-serverd, cf-execd, cf-monitord) blocked waiting for a child process -- such as cf-promises during policy validation -- stayed unresponsive to SIGTERM until the child finished. The signal handler set PENDING_TERMINATION but the main loop never got control back to check it. Now, when waitpid is interrupted and termination is pending, the child is stopped via ProcessSignalTerminate (SIGINT -> SIGTERM -> SIGKILL) and reaped, so the daemon's main loop can exit promptly. Ticket: ENT-13720 Changelog: Title Signed-off-by: Lars Erik Wik (cherry picked from commit 243a10f6bd590925f690a530349be309c87299af) --- libpromises/unix.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/libpromises/unix.c b/libpromises/unix.c index c8edb69c85..b78ccb8c58 100644 --- a/libpromises/unix.c +++ b/libpromises/unix.c @@ -25,6 +25,7 @@ #include #include #include +#include /* IsPendingTermination() */ #include /* StringToInt64() */ #ifdef HAVE_SYS_UIO_H @@ -229,6 +230,23 @@ bool ShellCommandReturnsZero(const char *command, ShellType shell) { return false; } + /* A daemon received a terminating signal while we are blocked on + * waitpid(). Stop the child so we can return control to the main + * loop, otherwise the daemon would stay unresponsive to SIGTERM + * for as long as the child runs (e.g. a stuck cf-promises during + * policy validation). */ + if (IsPendingTermination()) + { + Log(LOG_LEVEL_VERBOSE, + "Termination pending; aborting child '%s' (pid %jd)", + command, (intmax_t) pid); + ProcessSignalTerminate(pid); + while (waitpid(pid, &status, 0) < 0 && errno == EINTR) + { + /* Child has been signalled; just reap it. */ + } + return false; + } } return (WEXITSTATUS(status) == 0); From 5de6907b1e3ba29e98b20dab8df4f7437950ee6d Mon Sep 17 00:00:00 2001 From: Lars Erik Wik Date: Mon, 18 May 2026 18:32:24 +0200 Subject: [PATCH 2/2] Poll waitpid(WNOHANG) so daemons can react to SIGTERM The previous attempt checked IsPendingTermination() in the EINTR branch of the blocking waitpid() loop, but that branch is never reached: signal() on Linux/glibc installs handlers with SA_RESTART, so the kernel transparently restarts waitpid() after the handler runs and the userspace EINTR check never fires. The daemon stays blocked in waitpid() until the child exits on its own, which is the exact symptom we set out to fix. Poll the child with waitpid(WNOHANG) instead, so we get control back between iterations and can react to PENDING_TERMINATION regardless of whether the signal interrupts the syscall. nanosleep() between polls keeps the loop from busy-spinning; since it is never restarted across signals, SIGTERM wakes us up promptly and the 100 ms interval is only an upper bound on idle wakeup latency. References (Linux man-pages 6.9.1): signal(2): "By default, in glibc 2 and later, the signal() wrapper function does not invoke the kernel system call. Instead, it calls sigaction(2) using flags that supply BSD semantics. [...] The BSD semantics are equivalent to calling sigaction(2) with the following flags: sa.sa_flags = SA_RESTART;" signal(7), "Interruption of system calls and library functions by signal handlers": "If a blocked call to one of the following interfaces is interrupted by a signal handler, then the call is automatically restarted after the signal handler returns if the SA_RESTART flag was used; otherwise the call fails with the error EINTR: [...] wait(2), wait3(2), wait4(2), waitid(2), and waitpid(2)." signal(7), same section: "The following interfaces are never restarted after being interrupted by a signal handler, regardless of the use of SA_RESTART; they always fail with the error EINTR when interrupted by a signal handler: [...] Sleep interfaces: clock_nanosleep(2), nanosleep(2), and usleep(3)." Ticket: ENT-13720 Signed-off-by: Lars Erik Wik (cherry picked from commit ec2627ec983be8dc233cba3b7e38ee7cd0fefe77) --- libpromises/unix.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/libpromises/unix.c b/libpromises/unix.c index b78ccb8c58..8fc26dd118 100644 --- a/libpromises/unix.c +++ b/libpromises/unix.c @@ -224,17 +224,31 @@ bool ShellCommandReturnsZero(const char *command, ShellType shell) { ALARM_PID = pid; - while (waitpid(pid, &status, 0) < 0) + /* Poll for the child instead of blocking in waitpid(). signal() on + * Linux/glibc installs handlers with SA_RESTART, so SIGTERM does not + * interrupt a blocking waitpid() and PENDING_TERMINATION is never + * observed until the child exits on its own. With WNOHANG we get + * control back between iterations and can react to a pending + * termination (e.g. a stuck cf-promises during policy validation + * keeping the daemon unresponsive to SIGTERM). nanosleep() is + * interruptible regardless of SA_RESTART, so SIGTERM wakes us up + * promptly. */ + while (true) { - if (errno != EINTR) + pid_t wait_result = waitpid(pid, &status, WNOHANG); + if (wait_result == pid) { + break; /* child exited and was reaped */ + } + if (wait_result < 0) + { + if (errno == EINTR) + { + continue; + } return false; } - /* A daemon received a terminating signal while we are blocked on - * waitpid(). Stop the child so we can return control to the main - * loop, otherwise the daemon would stay unresponsive to SIGTERM - * for as long as the child runs (e.g. a stuck cf-promises during - * policy validation). */ + /* wait_result == 0: child is still running */ if (IsPendingTermination()) { Log(LOG_LEVEL_VERBOSE, @@ -247,6 +261,11 @@ bool ShellCommandReturnsZero(const char *command, ShellType shell) } return false; } + struct timespec poll_interval = { + .tv_sec = 0, + .tv_nsec = 100000000 /* 100 ms */ + }; + nanosleep(&poll_interval, NULL); } return (WEXITSTATUS(status) == 0);