DataDog · gh-worker-dd-mergequeue-cf854d · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026
@@ -25,12 +25,6 @@ fail-fast = true
 filter = 'test(::single_threaded_tests::)'
 test-group = 'single-threaded'
 
-# test_crash_tracking_bin_sigchld_exec deadlocks on CentOS 7; cap it so it
-# doesn't block the suite if someone runs it with --include-ignored.
-[[profile.default.overrides]]
-filter = 'test(test_crash_tracking_bin_sigchld_exec)'
-slow-timeout = { period = "60s", terminate-after = 1 }
-
 # Run prebuild script before bin_tests to build all artifacts upfront
 [[profile.default.scripts]]
 filter = 'package(bin_tests)'

@@ -39,24 +39,12 @@ macro_rules! crash_tracking_tests {
     };
 }
 
-// Ignored: deadlocks on CentOS 7 (see nextest.toml for timeout cap when run with
-// --include-ignored).
-#[test]
-#[cfg_attr(miri, ignore)]
-#[ignore = "deadlocks on CentOS 7 — tracking issue: SigChldExec handling"]
-fn test_crash_tracking_bin_sigchld_exec() {
-    run_standard_crash_test_refactored(
-        BuildProfile::Debug,
-        TestMode::SigChldExec,
-        CrashType::NullDeref,
-    );
-}
-
 // Generate all simple crash tracking tests using the macro
 crash_tracking_tests! {
     (test_crash_tracking_bin_debug, BuildProfile::Debug, TestMode::DoNothing, CrashType::NullDeref),
     (test_crash_tracking_bin_sigpipe, BuildProfile::Debug, TestMode::SigPipe, CrashType::NullDeref),
     (test_crash_tracking_bin_sigchld, BuildProfile::Debug, TestMode::SigChld, CrashType::NullDeref),
+    (test_crash_tracking_bin_sigchld_exec, BuildProfile::Debug, TestMode::SigChldExec, CrashType::NullDeref),
     (test_crash_tracking_bin_sigstack, BuildProfile::Release, TestMode::DoNothingSigStack, CrashType::NullDeref),
     (test_crash_tracking_bin_sigpipe_sigstack, BuildProfile::Release, TestMode::SigPipeSigStack, CrashType::NullDeref),
     (test_crash_tracking_bin_sigchld_sigstack, BuildProfile::Release, TestMode::SigChldSigStack, CrashType::NullDeref),

@@ -5,7 +5,7 @@
 
 use super::collector_manager::Collector;
 use super::receiver_manager::Receiver;
-use super::saguard::SaGuard;
+use super::saguard::{SaGuard, SuppressionMode};
 use super::signal_handler_manager::chain_signal_handler;
 use crate::crash_info::Metadata;
 use crate::shared::configuration::CrashtrackerConfiguration;
@@ -264,13 +264,18 @@ fn handle_posix_signal_impl(
         return Ok(());
     }
 
-    // Block SIGCHLD and SIGPIPE during crash handling. Our collector spawns child processes
-    // and writes to pipes, both of which can generate these signals. Rather than risk
-    // re-entering a signal handler or aborting due to SIGPIPE, suppress them for the
-    // duration and restore when the guard drops
-    let _sa_guard = SaGuard::new(&[
-        nix::sys::signal::Signal::SIGCHLD,
-        nix::sys::signal::Signal::SIGPIPE,
+    // Suppress SIGPIPE and defer SIGCHLD during crash handling.
+    // SIGCHLD is block-only because SIG_IGN changes child reaping semantics (waitpid/ECHILD),
+    // which can interfere with receiver/collector process cleanup.
+    let _sa_guard = SaGuard::new_with_modes(&[
+        (
+            nix::sys::signal::Signal::SIGCHLD,
+            SuppressionMode::BlockOnly,
+        ),
+        (
+            nix::sys::signal::Signal::SIGPIPE,
+            SuppressionMode::IgnoreAndBlock,
+        ),
     ]);
 
     // Take config and metadata out of global storage.

@@ -3,6 +3,7 @@
 
 use libdd_common::timeout::TimeoutManager;
 use libdd_common::unix_utils::{reap_child_non_blocking, wait_for_pollhup};
+use nix::errno::Errno;
 use nix::unistd::Pid;
 use std::os::unix::io::RawFd;
 
@@ -24,15 +25,29 @@ impl ProcessHandle {
             // If we have less than the minimum amount of time, give ourselves a few scheduler
             // slices worth of headroom to help guarantee that we don't leak a zombie process.
             let kill_result = unsafe { libc::kill(pid, libc::SIGKILL) };
-            debug_assert_eq!(kill_result, 0, "kill failed with result: {kill_result}");
+            if kill_result != 0 {
+                let errno = Errno::last_raw();
+                if errno == libc::ESRCH {
+                    // Child has already exited, which is fine; no action needed
+                } else {
+                    eprintln!("Warning: kill({pid}, SIGKILL) failed with errno {errno}");
+                    debug_assert_eq!(
+                        errno,
+                        libc::ESRCH,
+                        "kill failed with result: {kill_result}, errno: {errno}"
+                    );
+                }
+            }
 
             // `self` is actually a handle to a child process and `self.pid` is the child process's
             // pid.
             let child_pid = Pid::from_raw(pid);
             let result = reap_child_non_blocking(child_pid, timeout_manager);
-            debug_assert_eq!(
-                result,
-                Ok(true),
+            // Ok(true):  child was successfully reaped
+            // Ok(false): child was already reaped (ECHILD) -> no zombie risk
+            //            This can happen because we defer SIGCHLD during crash handling
+            debug_assert!(
+                matches!(result, Ok(true) | Ok(false)),
                 "reap_child_non_blocking failed: {result:?}"
             );
         }

@@ -2,28 +2,38 @@
 // SPDX-License-Identifier: Apache-2.0
 use nix::sys::signal::{self, SaFlags, SigAction, SigHandler};
 
-// Provides a lexically-scoped guard for signals
-// During execution of the signal handler, it cannot be guaranteed that the signal is handled
-// without SA_NODEFER, thus it also cannot be guaranteed that signals like SIGCHLD and SIGPIPE will
-// _not_ be emitted during this handler as a result of the handler itself. At the same time, it
-// isn't known whether it is safe to merely block all signals, as the user's own handler will be
-// given the chance to execute after ours. Thus, we need to prevent the emission of signals we
-// might create (and cannot be created during a signal handler except by our own execution) and
-// defer any other signals.
-// To put it another way, it is conceivable that the crash handling code will emit SIGCHLD or
-// SIGPIPE, and instead of risking responding to those signals, it needs to suppress them. On the
-// other hand, it can't just "block" (`sigprocmask()`) those signals because this will only defer
-// them to the next handler.
+// Provides a lexically-scoped guard for signal suppression.
+//
+// During crash handling we may generate signals such as SIGPIPE (pipe writes) and SIGCHLD
+// (fork/exec child lifecycle). We want to prevent re-entrant handling while preserving process
+// semantics needed by cleanup code.
+//
+// This guard supports per-signal policy:
+// - IgnoreAndBlock: block delivery and temporarily set disposition to SIG_IGN.
+// - BlockOnly: block delivery while leaving disposition unchanged.
+//
+// In practice, SIGPIPE is usually IgnoreAndBlock, while SIGCHLD should usually be BlockOnly
+// because SIG_IGN for SIGCHLD can change child-reaping semantics (waitpid/ECHILD behavior).
 pub struct SaGuard<const N: usize> {
-    old_sigactions: [(signal::Signal, signal::SigAction); N],
+    old_sigactions: [(signal::Signal, Option<signal::SigAction>); N],
     old_sigmask: signal::SigSet,
 }
 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum SuppressionMode {
+    /// Block delivery and set disposition to SIG_IGN while the guard is active.
+    IgnoreAndBlock,
+    /// Only block delivery while the guard is active
+    BlockOnly,
+}
+
 impl<const N: usize> SaGuard<N> {
-    pub fn new(signals: &[signal::Signal; N]) -> anyhow::Result<Self> {
+    pub fn new_with_modes(
+        signals: &[(signal::Signal, SuppressionMode); N],
+    ) -> anyhow::Result<Self> {
         // Create an empty signal set for suppressing signals
         let mut suppressed_signals = signal::SigSet::empty();
-        for signal in signals {
+        for (signal, _) in signals {
             suppressed_signals.add(*signal);
         }
 
@@ -36,26 +46,22 @@ impl<const N: usize> SaGuard<N> {
         )?;
 
         // Initialize array for saving old signal actions
-        let mut old_sigactions = [(
-            signal::Signal::SIGINT,
-            SigAction::new(
-                SigHandler::SigDfl,
-                SaFlags::empty(),
-                signal::SigSet::empty(),
-            ),
-        ); N];
-
-        // Set SIG_IGN for the specified signals and save old handlers
-        for (i, &signal) in signals.iter().enumerate() {
-            let old_sigaction = unsafe {
-                signal::sigaction(
-                    signal,
-                    &SigAction::new(
-                        SigHandler::SigIgn,
-                        SaFlags::empty(),
-                        signal::SigSet::empty(),
-                    ),
-                )?
+        let mut old_sigactions = [(signal::Signal::SIGINT, None); N];
+
+        // Set SIG_IGN for configured signals and save old handlers when disposition changes
+        for (i, &(signal, mode)) in signals.iter().enumerate() {
+            let old_sigaction = match mode {
+                SuppressionMode::IgnoreAndBlock => Some(unsafe {
+                    signal::sigaction(
+                        signal,
+                        &SigAction::new(
+                            SigHandler::SigIgn,
+                            SaFlags::empty(),
+                            signal::SigSet::empty(),
+                        ),
+                    )?
+                }),
+                SuppressionMode::BlockOnly => None,
             };
             old_sigactions[i] = (signal, old_sigaction);
         }
@@ -69,14 +75,17 @@ impl<const N: usize> SaGuard<N> {
 
 impl<const N: usize> Drop for SaGuard<N> {
     fn drop(&mut self) {
-        // Restore the original signal actions
+        // Restore the original signal actions first, before unblocking signals.
+        // This prevents a window where deferred signals could fire with the wrong handler.
         for &(signal, old_sigaction) in &self.old_sigactions {
-            unsafe {
-                let _ = signal::sigaction(signal, &old_sigaction);
+            if let Some(old_sigaction) = old_sigaction {
+                unsafe {
+                    let _ = signal::sigaction(signal, &old_sigaction);
+                }
             }
         }
 
-        // Restore the original signal mask
+        // Now restore the original signal mask, which will deliver any deferred signals
         let _ = signal::sigprocmask(
             signal::SigmaskHow::SIG_SETMASK,
             Some(&self.old_sigmask),
@@ -91,11 +100,19 @@ mod single_threaded_tests {
     use nix::sys::signal::{self, Signal};
     use nix::unistd::Pid;
     use std::sync::atomic::{AtomicBool, Ordering};
+    use std::sync::Mutex;
+
+    // These tests mutate global signal state, so we need to lock to avoid race conditions
+    // even in single-threaded mode, as signal state can persist between test runs
+    static SIGNAL_TEST_LOCK: Mutex<()> = Mutex::new(());
 
     #[test]
     #[cfg_attr(miri, ignore)]
     fn signal_is_ignored_while_guard_is_active() {
-        let _guard = SaGuard::<1>::new(&[Signal::SIGUSR1]).unwrap();
+        let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
+        let _guard =
+            SaGuard::<1>::new_with_modes(&[(Signal::SIGUSR1, SuppressionMode::IgnoreAndBlock)])
+                .unwrap();
 
         // Send SIGUSR1 to the process. The default action is to terminate, so if
         // the guard didn't set SIG_IGN this test process would die
@@ -108,6 +125,7 @@ mod single_threaded_tests {
     #[test]
     #[cfg_attr(miri, ignore)]
     fn original_handler_restored_after_drop() {
+        let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
         static HANDLER_CALLED: AtomicBool = AtomicBool::new(false);
 
         extern "C" fn custom_handler(_: libc::c_int) {
@@ -124,7 +142,9 @@ mod single_threaded_tests {
 
         // Create then drop the guard (dropped when out of scope)
         {
-            let _guard = SaGuard::<1>::new(&[Signal::SIGUSR2]).unwrap();
+            let _guard =
+                SaGuard::<1>::new_with_modes(&[(Signal::SIGUSR2, SuppressionMode::IgnoreAndBlock)])
+                    .unwrap();
             signal::kill(Pid::this(), Signal::SIGUSR2).unwrap();
             assert!(
                 !HANDLER_CALLED.load(Ordering::SeqCst),
@@ -150,10 +170,67 @@ mod single_threaded_tests {
     #[test]
     #[cfg_attr(miri, ignore)]
     fn multiple_signals_ignored() {
-        let _guard = SaGuard::<2>::new(&[Signal::SIGUSR1, Signal::SIGUSR2]).unwrap();
+        let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
+        let _guard = SaGuard::<2>::new_with_modes(&[
+            (Signal::SIGUSR1, SuppressionMode::IgnoreAndBlock),
+            (Signal::SIGUSR2, SuppressionMode::IgnoreAndBlock),
+        ])
+        .unwrap();
 
         // Both signals should be safely ignored
         signal::kill(Pid::this(), Signal::SIGUSR1).unwrap();
         signal::kill(Pid::this(), Signal::SIGUSR2).unwrap();
     }
+
+    #[test]
+    #[cfg_attr(miri, ignore)]
+    fn block_only_defers_signal_delivery() -> anyhow::Result<()> {
+        let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
+        static SIGUSR1_COUNT: AtomicBool = AtomicBool::new(false);
+
+        extern "C" fn sigusr1_handler(_: libc::c_int) {
+            SIGUSR1_COUNT.store(true, Ordering::SeqCst);
+        }
+
+        let sig = Signal::SIGUSR1;
+
+        // Install a known handler and save the previous one so we can restore it
+        let old_action = unsafe {
+            signal::sigaction(
+                sig,
+                &SigAction::new(
+                    SigHandler::Handler(sigusr1_handler),
+                    SaFlags::empty(),
+                    signal::SigSet::empty(),
+                ),
+            )?
+        };
+
+        // Reset handler state
+        SIGUSR1_COUNT.store(false, Ordering::SeqCst);
+
+        {
+            let _guard = SaGuard::<1>::new_with_modes(&[(sig, SuppressionMode::BlockOnly)])?;
+
+            // Send SIGUSR1 to ourselves while it is blocked
+            signal::raise(sig)?;
+
+            // Because the signal is blocked, the handler should not have run yet
+            assert!(
+                !SIGUSR1_COUNT.load(Ordering::SeqCst),
+                "Handler should not be called while signal is blocked by BlockOnly guard"
+            );
+        } // guard drops here; old mask is restored, SIGUSR1 should now be delivered
+          // After unblocking, the signal should be handled
+        assert!(
+            SIGUSR1_COUNT.load(Ordering::SeqCst),
+            "Handler should be called after BlockOnly guard drops and pending signal is delivered"
+        );
+        // Restore the prev disposition
+        unsafe {
+            signal::sigaction(sig, &old_action)?;
+        }
+
+        Ok(())
+    }
 }