Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions .config/nextest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,6 @@ fail-fast = true
filter = 'test(::single_threaded_tests::)'
test-group = 'single-threaded'

# test_crash_tracking_bin_sigchld_exec deadlocks on CentOS 7; cap it so it
# doesn't block the suite if someone runs it with --include-ignored.
[[profile.default.overrides]]
filter = 'test(test_crash_tracking_bin_sigchld_exec)'
slow-timeout = { period = "60s", terminate-after = 1 }

# Run prebuild script before bin_tests to build all artifacts upfront
[[profile.default.scripts]]
filter = 'package(bin_tests)'
Expand Down
14 changes: 1 addition & 13 deletions bin_tests/tests/crashtracker_bin_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,24 +39,12 @@ macro_rules! crash_tracking_tests {
};
}

// Ignored: deadlocks on CentOS 7 (see nextest.toml for timeout cap when run with
// --include-ignored).
#[test]
#[cfg_attr(miri, ignore)]
#[ignore = "deadlocks on CentOS 7 — tracking issue: SigChldExec handling"]
fn test_crash_tracking_bin_sigchld_exec() {
run_standard_crash_test_refactored(
BuildProfile::Debug,
TestMode::SigChldExec,
CrashType::NullDeref,
);
}

// Generate all simple crash tracking tests using the macro
crash_tracking_tests! {
(test_crash_tracking_bin_debug, BuildProfile::Debug, TestMode::DoNothing, CrashType::NullDeref),
(test_crash_tracking_bin_sigpipe, BuildProfile::Debug, TestMode::SigPipe, CrashType::NullDeref),
(test_crash_tracking_bin_sigchld, BuildProfile::Debug, TestMode::SigChld, CrashType::NullDeref),
(test_crash_tracking_bin_sigchld_exec, BuildProfile::Debug, TestMode::SigChldExec, CrashType::NullDeref),
(test_crash_tracking_bin_sigstack, BuildProfile::Release, TestMode::DoNothingSigStack, CrashType::NullDeref),
(test_crash_tracking_bin_sigpipe_sigstack, BuildProfile::Release, TestMode::SigPipeSigStack, CrashType::NullDeref),
(test_crash_tracking_bin_sigchld_sigstack, BuildProfile::Release, TestMode::SigChldSigStack, CrashType::NullDeref),
Expand Down
21 changes: 13 additions & 8 deletions libdd-crashtracker/src/collector/crash_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

use super::collector_manager::Collector;
use super::receiver_manager::Receiver;
use super::saguard::SaGuard;
use super::saguard::{SaGuard, SuppressionMode};
use super::signal_handler_manager::chain_signal_handler;
use crate::crash_info::Metadata;
use crate::shared::configuration::CrashtrackerConfiguration;
Expand Down Expand Up @@ -264,13 +264,18 @@ fn handle_posix_signal_impl(
return Ok(());
}

// Block SIGCHLD and SIGPIPE during crash handling. Our collector spawns child processes
// and writes to pipes, both of which can generate these signals. Rather than risk
// re-entering a signal handler or aborting due to SIGPIPE, suppress them for the
// duration and restore when the guard drops
let _sa_guard = SaGuard::new(&[
nix::sys::signal::Signal::SIGCHLD,
nix::sys::signal::Signal::SIGPIPE,
// Suppress SIGPIPE and defer SIGCHLD during crash handling.
// SIGCHLD is block-only because SIG_IGN changes child reaping semantics (waitpid/ECHILD),
// which can interfere with receiver/collector process cleanup.
let _sa_guard = SaGuard::new_with_modes(&[
(
nix::sys::signal::Signal::SIGCHLD,
SuppressionMode::BlockOnly,
),
(
nix::sys::signal::Signal::SIGPIPE,
SuppressionMode::IgnoreAndBlock,
),
]);

// Take config and metadata out of global storage.
Expand Down
23 changes: 19 additions & 4 deletions libdd-crashtracker/src/collector/process_handle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

use libdd_common::timeout::TimeoutManager;
use libdd_common::unix_utils::{reap_child_non_blocking, wait_for_pollhup};
use nix::errno::Errno;
use nix::unistd::Pid;
use std::os::unix::io::RawFd;

Expand All @@ -24,15 +25,29 @@ impl ProcessHandle {
// If we have less than the minimum amount of time, give ourselves a few scheduler
// slices worth of headroom to help guarantee that we don't leak a zombie process.
let kill_result = unsafe { libc::kill(pid, libc::SIGKILL) };
debug_assert_eq!(kill_result, 0, "kill failed with result: {kill_result}");
if kill_result != 0 {
let errno = Errno::last_raw();
if errno == libc::ESRCH {
// Child has already exited, which is fine; no action needed
} else {
eprintln!("Warning: kill({pid}, SIGKILL) failed with errno {errno}");
debug_assert_eq!(
errno,
libc::ESRCH,
"kill failed with result: {kill_result}, errno: {errno}"
);
}
}

// `self` is actually a handle to a child process and `self.pid` is the child process's
// pid.
let child_pid = Pid::from_raw(pid);
let result = reap_child_non_blocking(child_pid, timeout_manager);
debug_assert_eq!(
result,
Ok(true),
// Ok(true): child was successfully reaped
// Ok(false): child was already reaped (ECHILD) -> no zombie risk
// This can happen because we defer SIGCHLD during crash handling
debug_assert!(
matches!(result, Ok(true) | Ok(false)),
"reap_child_non_blocking failed: {result:?}"
);
}
Expand Down
161 changes: 119 additions & 42 deletions libdd-crashtracker/src/collector/saguard.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,38 @@
// SPDX-License-Identifier: Apache-2.0
use nix::sys::signal::{self, SaFlags, SigAction, SigHandler};

// Provides a lexically-scoped guard for signals
// During execution of the signal handler, it cannot be guaranteed that the signal is handled
// without SA_NODEFER, thus it also cannot be guaranteed that signals like SIGCHLD and SIGPIPE will
// _not_ be emitted during this handler as a result of the handler itself. At the same time, it
// isn't known whether it is safe to merely block all signals, as the user's own handler will be
// given the chance to execute after ours. Thus, we need to prevent the emission of signals we
// might create (and cannot be created during a signal handler except by our own execution) and
// defer any other signals.
// To put it another way, it is conceivable that the crash handling code will emit SIGCHLD or
// SIGPIPE, and instead of risking responding to those signals, it needs to suppress them. On the
// other hand, it can't just "block" (`sigprocmask()`) those signals because this will only defer
// them to the next handler.
// Provides a lexically-scoped guard for signal suppression.
//
// During crash handling we may generate signals such as SIGPIPE (pipe writes) and SIGCHLD
Comment thread
r1viollet marked this conversation as resolved.
// (fork/exec child lifecycle). We want to prevent re-entrant handling while preserving process
// semantics needed by cleanup code.
//
// This guard supports per-signal policy:
// - IgnoreAndBlock: block delivery and temporarily set disposition to SIG_IGN.
// - BlockOnly: block delivery while leaving disposition unchanged.
//
// In practice, SIGPIPE is usually IgnoreAndBlock, while SIGCHLD should usually be BlockOnly
// because SIG_IGN for SIGCHLD can change child-reaping semantics (waitpid/ECHILD behavior).
pub struct SaGuard<const N: usize> {
old_sigactions: [(signal::Signal, signal::SigAction); N],
old_sigactions: [(signal::Signal, Option<signal::SigAction>); N],
old_sigmask: signal::SigSet,
}

#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SuppressionMode {
/// Block delivery and set disposition to SIG_IGN while the guard is active.
IgnoreAndBlock,
/// Only block delivery while the guard is active
BlockOnly,
}

impl<const N: usize> SaGuard<N> {
pub fn new(signals: &[signal::Signal; N]) -> anyhow::Result<Self> {
pub fn new_with_modes(
signals: &[(signal::Signal, SuppressionMode); N],
) -> anyhow::Result<Self> {
// Create an empty signal set for suppressing signals
let mut suppressed_signals = signal::SigSet::empty();
for signal in signals {
for (signal, _) in signals {
suppressed_signals.add(*signal);
}

Expand All @@ -36,26 +46,22 @@ impl<const N: usize> SaGuard<N> {
)?;

// Initialize array for saving old signal actions
let mut old_sigactions = [(
signal::Signal::SIGINT,
SigAction::new(
SigHandler::SigDfl,
SaFlags::empty(),
signal::SigSet::empty(),
),
); N];

// Set SIG_IGN for the specified signals and save old handlers
for (i, &signal) in signals.iter().enumerate() {
let old_sigaction = unsafe {
signal::sigaction(
signal,
&SigAction::new(
SigHandler::SigIgn,
SaFlags::empty(),
signal::SigSet::empty(),
),
)?
let mut old_sigactions = [(signal::Signal::SIGINT, None); N];

// Set SIG_IGN for configured signals and save old handlers when disposition changes
for (i, &(signal, mode)) in signals.iter().enumerate() {
let old_sigaction = match mode {
SuppressionMode::IgnoreAndBlock => Some(unsafe {
signal::sigaction(
signal,
&SigAction::new(
SigHandler::SigIgn,
SaFlags::empty(),
signal::SigSet::empty(),
),
)?
}),
SuppressionMode::BlockOnly => None,
};
old_sigactions[i] = (signal, old_sigaction);
}
Expand All @@ -69,14 +75,17 @@ impl<const N: usize> SaGuard<N> {

impl<const N: usize> Drop for SaGuard<N> {
fn drop(&mut self) {
// Restore the original signal actions
// Restore the original signal actions first, before unblocking signals.
// This prevents a window where deferred signals could fire with the wrong handler.
for &(signal, old_sigaction) in &self.old_sigactions {
unsafe {
let _ = signal::sigaction(signal, &old_sigaction);
if let Some(old_sigaction) = old_sigaction {
unsafe {
let _ = signal::sigaction(signal, &old_sigaction);
}
}
}

// Restore the original signal mask
// Now restore the original signal mask, which will deliver any deferred signals
let _ = signal::sigprocmask(
signal::SigmaskHow::SIG_SETMASK,
Some(&self.old_sigmask),
Expand All @@ -91,11 +100,19 @@ mod single_threaded_tests {
use nix::sys::signal::{self, Signal};
use nix::unistd::Pid;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;

// These tests mutate global signal state, so we need to lock to avoid race conditions
// even in single-threaded mode, as signal state can persist between test runs
static SIGNAL_TEST_LOCK: Mutex<()> = Mutex::new(());

#[test]
#[cfg_attr(miri, ignore)]
fn signal_is_ignored_while_guard_is_active() {
let _guard = SaGuard::<1>::new(&[Signal::SIGUSR1]).unwrap();
let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
let _guard =
SaGuard::<1>::new_with_modes(&[(Signal::SIGUSR1, SuppressionMode::IgnoreAndBlock)])
.unwrap();

// Send SIGUSR1 to the process. The default action is to terminate, so if
// the guard didn't set SIG_IGN this test process would die
Expand All @@ -108,6 +125,7 @@ mod single_threaded_tests {
#[test]
#[cfg_attr(miri, ignore)]
fn original_handler_restored_after_drop() {
let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
static HANDLER_CALLED: AtomicBool = AtomicBool::new(false);

extern "C" fn custom_handler(_: libc::c_int) {
Expand All @@ -124,7 +142,9 @@ mod single_threaded_tests {

// Create then drop the guard (dropped when out of scope)
{
let _guard = SaGuard::<1>::new(&[Signal::SIGUSR2]).unwrap();
let _guard =
SaGuard::<1>::new_with_modes(&[(Signal::SIGUSR2, SuppressionMode::IgnoreAndBlock)])
.unwrap();
signal::kill(Pid::this(), Signal::SIGUSR2).unwrap();
assert!(
!HANDLER_CALLED.load(Ordering::SeqCst),
Expand All @@ -150,10 +170,67 @@ mod single_threaded_tests {
#[test]
#[cfg_attr(miri, ignore)]
fn multiple_signals_ignored() {
let _guard = SaGuard::<2>::new(&[Signal::SIGUSR1, Signal::SIGUSR2]).unwrap();
let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
let _guard = SaGuard::<2>::new_with_modes(&[
(Signal::SIGUSR1, SuppressionMode::IgnoreAndBlock),
(Signal::SIGUSR2, SuppressionMode::IgnoreAndBlock),
])
.unwrap();

// Both signals should be safely ignored
signal::kill(Pid::this(), Signal::SIGUSR1).unwrap();
signal::kill(Pid::this(), Signal::SIGUSR2).unwrap();
}

#[test]
#[cfg_attr(miri, ignore)]
fn block_only_defers_signal_delivery() -> anyhow::Result<()> {
let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
static SIGUSR1_COUNT: AtomicBool = AtomicBool::new(false);

extern "C" fn sigusr1_handler(_: libc::c_int) {
SIGUSR1_COUNT.store(true, Ordering::SeqCst);
}

let sig = Signal::SIGUSR1;

// Install a known handler and save the previous one so we can restore it
let old_action = unsafe {
signal::sigaction(
sig,
&SigAction::new(
SigHandler::Handler(sigusr1_handler),
SaFlags::empty(),
signal::SigSet::empty(),
),
)?
};

// Reset handler state
SIGUSR1_COUNT.store(false, Ordering::SeqCst);

{
let _guard = SaGuard::<1>::new_with_modes(&[(sig, SuppressionMode::BlockOnly)])?;

// Send SIGUSR1 to ourselves while it is blocked
signal::raise(sig)?;

// Because the signal is blocked, the handler should not have run yet
assert!(
!SIGUSR1_COUNT.load(Ordering::SeqCst),
"Handler should not be called while signal is blocked by BlockOnly guard"
);
} // guard drops here; old mask is restored, SIGUSR1 should now be delivered
// After unblocking, the signal should be handled
assert!(
SIGUSR1_COUNT.load(Ordering::SeqCst),
"Handler should be called after BlockOnly guard drops and pending signal is delivered"
);
// Restore the prev disposition
unsafe {
signal::sigaction(sig, &old_action)?;
}

Ok(())
}
}
Loading