Skip to content

Commit 9e2b7b9

Browse files
authored
fix(crashtracking): fix SIGCHLD signal guarding while in CT signal handler (#1807)
# What does this PR do? This PR fixes a regression where SaGuard suppressed `SIGCHLD `with `SIG_IGN`, which can alter child reaping behavior (`waitpid`/`ECHILD`) and break `sigchld_exec` crash-test flows Changes 1. Add per-signal suppression modes in `SaGuard` 2. Use split policy in crash handling 3. Loosen up restirctions in child cleanup in `ProcessHandle::finish` 4. serialize signal-global unit tests to avoid race condition modifying global state for saguard unit tests # Motivation Context: We were originally not blocking any signals. In [fix(crashtracking): guard sigchld and sigpipe during crashtracker signal handler execution](#1771), we started blocking sigchld and sigpipe. However, blocking sigchld causes deadlocks for the following test [test(crashtracking): skip sigchld_exec test on CentOS 7 deadlock](#1804) This change works because it prevents interference with the application's expected signal handling while still protecting the crash handler from re-entrant signals # Additional Notes # How to test the change? `cargo nextest run -p libdd-crashtracker --lib -- collector::saguard::single_threaded_tests` `cargo nextest run -p bin_tests test_crash_tracking_bin_sigchld_exec` Co-authored-by: gyuheon.oh <[email protected]>
1 parent 54cdf99 commit 9e2b7b9

5 files changed

Lines changed: 152 additions & 73 deletions

File tree

.config/nextest.toml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,6 @@ fail-fast = true
2525
filter = 'test(::single_threaded_tests::)'
2626
test-group = 'single-threaded'
2727

28-
# test_crash_tracking_bin_sigchld_exec deadlocks on CentOS 7; cap it so it
29-
# doesn't block the suite if someone runs it with --include-ignored.
30-
[[profile.default.overrides]]
31-
filter = 'test(test_crash_tracking_bin_sigchld_exec)'
32-
slow-timeout = { period = "60s", terminate-after = 1 }
33-
3428
# Run prebuild script before bin_tests to build all artifacts upfront
3529
[[profile.default.scripts]]
3630
filter = 'package(bin_tests)'

bin_tests/tests/crashtracker_bin_test.rs

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -39,24 +39,12 @@ macro_rules! crash_tracking_tests {
3939
};
4040
}
4141

42-
// Ignored: deadlocks on CentOS 7 (see nextest.toml for timeout cap when run with
43-
// --include-ignored).
44-
#[test]
45-
#[cfg_attr(miri, ignore)]
46-
#[ignore = "deadlocks on CentOS 7 — tracking issue: SigChldExec handling"]
47-
fn test_crash_tracking_bin_sigchld_exec() {
48-
run_standard_crash_test_refactored(
49-
BuildProfile::Debug,
50-
TestMode::SigChldExec,
51-
CrashType::NullDeref,
52-
);
53-
}
54-
5542
// Generate all simple crash tracking tests using the macro
5643
crash_tracking_tests! {
5744
(test_crash_tracking_bin_debug, BuildProfile::Debug, TestMode::DoNothing, CrashType::NullDeref),
5845
(test_crash_tracking_bin_sigpipe, BuildProfile::Debug, TestMode::SigPipe, CrashType::NullDeref),
5946
(test_crash_tracking_bin_sigchld, BuildProfile::Debug, TestMode::SigChld, CrashType::NullDeref),
47+
(test_crash_tracking_bin_sigchld_exec, BuildProfile::Debug, TestMode::SigChldExec, CrashType::NullDeref),
6048
(test_crash_tracking_bin_sigstack, BuildProfile::Release, TestMode::DoNothingSigStack, CrashType::NullDeref),
6149
(test_crash_tracking_bin_sigpipe_sigstack, BuildProfile::Release, TestMode::SigPipeSigStack, CrashType::NullDeref),
6250
(test_crash_tracking_bin_sigchld_sigstack, BuildProfile::Release, TestMode::SigChldSigStack, CrashType::NullDeref),

libdd-crashtracker/src/collector/crash_handler.rs

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
use super::collector_manager::Collector;
77
use super::receiver_manager::Receiver;
8-
use super::saguard::SaGuard;
8+
use super::saguard::{SaGuard, SuppressionMode};
99
use super::signal_handler_manager::chain_signal_handler;
1010
use crate::crash_info::Metadata;
1111
use crate::shared::configuration::CrashtrackerConfiguration;
@@ -264,13 +264,18 @@ fn handle_posix_signal_impl(
264264
return Ok(());
265265
}
266266

267-
// Block SIGCHLD and SIGPIPE during crash handling. Our collector spawns child processes
268-
// and writes to pipes, both of which can generate these signals. Rather than risk
269-
// re-entering a signal handler or aborting due to SIGPIPE, suppress them for the
270-
// duration and restore when the guard drops
271-
let _sa_guard = SaGuard::new(&[
272-
nix::sys::signal::Signal::SIGCHLD,
273-
nix::sys::signal::Signal::SIGPIPE,
267+
// Suppress SIGPIPE and defer SIGCHLD during crash handling.
268+
// SIGCHLD is block-only because SIG_IGN changes child reaping semantics (waitpid/ECHILD),
269+
// which can interfere with receiver/collector process cleanup.
270+
let _sa_guard = SaGuard::new_with_modes(&[
271+
(
272+
nix::sys::signal::Signal::SIGCHLD,
273+
SuppressionMode::BlockOnly,
274+
),
275+
(
276+
nix::sys::signal::Signal::SIGPIPE,
277+
SuppressionMode::IgnoreAndBlock,
278+
),
274279
]);
275280

276281
// Take config and metadata out of global storage.

libdd-crashtracker/src/collector/process_handle.rs

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
use libdd_common::timeout::TimeoutManager;
55
use libdd_common::unix_utils::{reap_child_non_blocking, wait_for_pollhup};
6+
use nix::errno::Errno;
67
use nix::unistd::Pid;
78
use std::os::unix::io::RawFd;
89

@@ -24,15 +25,29 @@ impl ProcessHandle {
2425
// If we have less than the minimum amount of time, give ourselves a few scheduler
2526
// slices worth of headroom to help guarantee that we don't leak a zombie process.
2627
let kill_result = unsafe { libc::kill(pid, libc::SIGKILL) };
27-
debug_assert_eq!(kill_result, 0, "kill failed with result: {kill_result}");
28+
if kill_result != 0 {
29+
let errno = Errno::last_raw();
30+
if errno == libc::ESRCH {
31+
// Child has already exited, which is fine; no action needed
32+
} else {
33+
eprintln!("Warning: kill({pid}, SIGKILL) failed with errno {errno}");
34+
debug_assert_eq!(
35+
errno,
36+
libc::ESRCH,
37+
"kill failed with result: {kill_result}, errno: {errno}"
38+
);
39+
}
40+
}
2841

2942
// `self` is actually a handle to a child process and `self.pid` is the child process's
3043
// pid.
3144
let child_pid = Pid::from_raw(pid);
3245
let result = reap_child_non_blocking(child_pid, timeout_manager);
33-
debug_assert_eq!(
34-
result,
35-
Ok(true),
46+
// Ok(true): child was successfully reaped
47+
// Ok(false): child was already reaped (ECHILD) -> no zombie risk
48+
// This can happen because we defer SIGCHLD during crash handling
49+
debug_assert!(
50+
matches!(result, Ok(true) | Ok(false)),
3651
"reap_child_non_blocking failed: {result:?}"
3752
);
3853
}

libdd-crashtracker/src/collector/saguard.rs

Lines changed: 119 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2,28 +2,38 @@
22
// SPDX-License-Identifier: Apache-2.0
33
use nix::sys::signal::{self, SaFlags, SigAction, SigHandler};
44

5-
// Provides a lexically-scoped guard for signals
6-
// During execution of the signal handler, it cannot be guaranteed that the signal is handled
7-
// without SA_NODEFER, thus it also cannot be guaranteed that signals like SIGCHLD and SIGPIPE will
8-
// _not_ be emitted during this handler as a result of the handler itself. At the same time, it
9-
// isn't known whether it is safe to merely block all signals, as the user's own handler will be
10-
// given the chance to execute after ours. Thus, we need to prevent the emission of signals we
11-
// might create (and cannot be created during a signal handler except by our own execution) and
12-
// defer any other signals.
13-
// To put it another way, it is conceivable that the crash handling code will emit SIGCHLD or
14-
// SIGPIPE, and instead of risking responding to those signals, it needs to suppress them. On the
15-
// other hand, it can't just "block" (`sigprocmask()`) those signals because this will only defer
16-
// them to the next handler.
5+
// Provides a lexically-scoped guard for signal suppression.
6+
//
7+
// During crash handling we may generate signals such as SIGPIPE (pipe writes) and SIGCHLD
8+
// (fork/exec child lifecycle). We want to prevent re-entrant handling while preserving process
9+
// semantics needed by cleanup code.
10+
//
11+
// This guard supports per-signal policy:
12+
// - IgnoreAndBlock: block delivery and temporarily set disposition to SIG_IGN.
13+
// - BlockOnly: block delivery while leaving disposition unchanged.
14+
//
15+
// In practice, SIGPIPE is usually IgnoreAndBlock, while SIGCHLD should usually be BlockOnly
16+
// because SIG_IGN for SIGCHLD can change child-reaping semantics (waitpid/ECHILD behavior).
1717
pub struct SaGuard<const N: usize> {
18-
old_sigactions: [(signal::Signal, signal::SigAction); N],
18+
old_sigactions: [(signal::Signal, Option<signal::SigAction>); N],
1919
old_sigmask: signal::SigSet,
2020
}
2121

22+
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
23+
pub enum SuppressionMode {
24+
/// Block delivery and set disposition to SIG_IGN while the guard is active.
25+
IgnoreAndBlock,
26+
/// Only block delivery while the guard is active
27+
BlockOnly,
28+
}
29+
2230
impl<const N: usize> SaGuard<N> {
23-
pub fn new(signals: &[signal::Signal; N]) -> anyhow::Result<Self> {
31+
pub fn new_with_modes(
32+
signals: &[(signal::Signal, SuppressionMode); N],
33+
) -> anyhow::Result<Self> {
2434
// Create an empty signal set for suppressing signals
2535
let mut suppressed_signals = signal::SigSet::empty();
26-
for signal in signals {
36+
for (signal, _) in signals {
2737
suppressed_signals.add(*signal);
2838
}
2939

@@ -36,26 +46,22 @@ impl<const N: usize> SaGuard<N> {
3646
)?;
3747

3848
// Initialize array for saving old signal actions
39-
let mut old_sigactions = [(
40-
signal::Signal::SIGINT,
41-
SigAction::new(
42-
SigHandler::SigDfl,
43-
SaFlags::empty(),
44-
signal::SigSet::empty(),
45-
),
46-
); N];
47-
48-
// Set SIG_IGN for the specified signals and save old handlers
49-
for (i, &signal) in signals.iter().enumerate() {
50-
let old_sigaction = unsafe {
51-
signal::sigaction(
52-
signal,
53-
&SigAction::new(
54-
SigHandler::SigIgn,
55-
SaFlags::empty(),
56-
signal::SigSet::empty(),
57-
),
58-
)?
49+
let mut old_sigactions = [(signal::Signal::SIGINT, None); N];
50+
51+
// Set SIG_IGN for configured signals and save old handlers when disposition changes
52+
for (i, &(signal, mode)) in signals.iter().enumerate() {
53+
let old_sigaction = match mode {
54+
SuppressionMode::IgnoreAndBlock => Some(unsafe {
55+
signal::sigaction(
56+
signal,
57+
&SigAction::new(
58+
SigHandler::SigIgn,
59+
SaFlags::empty(),
60+
signal::SigSet::empty(),
61+
),
62+
)?
63+
}),
64+
SuppressionMode::BlockOnly => None,
5965
};
6066
old_sigactions[i] = (signal, old_sigaction);
6167
}
@@ -69,14 +75,17 @@ impl<const N: usize> SaGuard<N> {
6975

7076
impl<const N: usize> Drop for SaGuard<N> {
7177
fn drop(&mut self) {
72-
// Restore the original signal actions
78+
// Restore the original signal actions first, before unblocking signals.
79+
// This prevents a window where deferred signals could fire with the wrong handler.
7380
for &(signal, old_sigaction) in &self.old_sigactions {
74-
unsafe {
75-
let _ = signal::sigaction(signal, &old_sigaction);
81+
if let Some(old_sigaction) = old_sigaction {
82+
unsafe {
83+
let _ = signal::sigaction(signal, &old_sigaction);
84+
}
7685
}
7786
}
7887

79-
// Restore the original signal mask
88+
// Now restore the original signal mask, which will deliver any deferred signals
8089
let _ = signal::sigprocmask(
8190
signal::SigmaskHow::SIG_SETMASK,
8291
Some(&self.old_sigmask),
@@ -91,11 +100,19 @@ mod single_threaded_tests {
91100
use nix::sys::signal::{self, Signal};
92101
use nix::unistd::Pid;
93102
use std::sync::atomic::{AtomicBool, Ordering};
103+
use std::sync::Mutex;
104+
105+
// These tests mutate global signal state, so we need to lock to avoid race conditions
106+
// even in single-threaded mode, as signal state can persist between test runs
107+
static SIGNAL_TEST_LOCK: Mutex<()> = Mutex::new(());
94108

95109
#[test]
96110
#[cfg_attr(miri, ignore)]
97111
fn signal_is_ignored_while_guard_is_active() {
98-
let _guard = SaGuard::<1>::new(&[Signal::SIGUSR1]).unwrap();
112+
let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
113+
let _guard =
114+
SaGuard::<1>::new_with_modes(&[(Signal::SIGUSR1, SuppressionMode::IgnoreAndBlock)])
115+
.unwrap();
99116

100117
// Send SIGUSR1 to the process. The default action is to terminate, so if
101118
// the guard didn't set SIG_IGN this test process would die
@@ -108,6 +125,7 @@ mod single_threaded_tests {
108125
#[test]
109126
#[cfg_attr(miri, ignore)]
110127
fn original_handler_restored_after_drop() {
128+
let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
111129
static HANDLER_CALLED: AtomicBool = AtomicBool::new(false);
112130

113131
extern "C" fn custom_handler(_: libc::c_int) {
@@ -124,7 +142,9 @@ mod single_threaded_tests {
124142

125143
// Create then drop the guard (dropped when out of scope)
126144
{
127-
let _guard = SaGuard::<1>::new(&[Signal::SIGUSR2]).unwrap();
145+
let _guard =
146+
SaGuard::<1>::new_with_modes(&[(Signal::SIGUSR2, SuppressionMode::IgnoreAndBlock)])
147+
.unwrap();
128148
signal::kill(Pid::this(), Signal::SIGUSR2).unwrap();
129149
assert!(
130150
!HANDLER_CALLED.load(Ordering::SeqCst),
@@ -150,10 +170,67 @@ mod single_threaded_tests {
150170
#[test]
151171
#[cfg_attr(miri, ignore)]
152172
fn multiple_signals_ignored() {
153-
let _guard = SaGuard::<2>::new(&[Signal::SIGUSR1, Signal::SIGUSR2]).unwrap();
173+
let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
174+
let _guard = SaGuard::<2>::new_with_modes(&[
175+
(Signal::SIGUSR1, SuppressionMode::IgnoreAndBlock),
176+
(Signal::SIGUSR2, SuppressionMode::IgnoreAndBlock),
177+
])
178+
.unwrap();
154179

155180
// Both signals should be safely ignored
156181
signal::kill(Pid::this(), Signal::SIGUSR1).unwrap();
157182
signal::kill(Pid::this(), Signal::SIGUSR2).unwrap();
158183
}
184+
185+
#[test]
186+
#[cfg_attr(miri, ignore)]
187+
fn block_only_defers_signal_delivery() -> anyhow::Result<()> {
188+
let _test_lock = SIGNAL_TEST_LOCK.lock().unwrap();
189+
static SIGUSR1_COUNT: AtomicBool = AtomicBool::new(false);
190+
191+
extern "C" fn sigusr1_handler(_: libc::c_int) {
192+
SIGUSR1_COUNT.store(true, Ordering::SeqCst);
193+
}
194+
195+
let sig = Signal::SIGUSR1;
196+
197+
// Install a known handler and save the previous one so we can restore it
198+
let old_action = unsafe {
199+
signal::sigaction(
200+
sig,
201+
&SigAction::new(
202+
SigHandler::Handler(sigusr1_handler),
203+
SaFlags::empty(),
204+
signal::SigSet::empty(),
205+
),
206+
)?
207+
};
208+
209+
// Reset handler state
210+
SIGUSR1_COUNT.store(false, Ordering::SeqCst);
211+
212+
{
213+
let _guard = SaGuard::<1>::new_with_modes(&[(sig, SuppressionMode::BlockOnly)])?;
214+
215+
// Send SIGUSR1 to ourselves while it is blocked
216+
signal::raise(sig)?;
217+
218+
// Because the signal is blocked, the handler should not have run yet
219+
assert!(
220+
!SIGUSR1_COUNT.load(Ordering::SeqCst),
221+
"Handler should not be called while signal is blocked by BlockOnly guard"
222+
);
223+
} // guard drops here; old mask is restored, SIGUSR1 should now be delivered
224+
// After unblocking, the signal should be handled
225+
assert!(
226+
SIGUSR1_COUNT.load(Ordering::SeqCst),
227+
"Handler should be called after BlockOnly guard drops and pending signal is delivered"
228+
);
229+
// Restore the prev disposition
230+
unsafe {
231+
signal::sigaction(sig, &old_action)?;
232+
}
233+
234+
Ok(())
235+
}
159236
}

0 commit comments

Comments
 (0)