Skip to content

Commit a6f17a9

Browse files
committed
fix: zygote pattern for concurrent exec deadlock + exit code propagation (#349)
Route clone3() and waitpid through a single-threaded zygote process to fix three issues with concurrent container exec: 1. **musl __malloc_lock deadlock**: clone3() from tokio spawn_blocking inherits locked __malloc_lock, deadlocking the child on first malloc. Zygote forks before tokio starts, keeping clone3() single-threaded. 2. **Exit code loss (ECHILD)**: Container processes are children of the zygote, not main. Added Wait IPC to route waitpid through zygote. 3. **Concurrent wait deadlock**: Blocking waitpid held zygote Mutex for process lifetime, blocking all other waits and builds. Added WNOHANG polling (10ms) so each IPC round-trip holds Mutex for ~microseconds. Key changes: - guest/src/container/zygote.rs: New zygote process with tagged IPC protocol (ZygoteRequest/ZygoteResponse for Build + Wait) - guest/src/service/exec/state.rs: Replace direct waitpid with WNOHANG polling loop via ZYGOTE.wait() - guest/src/main.rs: Start zygote before tokio runtime - docs/investigations/concurrent-exec-deadlock.md: Root cause analysis - 28 integration tests covering all scenarios
1 parent 2a3b304 commit a6f17a9

File tree

19 files changed

+3531
-166
lines changed

19 files changed

+3531
-166
lines changed

boxlite/build.rs

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -829,29 +829,34 @@ fn main() {
829829
/// Compute SHA256 hash of the `boxlite-guest` binary and embed it via `cargo:rustc-env`.
830830
///
831831
/// Search order:
832-
/// 1. `runtime_dir` (OUT_DIR/runtime/ — for prebuilt mode)
833-
/// 2. `target/boxlite-runtime/boxlite-guest` (assembled by `make runtime-debug`)
832+
/// 1. Direct build output: `target/{target-triple}/{profile}/boxlite-guest`
833+
/// 2. `runtime_dir` (OUT_DIR/runtime/ — for prebuilt mode)
834+
///
835+
/// IMPORTANT: The source binary (direct build output) must be checked FIRST because
836+
/// `compute_guest_hash` runs before `generate()` in main(). The OUT_DIR/runtime/ copy
837+
/// is placed there by a previous build's `generate()` and may be stale when the guest
838+
/// binary has been rebuilt since then.
834839
///
835840
/// If the binary isn't found, silently skips — runtime will compute the hash as fallback.
836841
fn compute_guest_hash(runtime_dir: &Path) {
837842
let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
838-
839-
let candidates = [
840-
runtime_dir.join("boxlite-guest"),
841-
manifest_dir
842-
.parent()
843-
.map(|root| root.join("target/boxlite-runtime/boxlite-guest"))
844-
.unwrap_or_default(),
845-
];
846-
847-
let guest_path = candidates.iter().find(|p| p.is_file());
843+
let profile = env::var("PROFILE").unwrap_or_else(|_| "debug".to_string());
844+
845+
// Check source binary first (direct build output) to avoid stale OUT_DIR copies.
846+
let workspace_root = manifest_dir.parent().unwrap_or(&manifest_dir);
847+
let guest_path =
848+
EmbeddedManifest::find_prebuilt_guest(workspace_root, &profile).or_else(|| {
849+
// Fallback: OUT_DIR copy (for prebuilt/CI mode where source isn't available)
850+
let p = runtime_dir.join("boxlite-guest");
851+
p.is_file().then_some(p)
852+
});
848853

849854
let Some(guest_path) = guest_path else {
850855
println!("cargo:warning=boxlite-guest not found, skipping compile-time hash");
851856
return;
852857
};
853858

854-
match sha256_file(guest_path) {
859+
match sha256_file(&guest_path) {
855860
Ok(hash) => {
856861
println!("cargo:rustc-env=BOXLITE_GUEST_HASH={}", hash);
857862
println!("cargo:rerun-if-changed={}", guest_path.display());

boxlite/src/metrics/box_metrics.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ pub struct BoxMetricsStorage {
1717
/// Bytes received from this box (via stdout/stderr)
1818
pub(crate) bytes_received: AtomicU64,
1919

20+
// Per-command timing metrics (monotonic, updated per command)
21+
/// Sum of all command durations (milliseconds)
22+
pub(crate) exec_duration_total_ms: AtomicU64,
23+
/// Maximum single command duration (milliseconds)
24+
pub(crate) exec_max_duration_ms: AtomicU64,
25+
2026
// Timing metrics (set once, never change)
2127
/// Total time from create() call to LiteBox ready (includes all stages)
2228
pub(crate) total_create_duration_ms: Option<u128>,
@@ -45,6 +51,10 @@ impl Clone for BoxMetricsStorage {
4551
exec_errors: AtomicU64::new(self.exec_errors.load(Ordering::Relaxed)),
4652
bytes_sent: AtomicU64::new(self.bytes_sent.load(Ordering::Relaxed)),
4753
bytes_received: AtomicU64::new(self.bytes_received.load(Ordering::Relaxed)),
54+
exec_duration_total_ms: AtomicU64::new(
55+
self.exec_duration_total_ms.load(Ordering::Relaxed),
56+
),
57+
exec_max_duration_ms: AtomicU64::new(self.exec_max_duration_ms.load(Ordering::Relaxed)),
4858
total_create_duration_ms: self.total_create_duration_ms,
4959
guest_boot_duration_ms: self.guest_boot_duration_ms,
5060
stage_filesystem_setup_ms: self.stage_filesystem_setup_ms,
@@ -140,6 +150,15 @@ impl BoxMetricsStorage {
140150
pub(crate) fn add_bytes_received(&self, bytes: u64) {
141151
self.bytes_received.fetch_add(bytes, Ordering::Relaxed);
142152
}
153+
154+
/// Record a command duration: adds to total and updates max.
155+
#[allow(dead_code)]
156+
pub(crate) fn record_exec_duration(&self, duration_ms: u64) {
157+
self.exec_duration_total_ms
158+
.fetch_add(duration_ms, Ordering::Relaxed);
159+
self.exec_max_duration_ms
160+
.fetch_max(duration_ms, Ordering::Relaxed);
161+
}
143162
}
144163

145164
/// Handle for querying per-box metrics.
@@ -156,6 +175,10 @@ pub struct BoxMetrics {
156175
pub bytes_sent_total: u64,
157176
/// Bytes received from this box (via stdout/stderr)
158177
pub bytes_received_total: u64,
178+
/// Sum of all command durations (milliseconds, monotonic)
179+
pub exec_duration_total_ms: u64,
180+
/// Maximum single command duration (milliseconds)
181+
pub exec_max_duration_ms: u64,
159182
/// Total time from create() call to LiteBox ready (milliseconds)
160183
pub total_create_duration_ms: Option<u128>,
161184
/// Time from box subprocess spawn to guest agent ready (milliseconds)
@@ -204,6 +227,8 @@ impl BoxMetrics {
204227
exec_errors_total: storage.exec_errors.load(Ordering::Relaxed),
205228
bytes_sent_total: storage.bytes_sent.load(Ordering::Relaxed),
206229
bytes_received_total: storage.bytes_received.load(Ordering::Relaxed),
230+
exec_duration_total_ms: storage.exec_duration_total_ms.load(Ordering::Relaxed),
231+
exec_max_duration_ms: storage.exec_max_duration_ms.load(Ordering::Relaxed),
207232
total_create_duration_ms: storage.total_create_duration_ms,
208233
guest_boot_duration_ms: storage.guest_boot_duration_ms,
209234
cpu_percent,
@@ -251,6 +276,20 @@ impl BoxMetrics {
251276
self.bytes_received_total
252277
}
253278

279+
/// Sum of all command durations (milliseconds).
280+
///
281+
/// Never decreases (monotonic counter).
282+
pub fn exec_duration_total_ms(&self) -> u64 {
283+
self.exec_duration_total_ms
284+
}
285+
286+
/// Maximum single command duration (milliseconds).
287+
///
288+
/// Tracks worst-case exec latency.
289+
pub fn exec_max_duration_ms(&self) -> u64 {
290+
self.exec_max_duration_ms
291+
}
292+
254293
/// Total time from create() call to box ready (milliseconds).
255294
///
256295
/// Includes all initialization stages: filesystem setup, image pull,

boxlite/src/portal/interfaces/exec.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ impl ExecutionInterface {
5555
// Build request
5656
let request = ExecProtocol::build_exec_request(&command);
5757

58-
tracing::debug!(?command, "Starting execution");
58+
tracing::debug!(command = %command.command, "exec RPC: sending request");
5959

6060
// Start execution
6161
let exec_response = self.client.exec(request).await?.into_inner();
@@ -68,6 +68,8 @@ impl ExecutionInterface {
6868

6969
let execution_id = exec_response.execution_id.clone();
7070

71+
tracing::debug!(execution_id = %execution_id, "spawning background streams");
72+
7173
// Spawn stdin pump (cancellable — exits cleanly during shutdown)
7274
ExecProtocol::spawn_stdin(
7375
self.client.clone(),
@@ -257,15 +259,15 @@ impl ExecProtocol {
257259
let response = tokio::select! {
258260
biased;
259261
_ = shutdown_token.cancelled() => {
260-
tracing::debug!(execution_id = %execution_id, "Attach cancelled during connect");
262+
tracing::debug!(execution_id = %execution_id, "attach cancelled during connect");
261263
return;
262264
}
263265
result = client.attach(request) => result,
264266
};
265267

266268
match response {
267269
Ok(response) => {
268-
tracing::debug!(execution_id = %execution_id, "Attach stream connected");
270+
tracing::debug!(execution_id = %execution_id, "attach stream connected");
269271
let mut stream = response.into_inner();
270272
let mut message_count = 0u64;
271273

@@ -351,6 +353,8 @@ impl ExecProtocol {
351353
execution_id: execution_id.clone(),
352354
};
353355

356+
tracing::debug!(execution_id = %execution_id, "wait: sending request");
357+
354358
// Use select! to handle cancellation during wait
355359
let result = tokio::select! {
356360
biased;
@@ -391,6 +395,7 @@ impl ExecProtocol {
391395
shutdown_token: CancellationToken,
392396
) {
393397
tokio::spawn(async move {
398+
tracing::debug!(execution_id = %execution_id, "stdin: starting stream");
394399
let (tx, rx) = mpsc::channel::<ExecStdin>(8);
395400

396401
// Producer: forward stdin channel into tonic stream

boxlite/src/rest/litebox.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,8 @@ fn box_metrics_from_response(resp: &BoxMetricsResponse) -> BoxMetrics {
632632
exec_errors_total: resp.exec_errors_total,
633633
bytes_sent_total: resp.bytes_sent_total,
634634
bytes_received_total: resp.bytes_received_total,
635+
exec_duration_total_ms: resp.exec_duration_total_ms.unwrap_or(0),
636+
exec_max_duration_ms: resp.exec_max_duration_ms.unwrap_or(0),
635637
total_create_duration_ms: total_create_ms,
636638
guest_boot_duration_ms: guest_boot_ms,
637639
cpu_percent: resp.cpu_percent,

boxlite/src/rest/types.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,10 @@ pub(crate) struct BoxMetricsResponse {
349349
pub bytes_sent_total: u64,
350350
#[serde(default)]
351351
pub bytes_received_total: u64,
352+
#[serde(default)]
353+
pub exec_duration_total_ms: Option<u64>,
354+
#[serde(default)]
355+
pub exec_max_duration_ms: Option<u64>,
352356
pub cpu_percent: Option<f32>,
353357
pub memory_bytes: Option<u64>,
354358
pub network_bytes_sent: Option<u64>,

boxlite/src/runtime/embedded.rs

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,12 @@ impl EmbeddedRuntime {
106106
Self::set_permissions(&path, name)?;
107107
}
108108

109+
// macOS: sign shim with hypervisor entitlement after extraction.
110+
// The embedded bytes are from an unsigned build artifact; the
111+
// Hypervisor.framework requires com.apple.security.hypervisor.
112+
#[cfg(target_os = "macos")]
113+
Self::sign_shim(&tmp)?;
114+
109115
// Stamp marks extraction as complete — checked by the fast path above.
110116
std::fs::write(tmp.join(".complete"), crate::VERSION)
111117
.map_err(|e| BoxliteError::Storage(format!("write stamp: {}", e)))?;
@@ -210,6 +216,59 @@ impl EmbeddedRuntime {
210216
BoxliteError::Storage(format!("chmod {:o} {}: {}", mode, path.display(), e))
211217
})
212218
}
219+
220+
/// Sign boxlite-shim with hypervisor entitlement (macOS only).
221+
///
222+
/// The embedded shim bytes come from an unsigned build artifact.
223+
/// macOS Hypervisor.framework requires `com.apple.security.hypervisor`
224+
/// entitlement, so we ad-hoc sign after extraction.
225+
#[cfg(target_os = "macos")]
226+
fn sign_shim(dir: &Path) -> BoxliteResult<()> {
227+
let shim = dir.join("boxlite-shim");
228+
if !shim.exists() {
229+
return Ok(());
230+
}
231+
232+
let entitlements = "\
233+
<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\
234+
<!DOCTYPE plist PUBLIC \"-//Apple//DTD PLIST 1.0//EN\" \"http://www.apple.com/DTDs/PropertyList-1.0.dtd\">\n\
235+
<plist version=\"1.0\">\n\
236+
<dict>\n\
237+
\t<key>com.apple.security.hypervisor</key>\n\
238+
\t<true/>\n\
239+
\t<key>com.apple.security.cs.disable-library-validation</key>\n\
240+
\t<true/>\n\
241+
</dict>\n\
242+
</plist>";
243+
244+
// Write entitlements to a temp file
245+
let ent_path = dir.join(".entitlements.plist");
246+
std::fs::write(&ent_path, entitlements)
247+
.map_err(|e| BoxliteError::Storage(format!("write entitlements: {}", e)))?;
248+
249+
let output = std::process::Command::new("codesign")
250+
.args(["-s", "-", "--force", "--entitlements"])
251+
.arg(&ent_path)
252+
.arg(&shim)
253+
.output()
254+
.map_err(|e| BoxliteError::Storage(format!("codesign: {}", e)))?;
255+
256+
// Clean up entitlements file
257+
let _ = std::fs::remove_file(&ent_path);
258+
259+
if !output.status.success() {
260+
let stderr = String::from_utf8_lossy(&output.stderr);
261+
tracing::warn!(
262+
shim = %shim.display(),
263+
stderr = %stderr,
264+
"Failed to sign extracted shim (non-fatal on Linux)"
265+
);
266+
} else {
267+
tracing::info!(shim = %shim.display(), "Signed extracted shim with hypervisor entitlement");
268+
}
269+
270+
Ok(())
271+
}
213272
}
214273

215274
#[cfg(test)]

0 commit comments

Comments
 (0)