scx_pandemonium/watchdog.rs
1// MONITOR-LOOP WATCHDOG: ABORTS IF THE ADAPTIVE CONTROL LOOP OR BPF-ONLY
2// TELEMETRY LOOP FAILS TO ADVANCE ITS HEARTBEAT WITHIN THE TIMEOUT. PROTECTS
3// AGAINST HUNG LIBBPF MAP OPERATIONS (KERNEL STALL, VERIFIER RELOAD, PERCPU
4// CONTENTION) THAT WOULD SILENTLY STOP KNOB UPDATES AND TELEMETRY.
5//
6// ABORT BYPASSES PROCDB SAVE INTENTIONALLY: A STALLED MONITOR LOOP MEANS
7// BPF STATE IS WEDGED, AND WE PREFER KERNEL WATCHDOG TAKEOVER OVER LIMPING.
8
9use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
10use std::time::Duration;
11
/// HEARTBEAT WORD SAMPLED BY THE WATCHDOG THREAD STARTED IN `spawn`; STARTS AT 0.
/// THE WATCHDOG ABORTS THE PROCESS WHEN TWO CONSECUTIVE SAMPLES (ONE TIMEOUT
/// APART) ARE EQUAL, SO A MONITORED LOOP MUST CHANGE THIS VALUE AT LEAST ONCE
/// PER TIMEOUT. NOTE(REVIEW): WRITERS ARE NOT VISIBLE HERE; PRESUMABLY EACH
/// LOOP ITERATION DOES A `fetch_add` -- CONFIRM AGAINST THE CALLERS.
pub static LOOP_HEARTBEAT: AtomicU64 = AtomicU64::new(0);
13
14pub fn spawn(shutdown: &'static AtomicBool, timeout: Duration) {
15 std::thread::Builder::new()
16 .name("pand-watchdog".into())
17 .spawn(move || {
18 let mut last = LOOP_HEARTBEAT.load(Ordering::Relaxed);
19 loop {
20 std::thread::sleep(timeout);
21 if shutdown.load(Ordering::Relaxed) {
22 return;
23 }
24 let cur = LOOP_HEARTBEAT.load(Ordering::Relaxed);
25 if cur == last {
26 eprintln!(
27 "[WATCHDOG] monitor loop stalled for >{:?}; aborting",
28 timeout
29 );
30 std::process::abort();
31 }
32 last = cur;
33 }
34 })
35 .expect("watchdog thread spawn");
36}