Skip to main content

scx_pandemonium/
watchdog.rs

// MONITOR-LOOP WATCHDOG: ABORTS IF THE ADAPTIVE CONTROL LOOP OR BPF-ONLY
// TELEMETRY LOOP FAILS TO ADVANCE ITS HEARTBEAT WITHIN THE TIMEOUT. PROTECTS
// AGAINST HUNG LIBBPF MAP OPERATIONS (KERNEL STALL, VERIFIER RELOAD, PERCPU
// CONTENTION) THAT WOULD SILENTLY STOP KNOB UPDATES AND TELEMETRY.
//
// ABORT BYPASSES PROCDB SAVE INTENTIONALLY: A STALLED MONITOR LOOP MEANS
// BPF STATE IS WEDGED, AND WE PREFER KERNEL WATCHDOG TAKEOVER OVER LIMPING.
8
9use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
10use std::time::Duration;
11
// PROCESS-WIDE HEARTBEAT COUNTER, INITIALISED TO 0. THE WATCHDOG THREAD
// STARTED BY `spawn` SAMPLES IT WITH RELAXED LOADS AND ABORTS THE PROCESS
// WHEN TWO CONSECUTIVE SAMPLES ARE EQUAL. ONLY A CHANGE IN VALUE MATTERS,
// NOT THE VALUE ITSELF.
// NOTE(REVIEW): THE WRITERS ARE NOT VISIBLE IN THIS FILE; PRESUMABLY THE
// MONITOR LOOPS BUMP IT ONCE PER ITERATION -- CONFIRM AGAINST THE CALLERS.
12pub static LOOP_HEARTBEAT: AtomicU64 = AtomicU64::new(0);
13
14pub fn spawn(shutdown: &'static AtomicBool, timeout: Duration) {
15    std::thread::Builder::new()
16        .name("pand-watchdog".into())
17        .spawn(move || {
18            let mut last = LOOP_HEARTBEAT.load(Ordering::Relaxed);
19            loop {
20                std::thread::sleep(timeout);
21                if shutdown.load(Ordering::Relaxed) {
22                    return;
23                }
24                let cur = LOOP_HEARTBEAT.load(Ordering::Relaxed);
25                if cur == last {
26                    eprintln!(
27                        "[WATCHDOG] monitor loop stalled for >{:?}; aborting",
28                        timeout
29                    );
30                    std::process::abort();
31                }
32                last = cur;
33            }
34        })
35        .expect("watchdog thread spawn");
36}