Skip to main content

scx_pandemonium/
main.rs

1// PANDEMONIUM -- SCHED_EXT KERNEL SCHEDULER
2// ADAPTIVE DESKTOP SCHEDULING FOR LINUX
3//
4// SCHEDULING DECISIONS HAPPEN IN BPF (ZERO KERNEL-USERSPACE ROUND TRIPS)
5// RUST USERSPACE HANDLES: ADAPTIVE CONTROL LOOP, MONITORING, BENCHMARKING
6
7#[allow(non_upper_case_globals)]
8#[allow(non_camel_case_types)]
9#[allow(non_snake_case)]
10#[allow(dead_code)]
11mod bpf_skel;
12
13mod bpf_intf;
14
15#[macro_use]
16mod log;
17mod adaptive;
18mod cli;
19mod procdb;
20mod scheduler;
21mod topology;
22mod tuning;
23mod watchdog;
24
25use std::mem::MaybeUninit;
26use std::sync::atomic::{AtomicBool, Ordering};
27use std::time::Duration;
28
29use anyhow::Result;
30use clap::{Parser, Subcommand};
31
32use scheduler::Scheduler;
33use scx_utils::build_id;
34
// PROCESS-WIDE SHUTDOWN FLAG. SET BY THE CTRL+C HANDLER INSTALLED IN
// run_scheduler, POLLED BY THE BPF-ONLY STATS LOOP, AND ALSO HANDED TO
// watchdog::spawn AND adaptive::monitor_loop.
static SHUTDOWN: AtomicBool = AtomicBool::new(false);
36
37#[derive(Parser)]
38#[command(name = "scx_pandemonium")]
39#[command(
40    version,
41    disable_version_flag = true,
42    about = "PANDEMONIUM -- ADAPTIVE LINUX SCHEDULER"
43)]
44struct Cli {
45    #[command(subcommand)]
46    command: Option<SubCmd>,
47
48    #[arg(short, long)]
49    verbose: bool,
50
51    /// Print scheduler version and exit.
52    #[arg(long)]
53    version: bool,
54
55    /// Internal: dump in-memory ring log on shutdown
56    #[arg(long, hide = true)]
57    dump_log: bool,
58
59    /// Internal: override CPU count for scaling formulas (test harness use)
60    #[arg(long, hide = true)]
61    nr_cpus: Option<u64>,
62
63    /// Run BPF scheduler only, disable Rust adaptive control loop
64    #[arg(long)]
65    no_adaptive: bool,
66
67    /// Additional compositor process names to boost to LAT_CRITICAL
68    #[arg(long)]
69    compositor: Vec<String>,
70}
71
// HIDDEN SUBCOMMANDS. main DISPATCHES Probe TO cli::probe::run_probe AND
// StressWorker TO cli::stress::run_stress_worker; WITH NO SUBCOMMAND THE
// SCHEDULER ITSELF IS STARTED.
#[derive(Subcommand)]
enum SubCmd {
    /// Internal: interactive wakeup probe (Python test harness use)
    #[command(hide = true)]
    Probe,

    /// Internal: CPU-pinned stress worker (Python test harness use)
    #[command(hide = true)]
    StressWorker(StressWorkerArgs),
}
82
// ARGUMENTS CARRIED BY SubCmd::StressWorker; main FORWARDS `cpu` TO
// cli::stress::run_stress_worker.
#[derive(Parser)]
struct StressWorkerArgs {
    /// CPU to pin the stress worker to
    #[arg(long)]
    cpu: u32,
}
89
90fn main() -> Result<()> {
91    let cli = Cli::parse();
92
93    let verbose = cli.verbose;
94    let dump_log = cli.dump_log;
95    let nr_cpus = cli.nr_cpus;
96    let no_adaptive = cli.no_adaptive;
97    let extra_compositors = cli.compositor;
98
99    if cli.version {
100        println!(
101            "scx_pandemonium {}",
102            build_id::full_version(env!("CARGO_PKG_VERSION"))
103        );
104        return Ok(());
105    }
106
107    match cli.command {
108        None => run_scheduler(verbose, dump_log, nr_cpus, no_adaptive, &extra_compositors),
109        Some(SubCmd::Probe) => {
110            cli::probe::run_probe();
111            Ok(())
112        }
113        Some(SubCmd::StressWorker(args)) => {
114            cli::stress::run_stress_worker(args.cpu);
115            Ok(())
116        }
117    }
118}
119
// DEFAULT COMPOSITORS: BOOSTED TO LAT_CRITICAL VIA BPF MAP LOOKUP.
// run_scheduler WRITES EACH NAME WITH sched.write_compositor() ON EVERY
// SCHEDULER (RE)INIT, FOLLOWED BY ANY NAMES GIVEN VIA --compositor.
const DEFAULT_COMPOSITORS: &[&str] = &[
    "kwin",
    "gnome-shell",
    "mutter",
    "sway",
    "Hyprland",
    "picom",
    "weston",
    "labwc",
    "wayfire",
    "niri",
];
133
134fn run_scheduler(
135    verbose: bool,
136    dump_log: bool,
137    nr_cpus: Option<u64>,
138    no_adaptive: bool,
139    extra_compositors: &[String],
140) -> Result<()> {
141    ctrlc::set_handler(move || {
142        SHUTDOWN.store(true, Ordering::Relaxed);
143    })?;
144
145    // WATCHDOG: ABORTS IF THE CONTROL LOOP STALLS FOR MORE THAN 10 SECONDS.
146    // LIBBPF MAP OPERATIONS CAN HANG ON KERNEL STALL / VERIFIER RELOAD /
147    // PERCPU CONTENTION; WITHOUT THIS, TELEMETRY AND KNOB WRITES STOP SILENTLY.
148    watchdog::spawn(&SHUTDOWN, Duration::from_secs(10));
149
150    let nr_cpus_display =
151        nr_cpus.unwrap_or_else(|| libbpf_rs::num_possible_cpus().unwrap_or(1) as u64);
152    let governor = std::fs::read_to_string("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor")
153        .unwrap_or_default()
154        .trim()
155        .to_string();
156
157    let smt_on = std::fs::read_to_string("/sys/devices/system/cpu/smt/active")
158        .map(|s| s.trim() == "1")
159        .unwrap_or(false);
160
161    log_info!(
162        "scx_pandemonium {} SMT {}",
163        build_id::full_version(env!("CARGO_PKG_VERSION")),
164        if smt_on { "on" } else { "off" }
165    );
166    log_info!(
167        "CPUS: {} (governor: {})",
168        nr_cpus_display,
169        if governor.is_empty() {
170            "unknown"
171        } else {
172            &governor
173        }
174    );
175    log_info!("VERBOSE: {}", verbose);
176
177    let mut is_restart = false;
178    loop {
179        // ON RESTART, WAIT FOR KERNEL STRUCT_OPS CLEANUP.
180        // DETACH IS ASYNCHRONOUS -- UNDER HEAVY LOAD (12C SATURATED),
181        // THE KERNEL NEEDS TIME TO FULLY UNREGISTER THE OLD SCHEDULER.
182        if is_restart {
183            std::thread::sleep(Duration::from_secs(2));
184        }
185
186        let mut open_object = MaybeUninit::uninit();
187        let mut sched = Scheduler::init(&mut open_object, nr_cpus)?;
188
189        // POPULATE CACHE TOPOLOGY MAP AT STARTUP
190        match topology::CpuTopology::detect(nr_cpus_display as usize) {
191            Ok(topo) => {
192                topo.log_summary();
193                if let Err(e) = topo.populate_bpf_map(&sched) {
194                    log_warn!("CACHE TOPOLOGY MAP WRITE FAILED: {}", e);
195                }
196                if let Err(e) = topo.populate_l2_siblings_map(&sched) {
197                    log_warn!("L2 SIBLINGS MAP WRITE FAILED: {}", e);
198                }
199                // RESISTANCE AFFINITY: COMPUTE R_EFF VIA LAPLACIAN PSEUDOINVERSE
200                // AND POPULATE BPF AFFINITY RANK MAP. SPECTRUM CARRIES lambda_2
201                // AND tau_ns FOR UNIVERSAL TOPOLOGY-DERIVED SCALING.
202                let (reff, rank, spectrum) = topo.compute_resistance_affinity();
203                topo.log_resistance_affinity(&reff, &rank, spectrum);
204                if let Err(e) = topo.populate_affinity_rank_map(&sched, &rank) {
205                    log_warn!("AFFINITY RANK MAP WRITE FAILED: {}", e);
206                }
207                // WRITE tau_ns + codel_eq_ns INTO tuning_knobs. BPF'S tick() ON
208                // CPU 0 PICKS THESE UP AND DERIVES THE TAU-SCALED TIMING STATICS
209                // AND THE R_eff-DERIVED CODEL EQUILIBRIUM TARGET.
210                if let Err(e) = sched.write_topology_fields(spectrum.tau_ns, spectrum.codel_eq_ns) {
211                    log_warn!("TOPOLOGY KNOB WRITE FAILED: {}", e);
212                }
213            }
214            Err(e) => log_warn!("CACHE TOPOLOGY DETECT FAILED: {}", e),
215        }
216
217        // POPULATE COMPOSITOR MAP: DEFAULT + USER-SUPPLIED NAMES
218        for name in DEFAULT_COMPOSITORS {
219            if let Err(e) = sched.write_compositor(name) {
220                log_warn!("COMPOSITOR MAP WRITE FAILED: {} ({})", name, e);
221            }
222        }
223        for name in extra_compositors {
224            if let Err(e) = sched.write_compositor(name) {
225                log_warn!("COMPOSITOR MAP WRITE FAILED: {} ({})", name, e);
226            }
227        }
228
229        let should_restart = if no_adaptive {
230            // BPF-ONLY MODE: SCHEDULER RUNS WITH DEFAULT KNOBS, NO RUST TUNING
231            // STILL PRINTS STATS SO BENCHMARKS GET TELEMETRY FOR BOTH PHASES
232            log_info!("PANDEMONIUM IS ACTIVE (BPF ONLY, CTRL+C TO EXIT)");
233            let mut prev = scheduler::PandemoniumStats::default();
234            while !SHUTDOWN.load(Ordering::Relaxed) && !sched.exited() {
235                watchdog::LOOP_HEARTBEAT.fetch_add(1, Ordering::Relaxed);
236                std::thread::sleep(Duration::from_secs(1));
237
238                let stats = sched.read_stats();
239
240                let delta_d = stats.nr_dispatches.wrapping_sub(prev.nr_dispatches);
241                let delta_idle = stats.nr_idle_hits.wrapping_sub(prev.nr_idle_hits);
242                let delta_shared = stats.nr_shared.wrapping_sub(prev.nr_shared);
243                let delta_preempt = stats.nr_preempt.wrapping_sub(prev.nr_preempt);
244                let delta_keep = stats.nr_keep_running.wrapping_sub(prev.nr_keep_running);
245                let delta_wake_sum = stats.wake_lat_sum.wrapping_sub(prev.wake_lat_sum);
246                let delta_wake_samples = stats.wake_lat_samples.wrapping_sub(prev.wake_lat_samples);
247                let delta_hard = stats.nr_hard_kicks.wrapping_sub(prev.nr_hard_kicks);
248                let delta_soft = stats.nr_soft_kicks.wrapping_sub(prev.nr_soft_kicks);
249                let delta_enq_wake = stats.nr_enq_wakeup.wrapping_sub(prev.nr_enq_wakeup);
250                let delta_enq_requeue = stats.nr_enq_requeue.wrapping_sub(prev.nr_enq_requeue);
251                let wake_avg_us = if delta_wake_samples > 0 {
252                    delta_wake_sum / delta_wake_samples / 1000
253                } else {
254                    0
255                };
256
257                let d_idle_sum = stats.wake_lat_idle_sum.wrapping_sub(prev.wake_lat_idle_sum);
258                let d_idle_cnt = stats.wake_lat_idle_cnt.wrapping_sub(prev.wake_lat_idle_cnt);
259                let d_kick_sum = stats.wake_lat_kick_sum.wrapping_sub(prev.wake_lat_kick_sum);
260                let d_kick_cnt = stats.wake_lat_kick_cnt.wrapping_sub(prev.wake_lat_kick_cnt);
261                let lat_idle_us = if d_idle_cnt > 0 {
262                    d_idle_sum / d_idle_cnt / 1000
263                } else {
264                    0
265                };
266                let lat_kick_us = if d_kick_cnt > 0 {
267                    d_kick_sum / d_kick_cnt / 1000
268                } else {
269                    0
270                };
271                let delta_reenq = stats.nr_reenqueue.wrapping_sub(prev.nr_reenqueue);
272
273                // L2 CACHE AFFINITY DELTAS
274                let dl2_hb = stats.nr_l2_hit_batch.wrapping_sub(prev.nr_l2_hit_batch);
275                let dl2_mb = stats.nr_l2_miss_batch.wrapping_sub(prev.nr_l2_miss_batch);
276                let dl2_hi = stats
277                    .nr_l2_hit_interactive
278                    .wrapping_sub(prev.nr_l2_hit_interactive);
279                let dl2_mi = stats
280                    .nr_l2_miss_interactive
281                    .wrapping_sub(prev.nr_l2_miss_interactive);
282                let dl2_hl = stats
283                    .nr_l2_hit_lat_crit
284                    .wrapping_sub(prev.nr_l2_hit_lat_crit);
285                let dl2_ml = stats
286                    .nr_l2_miss_lat_crit
287                    .wrapping_sub(prev.nr_l2_miss_lat_crit);
288                let l2_pct_b = if dl2_hb + dl2_mb > 0 {
289                    dl2_hb * 100 / (dl2_hb + dl2_mb)
290                } else {
291                    0
292                };
293                let l2_pct_i = if dl2_hi + dl2_mi > 0 {
294                    dl2_hi * 100 / (dl2_hi + dl2_mi)
295                } else {
296                    0
297                };
298                let l2_pct_l = if dl2_hl + dl2_ml > 0 {
299                    dl2_hl * 100 / (dl2_hl + dl2_ml)
300                } else {
301                    0
302                };
303
304                let idle_pct = if delta_d > 0 {
305                    delta_idle * 100 / delta_d
306                } else {
307                    0
308                };
309
310                let sojourn_ms = stats.batch_sojourn_ns / 1_000_000;
311                let longrun_label = if stats.longrun_mode_active > 0 {
312                    " LONGRUN"
313                } else {
314                    ""
315                };
316
317                if verbose {
318                    println!(
319                        "d/s: {:<8} idle: {}% shared: {:<6} preempt: {:<4} keep: {:<4} kick: H={:<4} S={:<4} enq: W={:<4} R={:<4} wake: {}us lat_idle: {}us lat_kick: {}us reenq: {} sjrn: {}ms l2: B={}% I={}% L={}% [BPF{}]",
320                        delta_d, idle_pct, delta_shared, delta_preempt, delta_keep,
321                        delta_hard, delta_soft, delta_enq_wake, delta_enq_requeue,
322                        wake_avg_us, lat_idle_us, lat_kick_us,
323                        delta_reenq, sojourn_ms, l2_pct_b, l2_pct_i, l2_pct_l,
324                        longrun_label,
325                    );
326                }
327
328                sched.log.snapshot(
329                    delta_d,
330                    delta_idle,
331                    delta_shared,
332                    delta_preempt,
333                    delta_keep,
334                    wake_avg_us,
335                    delta_hard,
336                    delta_soft,
337                    lat_idle_us,
338                    lat_kick_us,
339                );
340
341                prev = stats;
342            }
343
344            // KNOBS SUMMARY: CAPTURED BY TEST HARNESS FOR ARCHIVE
345            let knobs = sched.read_tuning_knobs();
346            let final_stats = sched.read_stats();
347            let l2_total_b = final_stats.nr_l2_hit_batch + final_stats.nr_l2_miss_batch;
348            let l2_total_i = final_stats.nr_l2_hit_interactive + final_stats.nr_l2_miss_interactive;
349            let l2_total_l = final_stats.nr_l2_hit_lat_crit + final_stats.nr_l2_miss_lat_crit;
350            let l2_cum_b = if l2_total_b > 0 {
351                final_stats.nr_l2_hit_batch * 100 / l2_total_b
352            } else {
353                0
354            };
355            let l2_cum_i = if l2_total_i > 0 {
356                final_stats.nr_l2_hit_interactive * 100 / l2_total_i
357            } else {
358                0
359            };
360            let l2_cum_l = if l2_total_l > 0 {
361                final_stats.nr_l2_hit_lat_crit * 100 / l2_total_l
362            } else {
363                0
364            };
365            println!(
366                "[KNOBS] regime=BPF slice_ns={} batch_ns={} preempt_ns={} lag={} l2_hit=B:{}%/I:{}%/L:{}%",
367                knobs.slice_ns, knobs.batch_slice_ns,
368                knobs.preempt_thresh_ns,
369                knobs.lag_scale, l2_cum_b, l2_cum_i, l2_cum_l,
370            );
371
372            sched.read_exit_info()
373        } else {
374            // ADAPTIVE MODE: BPF + SINGLE-THREAD MONITOR LOOP
375            log_info!("PANDEMONIUM IS ACTIVE (CTRL+C TO EXIT)");
376            adaptive::monitor_loop(&mut sched, &SHUTDOWN, verbose, nr_cpus_display)?
377        };
378
379        log_info!("PANDEMONIUM IS SHUTTING DOWN");
380
381        if dump_log {
382            sched.log.dump();
383        }
384        sched.log.summary();
385
386        if !should_restart || SHUTDOWN.load(Ordering::Relaxed) {
387            break;
388        }
389
390        // RESET SHUTDOWN FOR RESTART
391        SHUTDOWN.store(false, Ordering::Relaxed);
392        log_info!("RESTARTING PANDEMONIUM...");
393        is_restart = true;
394    }
395
396    log_info!("Shutdown complete");
397    Ok(())
398}