// scx_pandemonium/main.rs
1// PANDEMONIUM -- SCHED_EXT KERNEL SCHEDULER
2// ADAPTIVE DESKTOP SCHEDULING FOR LINUX
3//
4// SCHEDULING DECISIONS HAPPEN IN BPF (ZERO KERNEL-USERSPACE ROUND TRIPS)
5// RUST USERSPACE HANDLES: ADAPTIVE CONTROL LOOP, MONITORING, BENCHMARKING
6
7#[allow(non_upper_case_globals)]
8#[allow(non_camel_case_types)]
9#[allow(non_snake_case)]
10#[allow(dead_code)]
11mod bpf_skel;
12
13#[macro_use]
14mod log;
15mod adaptive;
16mod cli;
17mod procdb;
18mod scheduler;
19mod topology;
20mod tuning;
21
22use std::mem::MaybeUninit;
23use std::sync::atomic::{AtomicBool, Ordering};
24use std::time::Duration;
25
26use anyhow::Result;
27use clap::{Parser, Subcommand};
28
29use scheduler::Scheduler;
30use scx_utils::build_id;
31
32static SHUTDOWN: AtomicBool = AtomicBool::new(false);
33
// TOP-LEVEL CLI. WITH NO SUBCOMMAND, THE SCHEDULER ITSELF IS RUN.
// NOTE: `//` COMMENTS ARE USED FOR UNDOCUMENTED FIELDS ON PURPOSE --
// `///` ON A CLAP FIELD BECOMES --help TEXT AND WOULD CHANGE OUTPUT.
#[derive(Parser)]
#[command(name = "scx_pandemonium")]
#[command(
    version,
    disable_version_flag = true,
    about = "PANDEMONIUM -- ADAPTIVE LINUX SCHEDULER"
)]
struct Cli {
    // OPTIONAL SUBCOMMAND; None MEANS "RUN THE SCHEDULER".
    #[command(subcommand)]
    command: Option<SubCmd>,

    // PER-SECOND TELEMETRY LINE TO STDOUT WHEN SET.
    #[arg(short, long)]
    verbose: bool,

    /// Print scheduler version and exit.
    #[arg(long)]
    version: bool,

    // DUMP THE IN-MEMORY LOG RING ON SHUTDOWN.
    #[arg(long)]
    dump_log: bool,

    /// Override CPU count for scaling formulas (default: auto-detect)
    #[arg(long)]
    nr_cpus: Option<u64>,

    /// Run BPF scheduler only, disable Rust adaptive control loop
    #[arg(long)]
    no_adaptive: bool,

    /// Additional compositor process names to boost to LAT_CRITICAL
    #[arg(long)]
    compositor: Vec<String>,
}
67
// SUBCOMMANDS: DEV/OPS TOOLING AROUND THE SCHEDULER. VARIANT `///` DOCS
// ARE CLAP HELP TEXT -- DO NOT EDIT CASUALLY.
#[derive(Subcommand)]
enum SubCmd {
    /// Check dependencies and kernel config
    Check,

    /// Run interactive wakeup probe (stdout: overshoot_us per line)
    Probe(ProbeArgs),

    /// Build, run with sudo, capture output + dmesg, save logs
    Start(StartArgs),

    /// Show filtered kernel dmesg for sched_ext/pandemonium
    Dmesg,

    /// A/B benchmark (EEVDF baseline vs PANDEMONIUM)
    Bench(BenchArgs),

    /// Build release then run bench (logs to /tmp/pandemonium)
    BenchRun(BenchRunArgs),

    /// Run test gate (unit + integration)
    Test,

    /// CPU-pinned stress worker for bench-scale (internal use)
    StressWorker(StressWorkerArgs),
}
94
// ARGS FOR `probe`: INTERACTIVE WAKEUP-LATENCY PROBE.
#[derive(Parser)]
struct ProbeArgs {
    /// Death pipe FD for orphan detection (internal use)
    #[arg(long)]
    death_pipe_fd: Option<i32>,
}
101
// ARGS FOR `stress-worker`: INTERNAL CPU-PINNED LOAD GENERATOR.
#[derive(Parser)]
struct StressWorkerArgs {
    /// CPU to pin the stress worker to
    #[arg(long)]
    cpu: u32,
}
108
// ARGS FOR `start`: BUILD + SUDO RUN + LOG CAPTURE WRAPPER.
#[derive(Parser)]
struct StartArgs {
    /// Run with --verbose --dump-log
    #[arg(long)]
    observe: bool,

    /// Extra args forwarded to `pandemonium run`
    // `last = true` COLLECTS EVERYTHING AFTER `--` VERBATIM.
    #[arg(last = true)]
    sched_args: Vec<String>,
}
119
// ARGS FOR `bench`: A/B BENCHMARK OF EEVDF BASELINE VS PANDEMONIUM.
// NOTE(review): FIELD-FOR-FIELD DUPLICATE OF BenchRunArgs -- A SHARED
// STRUCT VIA #[command(flatten)] WOULD REMOVE THE DRIFT RISK.
#[derive(Parser)]
struct BenchArgs {
    /// Benchmark mode
    #[arg(long, value_enum)]
    mode: cli::bench::BenchMode,

    /// Command to benchmark (for --mode cmd)
    #[arg(long)]
    cmd: Option<String>,

    /// Number of iterations per phase
    #[arg(long, default_value_t = 3)]
    iterations: usize,

    /// Clean command between iterations (for --mode cmd)
    #[arg(long)]
    clean_cmd: Option<String>,

    /// Extra args forwarded to `pandemonium run`
    #[arg(last = true)]
    sched_args: Vec<String>,
}
142
// ARGS FOR `bench-run`: RELEASE BUILD THEN BENCH.
// NOTE(review): FIELD-FOR-FIELD DUPLICATE OF BenchArgs -- A SHARED
// STRUCT VIA #[command(flatten)] WOULD REMOVE THE DRIFT RISK.
#[derive(Parser)]
struct BenchRunArgs {
    /// Benchmark mode
    #[arg(long, value_enum)]
    mode: cli::bench::BenchMode,

    /// Command to benchmark (for --mode cmd)
    #[arg(long)]
    cmd: Option<String>,

    /// Number of iterations per phase
    #[arg(long, default_value_t = 3)]
    iterations: usize,

    /// Clean command between iterations (for --mode cmd)
    #[arg(long)]
    clean_cmd: Option<String>,

    /// Extra args forwarded to `pandemonium run`
    #[arg(last = true)]
    sched_args: Vec<String>,
}
165
166fn main() -> Result<()> {
167    let cli = Cli::parse();
168
169    let verbose = cli.verbose;
170    let dump_log = cli.dump_log;
171    let nr_cpus = cli.nr_cpus;
172    let no_adaptive = cli.no_adaptive;
173    let extra_compositors = cli.compositor;
174
175    if cli.version {
176        println!(
177            "scx_pandemonium {}",
178            build_id::full_version(env!("CARGO_PKG_VERSION"))
179        );
180        return Ok(());
181    }
182
183    match cli.command {
184        None => run_scheduler(verbose, dump_log, nr_cpus, no_adaptive, &extra_compositors),
185        Some(SubCmd::Check) => cli::check::run_check(),
186        Some(SubCmd::Probe(args)) => {
187            cli::probe::run_probe(args.death_pipe_fd);
188            Ok(())
189        }
190        Some(SubCmd::Start(args)) => cli::run::run_start(args.observe, &args.sched_args),
191        Some(SubCmd::Dmesg) => cli::run::run_dmesg(),
192        Some(SubCmd::Bench(args)) => cli::bench::run_bench(
193            args.mode,
194            args.cmd.as_deref(),
195            args.iterations,
196            args.clean_cmd.as_deref(),
197            &args.sched_args,
198        ),
199        Some(SubCmd::BenchRun(args)) => cli::bench::run_bench_run(
200            args.mode,
201            args.cmd.as_deref(),
202            args.iterations,
203            args.clean_cmd.as_deref(),
204            &args.sched_args,
205        ),
206        Some(SubCmd::Test) => cli::test_gate::run_test_gate(),
207        Some(SubCmd::StressWorker(args)) => {
208            cli::stress::run_stress_worker(args.cpu);
209            Ok(())
210        }
211    }
212}
213
// DEFAULT COMPOSITORS: BOOSTED TO LAT_CRITICAL VIA BPF MAP LOOKUP
// NOTE(review): assumes sched.write_compositor() keys the BPF map by
// process comm/name -- confirm exact-vs-prefix matching semantics
// before adding longer names (kernel comm is truncated to 15 chars).
const DEFAULT_COMPOSITORS: &[&str] = &[
    "kwin",
    "gnome-shell",
    "mutter",
    "sway",
    "Hyprland",
    "picom",
    "weston",
    "labwc",
    "wayfire",
    "niri",
    "pandemonium",
];
228
229fn run_scheduler(
230    verbose: bool,
231    dump_log: bool,
232    nr_cpus: Option<u64>,
233    no_adaptive: bool,
234    extra_compositors: &[String],
235) -> Result<()> {
236    ctrlc::set_handler(move || {
237        SHUTDOWN.store(true, Ordering::Relaxed);
238    })?;
239
240    let nr_cpus_display =
241        nr_cpus.unwrap_or_else(|| libbpf_rs::num_possible_cpus().unwrap_or(1) as u64);
242    let governor = std::fs::read_to_string("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor")
243        .unwrap_or_default()
244        .trim()
245        .to_string();
246
247    let smt_on = std::fs::read_to_string("/sys/devices/system/cpu/smt/active")
248        .map(|s| s.trim() == "1")
249        .unwrap_or(false);
250
251    log_info!(
252        "scx_pandemonium {} SMT {}",
253        build_id::full_version(env!("CARGO_PKG_VERSION")),
254        if smt_on { "on" } else { "off" }
255    );
256    log_info!(
257        "CPUS: {} (governor: {})",
258        nr_cpus_display,
259        if governor.is_empty() {
260            "unknown"
261        } else {
262            &governor
263        }
264    );
265    log_info!("VERBOSE: {}", verbose);
266
267    let mut is_restart = false;
268    loop {
269        // ON RESTART, WAIT FOR KERNEL STRUCT_OPS CLEANUP.
270        // DETACH IS ASYNCHRONOUS -- UNDER HEAVY LOAD (12C SATURATED),
271        // THE KERNEL NEEDS TIME TO FULLY UNREGISTER THE OLD SCHEDULER.
272        if is_restart {
273            std::thread::sleep(Duration::from_secs(2));
274        }
275
276        let mut open_object = MaybeUninit::uninit();
277        let mut sched = Scheduler::init(&mut open_object, nr_cpus)?;
278
279        // POPULATE CACHE TOPOLOGY MAP AT STARTUP
280        match topology::CpuTopology::detect(nr_cpus_display as usize) {
281            Ok(topo) => {
282                topo.log_summary();
283                if let Err(e) = topo.populate_bpf_map(&sched) {
284                    log_warn!("CACHE TOPOLOGY MAP WRITE FAILED: {}", e);
285                }
286                if let Err(e) = topo.populate_l2_siblings_map(&sched) {
287                    log_warn!("L2 SIBLINGS MAP WRITE FAILED: {}", e);
288                }
289                // RESISTANCE AFFINITY: COMPUTE R_EFF VIA LAPLACIAN PSEUDOINVERSE
290                // AND POPULATE BPF AFFINITY RANK MAP
291                let (reff, rank) = topo.compute_resistance_affinity();
292                topo.log_resistance_affinity(&reff, &rank);
293                if let Err(e) = topo.populate_affinity_rank_map(&sched, &rank) {
294                    log_warn!("AFFINITY RANK MAP WRITE FAILED: {}", e);
295                }
296            }
297            Err(e) => log_warn!("CACHE TOPOLOGY DETECT FAILED: {}", e),
298        }
299
300        // POPULATE COMPOSITOR MAP: DEFAULT + USER-SUPPLIED NAMES
301        for name in DEFAULT_COMPOSITORS {
302            if let Err(e) = sched.write_compositor(name) {
303                log_warn!("COMPOSITOR MAP WRITE FAILED: {} ({})", name, e);
304            }
305        }
306        for name in extra_compositors {
307            if let Err(e) = sched.write_compositor(name) {
308                log_warn!("COMPOSITOR MAP WRITE FAILED: {} ({})", name, e);
309            }
310        }
311
312        let should_restart = if no_adaptive {
313            // BPF-ONLY MODE: SCHEDULER RUNS WITH DEFAULT KNOBS, NO RUST TUNING
314            // STILL PRINTS STATS SO BENCHMARKS GET TELEMETRY FOR BOTH PHASES
315            log_info!("PANDEMONIUM IS ACTIVE (BPF ONLY, CTRL+C TO EXIT)");
316            let mut prev = scheduler::PandemoniumStats::default();
317            while !SHUTDOWN.load(Ordering::Relaxed) && !sched.exited() {
318                std::thread::sleep(Duration::from_secs(1));
319
320                let stats = sched.read_stats();
321
322                let delta_d = stats.nr_dispatches.wrapping_sub(prev.nr_dispatches);
323                let delta_idle = stats.nr_idle_hits.wrapping_sub(prev.nr_idle_hits);
324                let delta_shared = stats.nr_shared.wrapping_sub(prev.nr_shared);
325                let delta_preempt = stats.nr_preempt.wrapping_sub(prev.nr_preempt);
326                let delta_keep = stats.nr_keep_running.wrapping_sub(prev.nr_keep_running);
327                let delta_wake_sum = stats.wake_lat_sum.wrapping_sub(prev.wake_lat_sum);
328                let delta_wake_samples = stats.wake_lat_samples.wrapping_sub(prev.wake_lat_samples);
329                let delta_hard = stats.nr_hard_kicks.wrapping_sub(prev.nr_hard_kicks);
330                let delta_soft = stats.nr_soft_kicks.wrapping_sub(prev.nr_soft_kicks);
331                let delta_enq_wake = stats.nr_enq_wakeup.wrapping_sub(prev.nr_enq_wakeup);
332                let delta_enq_requeue = stats.nr_enq_requeue.wrapping_sub(prev.nr_enq_requeue);
333                let wake_avg_us = if delta_wake_samples > 0 {
334                    delta_wake_sum / delta_wake_samples / 1000
335                } else {
336                    0
337                };
338
339                let d_idle_sum = stats.wake_lat_idle_sum.wrapping_sub(prev.wake_lat_idle_sum);
340                let d_idle_cnt = stats.wake_lat_idle_cnt.wrapping_sub(prev.wake_lat_idle_cnt);
341                let d_kick_sum = stats.wake_lat_kick_sum.wrapping_sub(prev.wake_lat_kick_sum);
342                let d_kick_cnt = stats.wake_lat_kick_cnt.wrapping_sub(prev.wake_lat_kick_cnt);
343                let lat_idle_us = if d_idle_cnt > 0 {
344                    d_idle_sum / d_idle_cnt / 1000
345                } else {
346                    0
347                };
348                let lat_kick_us = if d_kick_cnt > 0 {
349                    d_kick_sum / d_kick_cnt / 1000
350                } else {
351                    0
352                };
353                let delta_procdb = stats.nr_procdb_hits.wrapping_sub(prev.nr_procdb_hits);
354                let delta_reenq = stats.nr_reenqueue.wrapping_sub(prev.nr_reenqueue);
355
356                // L2 CACHE AFFINITY DELTAS
357                let dl2_hb = stats.nr_l2_hit_batch.wrapping_sub(prev.nr_l2_hit_batch);
358                let dl2_mb = stats.nr_l2_miss_batch.wrapping_sub(prev.nr_l2_miss_batch);
359                let dl2_hi = stats
360                    .nr_l2_hit_interactive
361                    .wrapping_sub(prev.nr_l2_hit_interactive);
362                let dl2_mi = stats
363                    .nr_l2_miss_interactive
364                    .wrapping_sub(prev.nr_l2_miss_interactive);
365                let dl2_hl = stats
366                    .nr_l2_hit_lat_crit
367                    .wrapping_sub(prev.nr_l2_hit_lat_crit);
368                let dl2_ml = stats
369                    .nr_l2_miss_lat_crit
370                    .wrapping_sub(prev.nr_l2_miss_lat_crit);
371                let l2_pct_b = if dl2_hb + dl2_mb > 0 {
372                    dl2_hb * 100 / (dl2_hb + dl2_mb)
373                } else {
374                    0
375                };
376                let l2_pct_i = if dl2_hi + dl2_mi > 0 {
377                    dl2_hi * 100 / (dl2_hi + dl2_mi)
378                } else {
379                    0
380                };
381                let l2_pct_l = if dl2_hl + dl2_ml > 0 {
382                    dl2_hl * 100 / (dl2_hl + dl2_ml)
383                } else {
384                    0
385                };
386
387                let idle_pct = if delta_d > 0 {
388                    delta_idle * 100 / delta_d
389                } else {
390                    0
391                };
392
393                let sojourn_ms = stats.batch_sojourn_ns / 1_000_000;
394                let delta_burst = stats.burst_mode_active.wrapping_sub(prev.burst_mode_active);
395                let burst_label = if delta_burst > 0 { " BURST" } else { "" };
396                let longrun_label = if stats.longrun_mode_active > 0 {
397                    " LONGRUN"
398                } else {
399                    ""
400                };
401
402                if verbose {
403                    println!(
404                        "d/s: {:<8} idle: {}% shared: {:<6} preempt: {:<4} keep: {:<4} kick: H={:<4} S={:<4} enq: W={:<4} R={:<4} wake: {}us lat_idle: {}us lat_kick: {}us procdb: {} reenq: {} sjrn: {}ms l2: B={}% I={}% L={}% [BPF{}{}]",
405                        delta_d, idle_pct, delta_shared, delta_preempt, delta_keep,
406                        delta_hard, delta_soft, delta_enq_wake, delta_enq_requeue,
407                        wake_avg_us, lat_idle_us, lat_kick_us, delta_procdb,
408                        delta_reenq, sojourn_ms, l2_pct_b, l2_pct_i, l2_pct_l,
409                        burst_label, longrun_label,
410                    );
411                }
412
413                sched.log.snapshot(
414                    delta_d,
415                    delta_idle,
416                    delta_shared,
417                    delta_preempt,
418                    delta_keep,
419                    wake_avg_us,
420                    delta_hard,
421                    delta_soft,
422                    lat_idle_us,
423                    lat_kick_us,
424                );
425
426                prev = stats;
427            }
428
429            // KNOBS SUMMARY: CAPTURED BY TEST HARNESS FOR ARCHIVE
430            let knobs = sched.read_tuning_knobs();
431            let final_stats = sched.read_stats();
432            let l2_total_b = final_stats.nr_l2_hit_batch + final_stats.nr_l2_miss_batch;
433            let l2_total_i = final_stats.nr_l2_hit_interactive + final_stats.nr_l2_miss_interactive;
434            let l2_total_l = final_stats.nr_l2_hit_lat_crit + final_stats.nr_l2_miss_lat_crit;
435            let l2_cum_b = if l2_total_b > 0 {
436                final_stats.nr_l2_hit_batch * 100 / l2_total_b
437            } else {
438                0
439            };
440            let l2_cum_i = if l2_total_i > 0 {
441                final_stats.nr_l2_hit_interactive * 100 / l2_total_i
442            } else {
443                0
444            };
445            let l2_cum_l = if l2_total_l > 0 {
446                final_stats.nr_l2_hit_lat_crit * 100 / l2_total_l
447            } else {
448                0
449            };
450            println!(
451                "[KNOBS] regime=BPF slice_ns={} batch_ns={} preempt_ns={} demotion_ns={} lag={} l2_hit=B:{}%/I:{}%/L:{}%",
452                knobs.slice_ns, knobs.batch_slice_ns,
453                knobs.preempt_thresh_ns, knobs.cpu_bound_thresh_ns,
454                knobs.lag_scale, l2_cum_b, l2_cum_i, l2_cum_l,
455            );
456
457            sched.read_exit_info()
458        } else {
459            // ADAPTIVE MODE: BPF + SINGLE-THREAD MONITOR LOOP
460            log_info!("PANDEMONIUM IS ACTIVE (CTRL+C TO EXIT)");
461            adaptive::monitor_loop(&mut sched, &SHUTDOWN, verbose, nr_cpus_display)?
462        };
463
464        log_info!("PANDEMONIUM IS SHUTTING DOWN");
465
466        if dump_log {
467            sched.log.dump();
468        }
469        sched.log.summary();
470
471        if !should_restart || SHUTDOWN.load(Ordering::Relaxed) {
472            break;
473        }
474
475        // RESET SHUTDOWN FOR RESTART
476        SHUTDOWN.store(false, Ordering::Relaxed);
477        log_info!("RESTARTING PANDEMONIUM...");
478        is_restart = true;
479    }
480
481    log_info!("Shutdown complete");
482    Ok(())
483}