Skip to main content

scx_cake/
main.rs

1// SPDX-License-Identifier: GPL-2.0
2// scx_cake - sched_ext scheduler applying CAKE bufferbloat concepts to CPU scheduling
3
4mod calibrate;
5mod stats;
6mod topology;
7mod tui;
8
9use core::sync::atomic::Ordering;
10use std::os::fd::AsRawFd;
11use std::sync::atomic::AtomicBool;
12use std::sync::Arc;
13
14use anyhow::{Context, Result};
15use clap::{Parser, ValueEnum};
16use log::{info, warn};
17use nix::sys::signal::{SigSet, Signal};
18use nix::sys::signalfd::{SfdFlags, SignalFd};
// Include the generated interface bindings
// The module body is pulled in verbatim from `$OUT_DIR/bpf_intf.rs`, so it is
// not hand-written Rust. The lints below are silenced for the whole module —
// presumably because the generated names follow the BPF/C side's conventions
// and not every generated item is referenced from this file.
#[allow(non_camel_case_types, non_upper_case_globals, dead_code)]
mod bpf_intf {
    include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs"));
}
24
// Include the generated BPF skeleton
// The module body is pulled in verbatim from `$OUT_DIR/bpf_skel.rs`; its items
// are glob-imported right after this module and used by `Scheduler` (e.g.
// `BpfSkel`, `BpfSkelBuilder`). The lints below are silenced for the whole
// module — presumably because the generated code does not follow Rust naming
// conventions and contains items this file never touches.
#[allow(non_camel_case_types, non_upper_case_globals, dead_code)]
mod bpf_skel {
    include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs"));
}
30use bpf_skel::*;
31
/// Scheduler profile presets
// NOTE: this enum derives `ValueEnum`, so the `///` text on each variant is
// presumably surfaced by clap as that value's `--help` description — treat the
// wording as user-facing output rather than as internal documentation.
// NOTE: the variant named `Default` is NOT the default selection; the
// `--profile` option in `Args` defaults to `Profile::Gaming`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub enum Profile {
    /// Ultra-low-latency for competitive esports (1ms quantum)
    Esports,
    /// Optimized for older/lower-power hardware (4ms quantum)
    Legacy,
    /// Low-latency profile optimized for gaming and interactive workloads
    Gaming,
    /// Balanced profile for general desktop use (same as gaming for now)
    Default,
}
44
45impl Profile {
46    /// Returns (quantum_us, new_flow_bonus_us, starvation_us)
47    fn values(&self) -> (u64, u64, u64) {
48        match self {
49            // Esports: Ultra-aggressive, 1ms quantum for maximum responsiveness
50            Profile::Esports => (1000, 4000, 50000),
51            // Legacy: High efficiency, 4ms quantum to reduce overhead on older CPUs
52            Profile::Legacy => (4000, 12000, 200000),
53            // Gaming: Aggressive latency, 2ms quantum
54            Profile::Gaming => (2000, 8000, 100000),
55            // Default: Same as gaming for now
56            Profile::Default => (2000, 8000, 100000),
57        }
58    }
59
60    /// Per-tier starvation thresholds in nanoseconds (4 tiers + padding)
61    fn starvation_threshold(&self) -> [u64; 8] {
62        match self {
63            Profile::Esports => [
64                1_500_000,  // T0 Critical: 1.5ms
65                4_000_000,  // T1 Interactive: 4ms
66                20_000_000, // T2 Frame: 20ms
67                50_000_000, // T3 Bulk: 50ms
68                50_000_000, 50_000_000, 50_000_000, 50_000_000, // Padding
69            ],
70            Profile::Legacy => [
71                6_000_000,   // T0 Critical: 6ms
72                16_000_000,  // T1 Interactive: 16ms
73                80_000_000,  // T2 Frame: 80ms
74                200_000_000, // T3 Bulk: 200ms
75                200_000_000,
76                200_000_000,
77                200_000_000,
78                200_000_000, // Padding
79            ],
80            Profile::Gaming | Profile::Default => [
81                3_000_000,   // T0 Critical: 3ms
82                8_000_000,   // T1 Interactive: 8ms
83                40_000_000,  // T2 Frame: 40ms
84                100_000_000, // T3 Bulk: 100ms
85                100_000_000,
86                100_000_000,
87                100_000_000,
88                100_000_000, // Padding
89            ],
90        }
91    }
92
93    /// Tier quantum multipliers (fixed-point, 1024 = 1.0x) — 4 tiers + padding
94    fn tier_multiplier(&self) -> [u32; 8] {
95        match self {
96            Profile::Esports | Profile::Legacy | Profile::Gaming | Profile::Default => [
97                768,  // T0 Critical: 0.75x
98                1024, // T1 Interactive: 1.0x
99                1229, // T2 Frame: 1.2x
100                1434, // T3 Bulk: 1.4x
101                1434, 1434, 1434, 1434, // Padding
102            ],
103        }
104    }
105
106    /// Wait budget per tier in nanoseconds — 4 tiers + padding
107    fn wait_budget(&self) -> [u64; 8] {
108        match self {
109            Profile::Esports => [
110                50_000,    // T0 Critical: 50µs
111                1_000_000, // T1 Interactive: 1ms
112                4_000_000, // T2 Frame: 4ms
113                0,         // T3 Bulk: no limit
114                0, 0, 0, 0, // Padding
115            ],
116            Profile::Legacy => [
117                200_000,    // T0 Critical: 200µs
118                4_000_000,  // T1 Interactive: 4ms
119                16_000_000, // T2 Frame: 16ms
120                0,          // T3 Bulk: no limit
121                0, 0, 0, 0, // Padding
122            ],
123            Profile::Gaming | Profile::Default => [
124                100_000,   // T0 Critical: 100µs
125                2_000_000, // T1 Interactive: 2ms
126                8_000_000, // T2 Frame: 8ms
127                0,         // T3 Bulk: no limit
128                0, 0, 0, 0, // Padding
129            ],
130        }
131    }
132
133    /// Consolidated tier config - packs quantum/multiplier/budget/starvation into 64-bit per tier.
134    fn tier_configs(&self, quantum_us: u64) -> [u64; 8] {
135        let starvation = self.starvation_threshold();
136        let multiplier = self.tier_multiplier();
137        let budget = self.wait_budget();
138
139        let mut configs = [0u64; 8];
140        for i in 0..8 {
141            configs[i] = (multiplier[i] as u64 & 0xFFF)
142                | ((quantum_us & 0xFFFF) << 12)
143                | (((budget[i] >> 10) & 0xFFFF) << 28)
144                | (((starvation[i] >> 10) & 0xFFFFF) << 44);
145        }
146        configs
147    }
148}
149
150/// 🍰 scx_cake: A sched_ext scheduler applying CAKE bufferbloat concepts
151///
152/// This scheduler adapts CAKE's DRR++ (Deficit Round Robin++) algorithm
153/// for CPU scheduling, providing low-latency scheduling for gaming and
154/// interactive workloads while maintaining fairness.
155///
156/// PROFILES set all tuning parameters at once. Individual options override profile defaults.
157///
158/// 4-TIER SYSTEM (classified by avg_runtime):
159///   T0 Critical  (<100µs): IRQ, input, audio, network
160///   T1 Interact  (<2ms):   compositor, physics, AI
161///   T2 Frame     (<8ms):   game render, encoding
162///   T3 Bulk      (≥8ms):   compilation, background
163///
164/// EXAMPLES:
165///   scx_cake                          # Run with gaming profile (default)
166///   scx_cake -p esports               # Ultra-low-latency for competitive play
167///   scx_cake --quantum 1500           # Gaming profile with custom quantum
168///   scx_cake -v                       # Run with live TUI stats display
169#[derive(Parser, Debug)]
170#[command(
171    author,
172    version,
173    about = "🍰 A sched_ext scheduler applying CAKE bufferbloat concepts to CPU scheduling",
174    verbatim_doc_comment
175)]
176struct Args {
177    /// Scheduler profile preset.
178    ///
179    /// Profiles configure all tier thresholds, quantum multipliers, and wait budgets.
180    /// Individual CLI options (--quantum, etc.) override profile values.
181    ///
182    /// ESPORTS: Ultra-low-latency for competitive gaming.
183    ///   - Quantum: 1000µs, Starvation: 50ms
184    ///
185    /// LEGACY: Optimized for older/lower-power hardware.
186    ///   - Quantum: 4000µs, Starvation: 200ms
187    ///
188    /// GAMING: Optimized for low-latency gaming and interactive workloads.
189    ///   - Quantum: 2000µs, Starvation: 100ms
190    ///
191    /// DEFAULT: Balanced profile for general desktop use.
192    ///   - Currently same as gaming; will diverge in future versions
193    #[arg(long, short, value_enum, default_value_t = Profile::Gaming, verbatim_doc_comment)]
194    profile: Profile,
195
196    /// Base scheduling time slice in MICROSECONDS [default: 2000].
197    ///
198    /// How long a task runs before potentially yielding.
199    ///
200    /// Smaller quantum = more responsive but higher overhead.
201    /// Esports: 1000µs | Gaming: 2000µs | Legacy: 4000µs
202    /// Recommended range: 1000-8000µs
203    #[arg(long, verbatim_doc_comment)]
204    quantum: Option<u64>,
205
206    /// Bonus time for newly woken tasks in MICROSECONDS [default: 8000].
207    ///
208    /// Tasks waking from sleep get this extra time added to their deficit,
209    /// allowing them to run longer on first dispatch. Helps bursty workloads.
210    ///
211    /// Esports: 4000µs | Gaming: 8000µs
212    /// Recommended range: 4000-16000µs
213    #[arg(long, verbatim_doc_comment)]
214    new_flow_bonus: Option<u64>,
215
216    /// Max run time before forced preemption in MICROSECONDS [default: 100000].
217    ///
218    /// Safety limit: tasks running longer than this are forcibly preempted.
219    /// Prevents any single task from monopolizing the CPU.
220    ///
221    /// Esports: 50000µs (50ms) | Gaming: 100000µs (100ms) | Legacy: 200000µs (200ms)
222    /// Recommended range: 50000-200000µs
223    #[arg(long, verbatim_doc_comment)]
224    starvation: Option<u64>,
225
226    /// Enable live TUI (Terminal User Interface) with real-time statistics.
227    ///
228    /// Shows dispatch counts per tier, tier transitions,
229    /// wait time stats, and system topology information.
230    /// Press 'q' to exit TUI mode.
231    #[arg(long, short, verbatim_doc_comment)]
232    verbose: bool,
233
234    /// Statistics refresh interval in SECONDS (only with --verbose).
235    ///
236    /// How often the TUI updates. Lower values = more responsive but
237    /// higher overhead. Has no effect without --verbose.
238    ///
239    /// Default: 1 second
240    #[arg(long, default_value_t = 1, verbatim_doc_comment)]
241    interval: u64,
242}
243
244impl Args {
245    /// Get effective values (profile defaults with CLI overrides applied)
246    fn effective_values(&self) -> (u64, u64, u64) {
247        let (q, nfb, starv) = self.profile.values();
248        (
249            self.quantum.unwrap_or(q),
250            self.new_flow_bonus.unwrap_or(nfb),
251            self.starvation.unwrap_or(starv),
252        )
253    }
254}
255
/// User-space side of the scheduler: the loaded BPF skeleton plus the data
/// gathered at startup that the run loop and the UI need.
///
/// `'a` is the lifetime of the caller-owned `OpenObject` storage that the
/// skeleton is opened into in `Scheduler::new`.
struct Scheduler<'a> {
    /// Loaded BPF skeleton; its `cake_ops` map is attached in `run`.
    skel: BpfSkel<'a>,
    /// Parsed command-line arguments.
    args: Args,
    /// Result of `topology::detect()`.
    topology: topology::TopologyInfo,
    /// Result of `calibrate::calibrate_full_matrix`, shown on the startup
    /// screen. Presumably indexed `[cpu][cpu]` — confirm in `calibrate`.
    latency_matrix: Vec<Vec<f64>>,
}
262
263impl<'a> Scheduler<'a> {
264    fn new(
265        args: Args,
266        open_object: &'a mut std::mem::MaybeUninit<libbpf_rs::OpenObject>,
267    ) -> Result<Self> {
268        use libbpf_rs::skel::{OpenSkel, SkelBuilder};
269
270        // Open and load the BPF skeleton
271        let skel_builder = BpfSkelBuilder::default();
272
273        let mut open_skel = skel_builder
274            .open(open_object)
275            .context("Failed to open BPF skeleton")?;
276
277        // Populate SCX enum RODATA from kernel BTF (SCX_DSQ_LOCAL_ON, SCX_KICK_PREEMPT, etc.)
278        scx_utils::import_enums!(open_skel);
279
280        // Detect system topology (CCDs, P/E cores)
281        let topo = topology::detect()?;
282
283        // Get effective values (profile + CLI overrides)
284        let (quantum, new_flow_bonus, _starvation) = args.effective_values();
285
286        // ETD: Empirical Topology Discovery — display-grade measurement
287        // Measures inter-core CAS latency for startup heatmap and TUI display
288        info!("Starting ETD calibration...");
289        let latency_matrix = calibrate::calibrate_full_matrix(
290            topo.nr_cpus,
291            &calibrate::EtdConfig::default(),
292            |current, total, is_complete| {
293                tui::render_calibration_progress(current, total, is_complete);
294            },
295        );
296
297        // Configure the scheduler via rodata (read-only data)
298        if let Some(rodata) = &mut open_skel.maps.rodata_data {
299            rodata.quantum_ns = quantum * 1000;
300            rodata.new_flow_bonus_ns = new_flow_bonus * 1000;
301            rodata.enable_stats = args.verbose;
302            rodata.tier_configs = args.profile.tier_configs(quantum);
303
304            // Topology: only has_hybrid is live (DVFS scaling in cake_tick)
305            rodata.has_hybrid = topo.has_hybrid_cores;
306
307            // Per-LLC DSQ partitioning: populate CPU→LLC mapping
308            let llc_count = topo.llc_cpu_mask.iter().filter(|&&m| m != 0).count() as u32;
309            rodata.nr_llcs = llc_count.max(1);
310            rodata.nr_cpus = topo.nr_cpus.min(64) as u32; // Rule 39: bounds kick scan loop
311            for (i, &llc_id) in topo.cpu_llc_id.iter().enumerate() {
312                rodata.cpu_llc_id[i] = llc_id as u32;
313            }
314        }
315
316        // Load the BPF program
317        let skel = open_skel.load().context("Failed to load BPF program")?;
318
319        Ok(Self {
320            skel,
321            args,
322            topology: topo,
323            latency_matrix,
324        })
325    }
326
327    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<()> {
328        // Attach the scheduler
329        let _link = self
330            .skel
331            .maps
332            .cake_ops
333            .attach_struct_ops()
334            .context("Failed to attach scheduler")?;
335
336        self.show_startup_splash()?;
337
338        if self.args.verbose {
339            // Run TUI mode
340            tui::run_tui(
341                &mut self.skel,
342                shutdown.clone(),
343                self.args.interval,
344                self.topology.clone(),
345            )?;
346        } else {
347            // Event-based silent mode - block on signalfd, poll with 60s timeout for UEI check
348
349            // Block SIGINT and SIGTERM from normal delivery
350            let mut mask = SigSet::empty();
351            mask.add(Signal::SIGINT);
352            mask.add(Signal::SIGTERM);
353            mask.thread_block().context("Failed to block signals")?;
354
355            // Create signalfd to receive signals as readable events
356            let sfd = SignalFd::with_flags(&mask, SfdFlags::SFD_NONBLOCK)
357                .context("Failed to create signalfd")?;
358
359            use nix::poll::{poll, PollFd, PollFlags};
360            use std::os::fd::BorrowedFd;
361
362            loop {
363                // Block for up to 60 seconds, then check UEI
364                // poll() returns: >0 = readable, 0 = timeout, -1 = error
365                // SAFETY: sfd is valid for the duration of this loop
366                let poll_fd = unsafe {
367                    PollFd::new(BorrowedFd::borrow_raw(sfd.as_raw_fd()), PollFlags::POLLIN)
368                };
369                let mut fds = [poll_fd];
370                let result = poll(&mut fds, nix::poll::PollTimeout::from(60_000u16)); // 60 seconds
371
372                match result {
373                    Ok(n) if n > 0 => {
374                        // Signal received - read it to clear and exit
375                        if let Ok(Some(siginfo)) = sfd.read_signal() {
376                            info!("Received signal {} - shutting down", siginfo.ssi_signo);
377                            shutdown.store(true, Ordering::Relaxed);
378                        }
379                        break;
380                    }
381                    Ok(_) => {
382                        // Timeout - check UEI
383                        if scx_utils::uei_exited!(&self.skel, uei) {
384                            match scx_utils::uei_report!(&self.skel, uei) {
385                                Ok(reason) => {
386                                    warn!("BPF scheduler exited: {:?}", reason);
387                                }
388                                Err(e) => {
389                                    warn!("BPF scheduler exited (failed to get reason: {})", e);
390                                }
391                            }
392                            break;
393                        }
394                    }
395                    Err(nix::errno::Errno::EINTR) => {
396                        // Interrupted - check shutdown flag
397                        if shutdown.load(Ordering::Relaxed) {
398                            break;
399                        }
400                    }
401                    Err(e) => {
402                        warn!("poll() error: {}", e);
403                        break;
404                    }
405                }
406            }
407        }
408
409        info!("scx_cake scheduler shutting down");
410        Ok(())
411    }
412
413    fn show_startup_splash(&self) -> Result<()> {
414        let (q, _nfb, starv) = self.args.effective_values();
415        let profile_str = format!("{:?}", self.args.profile).to_uppercase();
416
417        tui::render_startup_screen(tui::StartupParams {
418            topology: &self.topology,
419            latency_matrix: &self.latency_matrix,
420            profile: &profile_str,
421            quantum: q,
422            starvation: starv,
423        })
424    }
425}
426
427fn main() -> Result<()> {
428    env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
429
430    let args = Args::parse();
431
432    // Set up signal handler
433    let shutdown = Arc::new(AtomicBool::new(false));
434    let shutdown_clone = shutdown.clone();
435
436    ctrlc::set_handler(move || {
437        info!("Received shutdown signal");
438        shutdown_clone.store(true, Ordering::Relaxed);
439    })?;
440
441    // Create open object for BPF - needs to outlive scheduler
442    let mut open_object = std::mem::MaybeUninit::uninit();
443
444    // Create and run the scheduler
445    let mut scheduler = Scheduler::new(args, &mut open_object)?;
446    scheduler.run(shutdown)?;
447
448    Ok(())
449}