// scx_cake/main.rs
1// SPDX-License-Identifier: GPL-2.0
2// scx_cake - sched_ext scheduler applying CAKE bufferbloat concepts to CPU scheduling
3
4mod topology;
5mod tui;
6
7use core::sync::atomic::Ordering;
8use std::io::IsTerminal;
9
10use std::sync::atomic::AtomicBool;
11use std::sync::Arc;
12
13use anyhow::{Context, Result};
14use clap::{Parser, ValueEnum};
15use log::{info, warn};
16
17use scx_arena::ArenaLib;
18use scx_utils::build_id;
19use scx_utils::UserExitInfo;
20use scx_utils::NR_CPU_IDS;
// Include the generated interface bindings
// (emitted by the build script into OUT_DIR; constants/structs shared with the
// BPF side, e.g. `cake_task_ctx` used below for arena sizing).
#[allow(non_camel_case_types, non_upper_case_globals, dead_code)]
mod bpf_intf {
    include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs"));
}

// Include the generated BPF skeleton
// (typed access to maps, rodata/bss data, and the cake_ops struct_ops).
#[allow(non_camel_case_types, non_upper_case_globals, dead_code)]
mod bpf_skel {
    include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs"));
}
use bpf_skel::*;
33
34const SCHEDULER_NAME: &str = "scx_cake";
35
/// Scheduler profile presets
// NOTE: each variant's `///` doc below is rendered verbatim by clap as the
// `--profile` help text — treat it as user-facing copy, not internal docs.
// The numeric tuples these presets map to live in `Profile::values()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub enum Profile {
    /// Ultra-low-latency for competitive esports (1ms quantum)
    Esports,
    /// Optimized for older/lower-power hardware (4ms quantum)
    Legacy,
    /// Low-latency profile optimized for gaming and interactive workloads
    Gaming,
    /// Balanced profile for general desktop use (same as gaming for now)
    Default,
    /// Power-efficient profile for handhelds/laptops on battery (DVFS enabled)
    Battery,
}
50
51impl Profile {
52 /// Returns (quantum_us, new_flow_bonus_us, starvation_us)
53 fn values(&self) -> (u64, u64, u64) {
54 match self {
55 // Esports: Ultra-aggressive, 1ms quantum for maximum responsiveness
56 Profile::Esports => (1000, 4000, 50000),
57 // Legacy: High efficiency, 4ms quantum to reduce overhead on older CPUs
58 Profile::Legacy => (4000, 12000, 200000),
59 // Gaming: Aggressive latency, 2ms quantum
60 Profile::Gaming => (2000, 8000, 100000),
61 // Default: Same as gaming for now
62 Profile::Default => (2000, 8000, 100000),
63 // Battery: 4ms quantum — fewer context switches = less power
64 Profile::Battery => (4000, 12000, 200000),
65 }
66 }
67
68 // DVFS — disabled (tick architecture removed, no runtime effect).
69 // RODATA symbols retained in BPF for loader compat; JIT eliminates.
70}
71
/// 🍰 scx_cake: A sched_ext scheduler applying CAKE bufferbloat concepts
///
/// This scheduler adapts CAKE's DRR++ (Deficit Round Robin++) algorithm
/// for CPU scheduling, providing low-latency scheduling for gaming and
/// interactive workloads while maintaining fairness.
///
/// PROFILES set all tuning parameters at once. Individual options override profile defaults.
///
/// 4-CLASS SYSTEM (classified by PELT utilization + game family detection):
/// GAME: game process tree + audio + compositor (during GAMING)
/// NORMAL: default class — interactive desktop tasks
/// HOG: high PELT utilization (≥78% CPU) non-game tasks
/// BG: low PELT utilization non-game tasks during GAMING
///
/// EXAMPLES:
/// scx_cake # Run with gaming profile (default)
/// scx_cake -p esports # Ultra-low-latency for competitive play
/// scx_cake --quantum 1500 # Gaming profile with custom quantum
/// scx_cake -v # Run with live TUI stats display
// NOTE: every `///` comment in this struct (including the one above) is emitted
// verbatim into --help output (`verbatim_doc_comment`), so edit them as UI copy.
// Effective values are resolved in Args::effective_values(): profile defaults
// with the Option<u64> fields below acting as per-field overrides.
#[derive(Parser, Debug, Clone)]
#[command(
    author,
    version,
    disable_version_flag = true,
    about = "🍰 A sched_ext scheduler applying CAKE bufferbloat concepts to CPU scheduling",
    verbatim_doc_comment
)]
struct Args {
    /// Scheduler profile preset.
    ///
    /// Profiles configure all tier thresholds, quantum multipliers, and wait budgets.
    /// Individual CLI options (--quantum, etc.) override profile values.
    ///
    /// ESPORTS: Ultra-low-latency for competitive gaming.
    /// - Quantum: 1000µs, Starvation: 50ms
    ///
    /// LEGACY: Optimized for older/lower-power hardware.
    /// - Quantum: 4000µs, Starvation: 200ms
    ///
    /// GAMING: Optimized for low-latency gaming and interactive workloads.
    /// - Quantum: 2000µs, Starvation: 100ms
    ///
    /// DEFAULT: Balanced profile for general desktop use.
    /// - Currently same as gaming; will diverge in future versions
    ///
    /// BATTERY: Power-efficient for handhelds/laptops on battery.
    /// - Quantum: 4000µs, reduced context switch overhead
    #[arg(long, short, value_enum, default_value_t = Profile::Gaming, verbatim_doc_comment)]
    profile: Profile,

    /// Base scheduling time slice in MICROSECONDS [default: 2000].
    ///
    /// How long a task runs before potentially yielding.
    ///
    /// Smaller quantum = more responsive but higher overhead.
    /// Esports: 1000µs | Gaming: 2000µs | Legacy: 4000µs
    /// Recommended range: 1000-8000µs
    // None → use the profile's quantum (see effective_values()).
    #[arg(long, verbatim_doc_comment)]
    quantum: Option<u64>,

    /// Bonus time for newly woken tasks in MICROSECONDS [default: 8000].
    ///
    /// Tasks waking from sleep get this extra time added to their deficit,
    /// allowing them to run longer on first dispatch. Helps bursty workloads.
    ///
    /// Esports: 4000µs | Gaming: 8000µs
    /// Recommended range: 4000-16000µs
    #[arg(long, verbatim_doc_comment)]
    new_flow_bonus: Option<u64>,

    /// Max run time before forced preemption in MICROSECONDS [default: 100000].
    ///
    /// Safety limit: tasks running longer than this are forcibly preempted.
    /// Prevents any single task from monopolizing the CPU.
    ///
    /// Esports: 50000µs (50ms) | Gaming: 100000µs (100ms) | Legacy: 200000µs (200ms)
    /// Recommended range: 50000-200000µs
    #[arg(long, verbatim_doc_comment)]
    starvation: Option<u64>,

    /// Enable live TUI (Terminal User Interface) with real-time statistics.
    ///
    /// Shows dispatch counts per tier, tier transitions,
    /// wait time stats, and system topology information.
    /// Press 'q' to exit TUI mode.
    // In release builds run() warns and clears this flag (telemetry compiled out).
    #[arg(long, short, verbatim_doc_comment)]
    verbose: bool,

    /// Statistics refresh interval in SECONDS (only with --verbose).
    ///
    /// How often the TUI updates. Lower values = more responsive but
    /// higher overhead. Has no effect without --verbose.
    ///
    /// Default: 1 second
    #[arg(long, default_value_t = 1, verbatim_doc_comment)]
    interval: u64,

    /// Live in-kernel testing mode for automated benchmarking.
    ///
    /// Runs the scheduler for 10 seconds, collects BPF data points,
    /// and prints a structured JSON output to stdout.
    #[arg(long, verbatim_doc_comment)]
    testing: bool,

    /// Print scheduler version and exit.
    // clap's built-in -V is disabled (disable_version_flag above); this flag is
    // handled manually at the top of main() to print the scheduler-style banner.
    #[arg(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,
}
180
181impl Args {
182 /// Get effective values (profile defaults with CLI overrides applied)
183 fn effective_values(&self) -> (u64, u64, u64) {
184 let (q, nfb, starv) = self.profile.values();
185 (
186 self.quantum.unwrap_or(q),
187 self.new_flow_bonus.unwrap_or(nfb),
188 self.starvation.unwrap_or(starv),
189 )
190 }
191}
192
/// Userspace state for one loaded scx_cake instance.
struct Scheduler<'a> {
    // Loaded BPF skeleton (maps, rodata/bss, programs); borrows the
    // OpenObject owned by main() for 'a.
    skel: BpfSkel<'a>,
    // Parsed CLI arguments. Mutated at runtime in release builds, where run()
    // clears --verbose/--testing (telemetry compiled out).
    args: Args,
    // Detected system topology (LLCs, SMT siblings, P/E core masks).
    topology: topology::TopologyInfo,
    // nr_cpus × nr_cpus CPU-to-CPU latency samples; zero-filled at startup,
    // populated by the TUI Topology tab when --verbose is active.
    latency_matrix: Vec<Vec<f64>>,
    // struct_ops attachment link; Some(..) while the scheduler is registered.
    // Dropping/taking it detaches the scheduler from the kernel.
    struct_ops: Option<libbpf_rs::Link>,
}
200
201impl<'a> Scheduler<'a> {
    /// Open, configure, and load the BPF scheduler skeleton.
    ///
    /// Inlines the `scx_ops_open!` / `scx_ops_load!` macro functionality
    /// (cake's custom arena architecture prevents using the macros directly).
    /// Ordering matters: all RODATA must be populated before `load()`, and the
    /// arena must be initialized after `load()` but before struct_ops attach.
    /// On success the scheduler is loaded but NOT attached — `run()` attaches.
    ///
    /// `open_object` must outlive the returned `Scheduler` (borrowed for 'a).
    fn new(
        args: Args,
        open_object: &'a mut std::mem::MaybeUninit<libbpf_rs::OpenObject>,
    ) -> Result<Self> {
        use libbpf_rs::skel::{OpenSkel, Skel, SkelBuilder};

        // ═══ scx_ops_open! equivalent ═══
        // Matches scx_ops_open!(skel_builder, open_object, cake_ops, None)
        // Cake can't use the macro directly (custom arena architecture),
        // so we inline the critical functionality.
        scx_utils::compat::check_min_requirements()?;

        let skel_builder = BpfSkelBuilder::default();
        let mut open_skel = skel_builder
            .open(open_object)
            .context("Failed to open BPF skeleton")?;

        // Inject version suffix into ops name: "cake" → "cake_1.1.0_g<hash>_<target>"
        // This is what scx_loader reads from /sys/kernel/sched_ext/root/ops
        {
            let ops = open_skel.struct_ops.cake_ops_mut();
            let name_field = &mut ops.name;

            let version_suffix = scx_utils::build_id::ops_version_suffix(env!("CARGO_PKG_VERSION"));
            let bytes = version_suffix.as_bytes();
            let mut i = 0;
            let mut bytes_idx = 0;
            let mut found_null = false;

            // Walk to the existing NUL terminator, then overwrite from there
            // with the suffix bytes. `len() - 1` reserves the last byte so the
            // field always stays NUL-terminated, truncating the suffix if needed.
            while i < name_field.len() - 1 {
                found_null |= name_field[i] == 0;
                if !found_null {
                    // Still inside the original name — keep scanning.
                    i += 1;
                    continue;
                }

                if bytes_idx < bytes.len() {
                    name_field[i] = bytes[bytes_idx] as i8;
                    bytes_idx += 1;
                } else {
                    break;
                }
                i += 1;
            }
            // Re-terminate after the (possibly truncated) suffix.
            name_field[i] = 0;
        }

        // Read hotplug sequence number — enables kernel-requested restarts on CPU hotplug
        {
            let path = std::path::Path::new("/sys/kernel/sched_ext/hotplug_seq");
            let val = std::fs::read_to_string(path)
                .context("Failed to read /sys/kernel/sched_ext/hotplug_seq")?;
            open_skel.struct_ops.cake_ops_mut().hotplug_seq = val
                .trim()
                .parse::<u64>()
                .context("Failed to parse hotplug_seq")?;
        }

        // Honor SCX_TIMEOUT_MS environment variable (matches scx_ops_open! behavior)
        if let Ok(s) = std::env::var("SCX_TIMEOUT_MS") {
            let ms: u32 = s.parse().context("SCX_TIMEOUT_MS has invalid value")?;
            info!("Setting timeout_ms to {} based on environment", ms);
            open_skel.struct_ops.cake_ops_mut().timeout_ms = ms;
        }

        // Populate SCX enum RODATA from kernel BTF (SCX_DSQ_LOCAL_ON, SCX_KICK_PREEMPT, etc.)
        scx_utils::import_enums!(open_skel);

        // Detect system topology (CCDs, P/E cores)
        let topo = topology::detect()?;

        // Get effective values (profile + CLI overrides). Starvation is not
        // ferried here — no rodata field for it is written in this function.
        let (quantum, new_flow_bonus, _starvation) = args.effective_values();

        // Latency matrix: zeroed, populated by TUI Topology tab if --verbose
        let latency_matrix = vec![vec![0.0; topo.nr_cpus]; topo.nr_cpus];

        // Configure the scheduler via rodata (read-only data)
        if let Some(rodata) = &mut open_skel.maps.rodata_data {
            // CLI/profile values are in µs; BPF side works in ns.
            rodata.quantum_ns = quantum * 1000;
            rodata.new_flow_bonus_ns = new_flow_bonus * 1000;
            // Stats/telemetry: only available in debug builds (CAKE_RELEASE omits the field).
            // In release, --verbose is silently ignored — zero overhead for production gaming.
            #[cfg(debug_assertions)]
            {
                rodata.enable_stats = args.verbose || args.testing;
            }

            // has_hybrid removed: smt_sibling now uses pre-filled cpu_sibling_map only
            // Per-LLC DSQ partitioning: populate CPU→LLC mapping
            let llc_count = topo.llc_cpu_mask.iter().filter(|&&m| m != 0).count() as u32;
            rodata.nr_llcs = llc_count.max(1);
            rodata.nr_cpus = topo.nr_cpus.min(256) as u32; // F2: widened from 64→256 for Threadripper
            rodata.nr_phys_cpus = topo.nr_phys_cpus.min(256) as u32; // V3: PHYS_FIRST scan mask

            // Ferry explicit 64-bit topology arrays down into BPF (O(1) execution replacements)

            // Heterogeneous Gaming Topology — u64[4] arrays (F2: 256-bit masks)
            // Only word [0] is written: topology exposes single-u64 masks here,
            // so CPUs ≥ 64 are not represented in these particular masks.
            rodata.big_core_phys_mask[0] = topo.big_core_phys_mask;
            rodata.big_core_smt_mask[0] = topo.big_core_smt_mask;
            rodata.little_core_mask[0] = topo.little_core_mask;
            rodata.vcache_llc_mask[0] = topo.vcache_llc_mask;
            rodata.has_vcache = topo.has_vcache;
            rodata.has_hybrid_cores = topo.big_core_phys_mask != 0;

            // NOTE(review): unlike the llc/core loops below, this one is not
            // clamped — assumes rodata.cpu_sibling_map is at least as long as
            // the topology's map. Confirm against the BPF-side array size.
            for i in 0..topo.cpu_sibling_map.len() {
                rodata.cpu_sibling_map[i] = topo.cpu_sibling_map[i];
            }
            for i in 0..topo.llc_cpu_mask.len().min(8) {
                rodata.llc_cpu_mask[i] = topo.llc_cpu_mask[i];
            }
            for i in 0..topo.core_cpu_mask.len().min(32) {
                rodata.core_cpu_mask[i] = topo.core_cpu_mask[i];
            }

            for (i, &llc_id) in topo.cpu_llc_id.iter().enumerate() {
                rodata.cpu_llc_id[i] = llc_id as u32;
            }

            // Performance-ordered CPU arrays: read prefcore ranking from sysfs,
            // sort by performance, group SMT pairs together.
            // GAME tasks scan fast→slow, non-GAME scans slow→fast.
            {
                let nr = topo.nr_cpus.min(256);
                // Read prefcore ranking per CPU (higher = faster).
                // The sysfs node is AMD-specific; on other systems the read
                // fails and every CPU gets the equal fallback rank.
                let mut rankings: Vec<(usize, u32)> = (0..nr)
                    .map(|cpu| {
                        let path = format!(
                            "/sys/devices/system/cpu/cpu{}/cpufreq/amd_pstate_prefcore_ranking",
                            cpu
                        );
                        let rank = std::fs::read_to_string(&path)
                            .ok()
                            .and_then(|s| s.trim().parse::<u32>().ok())
                            .unwrap_or(100); // fallback: equal ranking
                        (cpu, rank)
                    })
                    .collect();

                // Sort by descending rank (fastest first), tie-break on CPU id
                // for a deterministic order.
                rankings.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));

                // Build fast→slow array with SMT pairs grouped together:
                // [best_phys, best_smt, second_phys, second_smt, ...]
                let mut fast_to_slow: Vec<u8> = Vec::with_capacity(nr);
                let mut used = vec![false; nr];
                for &(cpu, _) in &rankings {
                    if used[cpu] {
                        continue;
                    }
                    fast_to_slow.push(cpu as u8);
                    used[cpu] = true;
                    // Add SMT sibling immediately after
                    let sib = topo.cpu_sibling_map.get(cpu).copied().unwrap_or(0xFF);
                    if (sib as usize) < nr && !used[sib as usize] {
                        fast_to_slow.push(sib);
                        used[sib as usize] = true;
                    }
                }

                // Populate RODATA arrays.
                // NOTE(review): only 64 slots are filled even though nr can be
                // up to 256 (see the F2 comment above) — confirm the BPF side
                // stops at the 0xFF sentinel / 64-entry cap for large systems.
                for i in 0..64usize {
                    if i < fast_to_slow.len() {
                        rodata.cpus_fast_to_slow[i] = fast_to_slow[i];
                        // Reverse for slow→fast
                        rodata.cpus_slow_to_fast[i] = fast_to_slow[fast_to_slow.len() - 1 - i];
                    } else {
                        rodata.cpus_fast_to_slow[i] = 0xFF; // sentinel
                        rodata.cpus_slow_to_fast[i] = 0xFF;
                    }
                }

                let top_cpus: Vec<_> = fast_to_slow.iter().take(4).collect();
                info!(
                    "Core steering: fast→slow order {:?} ({} CPUs)",
                    top_cpus, nr
                );
            }

            // ═══ Per-CPU capacity table (F1 correctness fix) ═══
            // Read arch_scale_cpu_capacity from sysfs for P/E core vruntime scaling.
            // Scale: 0-1024, where 1024 = fastest core. On SMP all = 1024 → JIT folds.
            // Intel hybrid: P-cores ~1024, E-cores ~600-700.
            // AMD SMP: all 1024 → cap > 0 && cap < 1024 is always false → zero overhead.
            {
                let nr = topo.nr_cpus.min(256);
                let mut all_equal = true;
                let mut first_cap: u32 = 0;

                for cpu in 0..nr {
                    let path = format!("/sys/devices/system/cpu/cpu{}/cpu_capacity", cpu);
                    let cap = std::fs::read_to_string(&path)
                        .ok()
                        .and_then(|s| s.trim().parse::<u32>().ok())
                        .unwrap_or(1024); // missing/unparsable sysfs node → full capacity

                    rodata.cpuperf_cap_table[cpu] = cap;

                    if cpu == 0 {
                        first_cap = cap;
                    } else if cap != first_cap {
                        all_equal = false;
                    }
                }

                if !all_equal {
                    info!(
                        "Capacity scaling: heterogeneous (P/E cores, range {}-{})",
                        rodata.cpuperf_cap_table[..nr].iter().min().unwrap_or(&0),
                        rodata.cpuperf_cap_table[..nr].iter().max().unwrap_or(&1024)
                    );
                }
            }

            // Arena library: nr_cpu_ids must be set before load() — arena_init
            // checks this and returns -ENODEV (errno 19) if uninitialized.
            rodata.nr_cpu_ids = *NR_CPU_IDS as u32;

            // ═══ Audio stack detection ═══
            // Phase 1: Core audio daemons by comm name.
            // Phase 2: PipeWire socket clients (mixers like goxlr-daemon).
            // Both are session-persistent → bake into RODATA.
            {
                use std::collections::HashSet;

                const AUDIO_COMMS: &[&str] = &[
                    "pipewire",
                    "wireplumber",
                    "pipewire-pulse",
                    "pulseaudio",
                    "jackd",
                    "jackdbus",
                ];
                let mut audio_tgids: Vec<u32> = Vec::new();
                // Dedup set mirroring audio_tgids (insert() returns false on repeat).
                let mut audio_tgid_set: HashSet<u32> = HashSet::new();

                // Phase 1: comm-based detection
                if let Ok(entries) = std::fs::read_dir("/proc") {
                    for entry in entries.flatten() {
                        let name = entry.file_name();
                        let name_str = name.to_string_lossy();
                        // Only numeric /proc entries are processes.
                        if !name_str.chars().all(|c| c.is_ascii_digit()) {
                            continue;
                        }
                        let pid: u32 = match name_str.parse() {
                            Ok(p) => p,
                            Err(_) => continue,
                        };
                        if let Ok(comm) = std::fs::read_to_string(format!("/proc/{}/comm", pid)) {
                            let comm = comm.trim();
                            if AUDIO_COMMS.contains(&comm) && audio_tgid_set.insert(pid) {
                                audio_tgids.push(pid);
                            }
                        }
                    }
                }

                // Phase 2: PipeWire socket client detection.
                // Scan /proc/net/unix for pipewire-0 socket inodes, then find
                // processes with fds pointing to those inodes. This catches any
                // audio mixer daemon (goxlr-daemon, easyeffects, etc.) without
                // brittle comm lists.
                let core_count = audio_tgids.len();
                'pw_detect: {
                    let uid = unsafe { libc::getuid() };
                    let pw_socket_path = format!("/run/user/{}/pipewire-0", uid);

                    // Collect inodes for the PipeWire socket
                    let unix_content = match std::fs::read_to_string("/proc/net/unix") {
                        Ok(c) => c,
                        Err(_) => break 'pw_detect,
                    };
                    let mut pw_inodes: HashSet<u64> = HashSet::new();
                    for line in unix_content.lines().skip(1) {
                        if line.ends_with(&pw_socket_path)
                            || line.contains(&format!("{} ", pw_socket_path))
                        {
                            // Format: Num RefCount Protocol Flags Type St Inode Path
                            let fields: Vec<&str> = line.split_whitespace().collect();
                            if fields.len() >= 7 {
                                if let Ok(inode) = fields[6].parse::<u64>() {
                                    if inode > 0 {
                                        pw_inodes.insert(inode);
                                    }
                                }
                            }
                        }
                    }
                    if pw_inodes.is_empty() {
                        break 'pw_detect;
                    }

                    // Scan /proc/*/fd for socket links matching PipeWire inodes.
                    // Only check thread-group leaders (dirs in /proc with numeric names).
                    if let Ok(proc_entries) = std::fs::read_dir("/proc") {
                        for entry in proc_entries.flatten() {
                            // Cap the combined list at 8 entries.
                            if audio_tgids.len() >= 8 {
                                break;
                            }
                            let name = entry.file_name();
                            let name_str = name.to_string_lossy();
                            if !name_str.chars().all(|c| c.is_ascii_digit()) {
                                continue;
                            }
                            let pid: u32 = match name_str.parse() {
                                Ok(p) => p,
                                Err(_) => continue,
                            };
                            // Skip PIDs already detected as core audio
                            if audio_tgid_set.contains(&pid) {
                                continue;
                            }
                            let fd_dir = format!("/proc/{}/fd", pid);
                            let fd_entries = match std::fs::read_dir(&fd_dir) {
                                Ok(e) => e,
                                // Permission denied or process exited — skip.
                                Err(_) => continue,
                            };
                            for fd_entry in fd_entries.flatten() {
                                if let Ok(link) = std::fs::read_link(fd_entry.path()) {
                                    let link_str = link.to_string_lossy();
                                    // Socket links look like "socket:[12345]"
                                    if let Some(inode_str) = link_str
                                        .strip_prefix("socket:[")
                                        .and_then(|s| s.strip_suffix(']'))
                                    {
                                        if let Ok(inode) = inode_str.parse::<u64>() {
                                            if pw_inodes.contains(&inode) {
                                                if audio_tgid_set.insert(pid) {
                                                    audio_tgids.push(pid);
                                                }
                                                break; // Found one match, move to next PID
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                rodata.nr_audio_tgids = audio_tgids.len() as u32;
                // NOTE(review): phase 1 above is uncapped — if more than the
                // rodata array's slots worth of audio daemons exist, this index
                // would go out of bounds. Confirm rodata.audio_tgids capacity
                // (the phase-2 cap of 8 suggests 8 slots).
                for (i, &tgid) in audio_tgids.iter().enumerate() {
                    rodata.audio_tgids[i] = tgid;
                }
                // Entries beyond core_count were added by phase 2 (socket clients).
                let client_count = audio_tgids.len() - core_count;
                if !audio_tgids.is_empty() {
                    info!(
                        "Audio stack detected: {} daemons{} (TGIDs: {:?})",
                        audio_tgids.len(),
                        if client_count > 0 {
                            format!(
                                ", {} PipeWire client{}",
                                client_count,
                                if client_count == 1 { "" } else { "s" }
                            )
                        } else {
                            String::new()
                        },
                        audio_tgids
                    );
                }
            }

            // ═══ Compositor detection ═══
            // Wayland compositors present every frame to the display.
            // Session-persistent → bake into RODATA.
            {
                const COMPOSITOR_COMMS: &[&str] = &[
                    "kwin_wayland",
                    "kwin_x11",
                    "mutter",
                    "gnome-shell",
                    "sway",
                    "Hyprland",
                    "weston",
                    "labwc",
                    "wayfire",
                    "river",
                    "gamescope",
                ];
                let mut compositor_tgids: Vec<u32> = Vec::new();
                if let Ok(entries) = std::fs::read_dir("/proc") {
                    for entry in entries.flatten() {
                        let name = entry.file_name();
                        let name_str = name.to_string_lossy();
                        if !name_str.chars().all(|c| c.is_ascii_digit()) {
                            continue;
                        }
                        let pid: u32 = match name_str.parse() {
                            Ok(p) => p,
                            Err(_) => continue,
                        };
                        if let Ok(comm) = std::fs::read_to_string(format!("/proc/{}/comm", pid)) {
                            let comm = comm.trim();
                            if COMPOSITOR_COMMS.contains(&comm) {
                                compositor_tgids.push(pid);
                                // Cap at 4 entries (matches BPF-side array use below).
                                if compositor_tgids.len() >= 4 {
                                    break;
                                }
                            }
                        }
                    }
                }
                rodata.nr_compositor_tgids = compositor_tgids.len() as u32;
                for (i, &tgid) in compositor_tgids.iter().enumerate() {
                    rodata.compositor_tgids[i] = tgid;
                }
                if !compositor_tgids.is_empty() {
                    info!(
                        "Compositor detected: {} (TGIDs: {:?})",
                        compositor_tgids.len(),
                        compositor_tgids
                    );
                }
            }
        }

        // ═══ scx_ops_load! equivalent ═══
        // Set UEI dump buffer size before load (matches scx_ops_load! behavior)
        scx_utils::uei_set_size!(open_skel, cake_ops, uei);

        let mut skel = open_skel.load().context("Failed to load BPF program")?;

        // Initialize the BPF arena library.
        // Must happen after load() (BPF maps are now live) but before attach_struct_ops()
        // (scheduler not yet running, so init_task hasn't fired yet).
        // ArenaLib::setup() runs SEC("syscall") probes:
        // 1. arena_init: allocates static pages, inits task stack allocator
        // 2. arena_topology_node_init: registers topology nodes for arena traversal
        let task_ctx_size = std::mem::size_of::<bpf_intf::cake_task_ctx>();
        let arena = ArenaLib::init(skel.object_mut(), task_ctx_size, topo.nr_cpus)
            .context("Failed to create ArenaLib")?;
        arena.setup().context("Failed to initialize BPF arena")?;
        info!(
            "BPF arena initialized (task_ctx_size={}B, nr_cpus={})",
            task_ctx_size, topo.nr_cpus
        );

        // Set initial BSS values before attach (zero-init'd in BPF for BSS placement).
        // quantum_ceiling_ns: default IDLE/GAMING → 2ms. TUI updates at ~2Hz.
        if let Some(bss) = &mut skel.maps.bss_data {
            bss.quantum_ceiling_ns = 2_000_000; // AQ_BULK_CEILING_NS
        }

        Ok(Self {
            skel,
            args,
            topology: topo,
            latency_matrix,
            struct_ops: None,
        })
    }
654
    /// Attach the scheduler and run the selected frontend (testing / TUI /
    /// headless) until shutdown is requested or the BPF side exits.
    ///
    /// Returns the `UserExitInfo` reported by the kernel so the caller can
    /// check `should_restart()` (e.g. kernel-requested restart on CPU hotplug).
    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        use libbpf_rs::skel::Skel;

        // ═══ scx_ops_attach! equivalent ═══
        // Guard: prevent loading if another sched_ext scheduler is already active
        if scx_utils::compat::is_sched_ext_enabled().unwrap_or(false) {
            anyhow::bail!("another sched_ext scheduler is already running");
        }

        // Attach non-struct_ops BPF programs first, then struct_ops
        self.skel
            .attach()
            .context("Failed to attach non-struct_ops BPF programs")?;
        self.struct_ops = Some(
            self.skel
                .maps
                .cake_ops
                .attach_struct_ops()
                .context("Failed to attach struct_ops BPF programs")?,
        );

        // Release builds: --verbose and --testing are unavailable (stats compiled out).
        // Warn early so user knows these flags require a debug build.
        // Clearing the flags here routes execution into the headless branch below.
        #[cfg(not(debug_assertions))]
        if self.args.verbose || self.args.testing {
            warn!("--verbose and --testing require a debug build (telemetry is compiled out in release).");
            warn!("Rebuild without --release: cargo build -p scx_cake");
            self.args.verbose = false;
            self.args.testing = false;
        }

        // Standard startup banner: follows scx_cosmos/scx_bpfland convention
        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if self.topology.smt_enabled {
                "SMT on"
            } else {
                "SMT off"
            }
        );

        // Print command line.
        info!(
            "scheduler options: {}",
            std::env::args().collect::<Vec<_>>().join(" ")
        );

        info!(
            "{} CPUs, {} LLCs, profile: {:?}",
            self.topology.nr_cpus,
            self.topology
                .llc_cpu_mask
                .iter()
                .filter(|&&m| m != 0)
                .count()
                .max(1),
            self.args.profile
        );
        if self.args.testing {
            // Benchmark mode: sample dispatch counters, wait ~10s, sample
            // again, and print a one-line JSON summary to stdout.
            info!("Running in benchmarking mode for 10 seconds...");
            std::thread::sleep(std::time::Duration::from_secs(1)); // Warmup

            // NOTE(review): unwrap assumes bss_data is always present once
            // loaded — confirm (this branch is only reachable in debug builds).
            let mut start_dispatches = 0u64;
            for cpu in 0..self.topology.nr_cpus {
                let stats = &self.skel.maps.bss_data.as_ref().unwrap().global_stats[cpu];
                start_dispatches += stats.nr_new_flow_dispatches + stats.nr_old_flow_dispatches;
            }

            let start_time = std::time::Instant::now();
            let mut elapsed = 0;
            // 1s granularity so SIGINT/SIGTERM can cut the window short.
            while elapsed < 10 && !shutdown.load(Ordering::Relaxed) {
                std::thread::sleep(std::time::Duration::from_secs(1));
                elapsed += 1;
            }
            // Use the measured wall-clock duration (may be < 10s on early exit).
            let duration = start_time.elapsed().as_secs_f64();

            let mut end_dispatches = 0u64;
            for cpu in 0..self.topology.nr_cpus {
                let stats = &self.skel.maps.bss_data.as_ref().unwrap().global_stats[cpu];
                end_dispatches += stats.nr_new_flow_dispatches + stats.nr_old_flow_dispatches;
            }

            let delta = end_dispatches.saturating_sub(start_dispatches);
            let throughput = delta as f64 / duration;
            println!("{{\"duration_sec\": {:.2}, \"total_dispatches\": {}, \"dispatches_per_sec\": {:.2}}}",
                duration, delta, throughput);

            shutdown.store(true, Ordering::Relaxed);
        } else if self.args.verbose && std::io::stdout().is_terminal() {
            // Run TUI mode
            tui::run_tui(
                &mut self.skel,
                shutdown.clone(),
                self.args.interval,
                self.topology.clone(),
                self.latency_matrix.clone(),
            )?;
        } else {
            if self.args.verbose && !std::io::stdout().is_terminal() {
                warn!("TUI disabled: no terminal detected (headless mode)");
            }
            // Simple headless mode: matches cosmos/bpfland pattern exactly.
            // ctrlc handler sets shutdown on SIGINT/SIGTERM.
            // 1-second sleep + UEI check = responsive shutdown.
            while !shutdown.load(Ordering::Relaxed) {
                std::thread::sleep(std::time::Duration::from_secs(1));
                if scx_utils::uei_exited!(&self.skel, uei) {
                    break;
                }
            }
        }

        info!("{SCHEDULER_NAME} scheduler shutting down");

        // Drop struct_ops link BEFORE uei_report — this triggers the kernel to
        // set UEI kind=SCX_EXIT_UNREG. Matches scx_bpfland/scx_cosmos/scx_lavd
        // pattern: `let _ = self.struct_ops.take(); uei_report!(...)`
        let _ = self.struct_ops.take();

        // Standard UEI exit report — returns UserExitInfo for should_restart().
        scx_utils::uei_report!(&self.skel, uei)
    }
778 }
779}
780
impl Drop for Scheduler<'_> {
    // Teardown only logs: the struct_ops link (if run() has not already taken
    // it) is detached by its own Drop, and the BPF skeleton cleans itself up.
    fn drop(&mut self) {
        info!("Unregister {SCHEDULER_NAME} scheduler");
    }
}
786
787fn main() -> Result<()> {
788 env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
789
790 let args = Args::parse();
791
792 // Handle --version before anything else (matches cosmos/bpfland)
793 if args.version {
794 println!(
795 "{} {}",
796 SCHEDULER_NAME,
797 build_id::full_version(env!("CARGO_PKG_VERSION"))
798 );
799 return Ok(());
800 }
801
802 // Set up signal handler: ctrlc handles both SIGINT and SIGTERM on Linux.
803 // This is the same pattern cosmos/bpfland use — no SigSet blocking or
804 // SignalFd complexity needed.
805 let shutdown = Arc::new(AtomicBool::new(false));
806 let shutdown_clone = shutdown.clone();
807
808 ctrlc::set_handler(move || {
809 info!("Received shutdown signal");
810 shutdown_clone.store(true, Ordering::Relaxed);
811 })?;
812
813 // Create open object for BPF - needs to outlive scheduler
814 let mut open_object = std::mem::MaybeUninit::uninit();
815
816 // Restart loop: matches cosmos/bpfland pattern.
817 // Kernel can request restart via UEI (e.g., CPU hotplug).
818 loop {
819 let mut scheduler = Scheduler::new(args.clone(), &mut open_object)?;
820 if !scheduler.run(shutdown.clone())?.should_restart() {
821 break;
822 }
823 }
824
825 Ok(())
826}