// scx_cake — main.rs
1// SPDX-License-Identifier: GPL-2.0
2// scx_cake - sched_ext scheduler applying CAKE bufferbloat concepts to CPU scheduling
3
4mod topology;
5mod tui;
6
7use core::sync::atomic::Ordering;
8use std::io::IsTerminal;
9
10use std::sync::atomic::AtomicBool;
11use std::sync::Arc;
12
13use anyhow::{Context, Result};
14use clap::{Parser, ValueEnum};
15use log::{info, warn};
16
17use scx_arena::ArenaLib;
18use scx_utils::build_id;
19use scx_utils::UserExitInfo;
20use scx_utils::NR_CPU_IDS;
// Include the generated interface bindings (shared struct layouts between
// userspace and the BPF side; produced by the build script into OUT_DIR).
#[allow(non_camel_case_types, non_upper_case_globals, dead_code)]
mod bpf_intf {
    include!(concat!(env!("OUT_DIR"), "/bpf_intf.rs"));
}
26
// Include the generated BPF skeleton (libbpf-rs skeleton produced by the
// build script into OUT_DIR; provides BpfSkel/BpfSkelBuilder used below).
#[allow(non_camel_case_types, non_upper_case_globals, dead_code)]
mod bpf_skel {
    include!(concat!(env!("OUT_DIR"), "/bpf_skel.rs"));
}
32use bpf_skel::*;
33
/// Canonical scheduler name used in the startup banner/log lines.
const SCHEDULER_NAME: &str = "scx_cake";
35
/// Scheduler profile presets
// NOTE: the `///` comments on each variant are surfaced by clap's ValueEnum
// derive as the per-value help text, i.e. they are user-visible strings —
// edit with care. Tuning numbers live in Profile::values() below.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub enum Profile {
    /// Ultra-low-latency for competitive esports (1ms quantum)
    Esports,
    /// Optimized for older/lower-power hardware (4ms quantum)
    Legacy,
    /// Low-latency profile optimized for gaming and interactive workloads
    Gaming,
    /// Balanced profile for general desktop use (same as gaming for now)
    Default,
    /// Power-efficient profile for handhelds/laptops on battery (DVFS enabled)
    Battery,
}
50
51impl Profile {
52    /// Returns (quantum_us, new_flow_bonus_us, starvation_us)
53    fn values(&self) -> (u64, u64, u64) {
54        match self {
55            // Esports: Ultra-aggressive, 1ms quantum for maximum responsiveness
56            Profile::Esports => (1000, 4000, 50000),
57            // Legacy: High efficiency, 4ms quantum to reduce overhead on older CPUs
58            Profile::Legacy => (4000, 12000, 200000),
59            // Gaming: Aggressive latency, 2ms quantum
60            Profile::Gaming => (2000, 8000, 100000),
61            // Default: Same as gaming for now
62            Profile::Default => (2000, 8000, 100000),
63            // Battery: 4ms quantum — fewer context switches = less power
64            Profile::Battery => (4000, 12000, 200000),
65        }
66    }
67
68    // DVFS — disabled (tick architecture removed, no runtime effect).
69    // RODATA symbols retained in BPF for loader compat; JIT eliminates.
70}
71
/// 🍰 scx_cake: A sched_ext scheduler applying CAKE bufferbloat concepts
///
/// This scheduler adapts CAKE's DRR++ (Deficit Round Robin++) algorithm
/// for CPU scheduling, providing low-latency scheduling for gaming and
/// interactive workloads while maintaining fairness.
///
/// PROFILES set all tuning parameters at once. Individual options override profile defaults.
///
/// 4-CLASS SYSTEM (classified by PELT utilization + game family detection):
///   GAME:    game process tree + audio + compositor (during GAMING)
///   NORMAL:  default class — interactive desktop tasks
///   HOG:     high PELT utilization (≥78% CPU) non-game tasks
///   BG:      low PELT utilization non-game tasks during GAMING
///
/// EXAMPLES:
///   scx_cake                          # Run with gaming profile (default)
///   scx_cake -p esports               # Ultra-low-latency for competitive play
///   scx_cake --quantum 1500           # Gaming profile with custom quantum
///   scx_cake -v                       # Run with live TUI stats display
#[derive(Parser, Debug, Clone)]
#[command(
    author,
    version,
    disable_version_flag = true,
    about = "🍰 A sched_ext scheduler applying CAKE bufferbloat concepts to CPU scheduling",
    verbatim_doc_comment
)]
// NOTE: every `///` comment in this struct (including the block above) is
// rendered verbatim into `--help` output (verbatim_doc_comment), so those
// lines are user-visible strings — edit with care. Profile defaults quoted
// in the help text must stay in sync with Profile::values().
struct Args {
    /// Scheduler profile preset.
    ///
    /// Profiles configure all tier thresholds, quantum multipliers, and wait budgets.
    /// Individual CLI options (--quantum, etc.) override profile values.
    ///
    /// ESPORTS: Ultra-low-latency for competitive gaming.
    ///   - Quantum: 1000µs, Starvation: 50ms
    ///
    /// LEGACY: Optimized for older/lower-power hardware.
    ///   - Quantum: 4000µs, Starvation: 200ms
    ///
    /// GAMING: Optimized for low-latency gaming and interactive workloads.
    ///   - Quantum: 2000µs, Starvation: 100ms
    ///
    /// DEFAULT: Balanced profile for general desktop use.
    ///   - Currently same as gaming; will diverge in future versions
    ///
    /// BATTERY: Power-efficient for handhelds/laptops on battery.
    ///   - Quantum: 4000µs, reduced context switch overhead
    #[arg(long, short, value_enum, default_value_t = Profile::Gaming, verbatim_doc_comment)]
    profile: Profile,

    // None = "use the profile's value"; see Args::effective_values().
    /// Base scheduling time slice in MICROSECONDS [default: 2000].
    ///
    /// How long a task runs before potentially yielding.
    ///
    /// Smaller quantum = more responsive but higher overhead.
    /// Esports: 1000µs | Gaming: 2000µs | Legacy: 4000µs
    /// Recommended range: 1000-8000µs
    #[arg(long, verbatim_doc_comment)]
    quantum: Option<u64>,

    /// Bonus time for newly woken tasks in MICROSECONDS [default: 8000].
    ///
    /// Tasks waking from sleep get this extra time added to their deficit,
    /// allowing them to run longer on first dispatch. Helps bursty workloads.
    ///
    /// Esports: 4000µs | Gaming: 8000µs
    /// Recommended range: 4000-16000µs
    #[arg(long, verbatim_doc_comment)]
    new_flow_bonus: Option<u64>,

    /// Max run time before forced preemption in MICROSECONDS [default: 100000].
    ///
    /// Safety limit: tasks running longer than this are forcibly preempted.
    /// Prevents any single task from monopolizing the CPU.
    ///
    /// Esports: 50000µs (50ms) | Gaming: 100000µs (100ms) | Legacy: 200000µs (200ms)
    /// Recommended range: 50000-200000µs
    #[arg(long, verbatim_doc_comment)]
    starvation: Option<u64>,

    // Only honored on debug builds; release builds warn and clear it (see run()).
    /// Enable live TUI (Terminal User Interface) with real-time statistics.
    ///
    /// Shows dispatch counts per tier, tier transitions,
    /// wait time stats, and system topology information.
    /// Press 'q' to exit TUI mode.
    #[arg(long, short, verbatim_doc_comment)]
    verbose: bool,

    /// Statistics refresh interval in SECONDS (only with --verbose).
    ///
    /// How often the TUI updates. Lower values = more responsive but
    /// higher overhead. Has no effect without --verbose.
    ///
    /// Default: 1 second
    #[arg(long, default_value_t = 1, verbatim_doc_comment)]
    interval: u64,

    /// Live in-kernel testing mode for automated benchmarking.
    ///
    /// Runs the scheduler for 10 seconds, collects BPF data points,
    /// and prints a structured JSON output to stdout.
    #[arg(long, verbatim_doc_comment)]
    testing: bool,

    // clap's built-in version flag is disabled above (disable_version_flag),
    // so -V/--version is handled manually via this flag.
    /// Print scheduler version and exit.
    #[arg(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,
}
180
181impl Args {
182    /// Get effective values (profile defaults with CLI overrides applied)
183    fn effective_values(&self) -> (u64, u64, u64) {
184        let (q, nfb, starv) = self.profile.values();
185        (
186            self.quantum.unwrap_or(q),
187            self.new_flow_bonus.unwrap_or(nfb),
188            self.starvation.unwrap_or(starv),
189        )
190    }
191}
192
/// Userspace side of the scheduler: owns the loaded BPF skeleton plus the
/// state needed by the TUI and testing modes.
struct Scheduler<'a> {
    // Loaded BPF skeleton (opened/configured/loaded in new(), attached in run()).
    skel: BpfSkel<'a>,
    // Parsed CLI arguments (profile preset + individual overrides).
    args: Args,
    // Detected CPU topology (LLC masks, SMT sibling map, P/E core masks).
    topology: topology::TopologyInfo,
    // Inter-CPU latency matrix; zero-initialized in new(), populated by the
    // TUI Topology tab when --verbose is active.
    latency_matrix: Vec<Vec<f64>>,
    // Keeps the struct_ops attachment alive; None until run() attaches it.
    struct_ops: Option<libbpf_rs::Link>,
}
200
201impl<'a> Scheduler<'a> {
    /// Open, configure, and load the BPF scheduler, returning a ready-to-attach
    /// `Scheduler`.
    ///
    /// Sequence (order matters):
    ///   1. compat checks + skeleton open (`scx_ops_open!` equivalent, inlined)
    ///   2. inject version suffix into the struct_ops name
    ///   3. plumb hotplug_seq and SCX_TIMEOUT_MS into the open skeleton
    ///   4. detect topology and bake tuning values + topology arrays,
    ///      audio/compositor TGIDs into RODATA
    ///   5. load the skeleton, then initialize the BPF arena
    ///
    /// Does NOT attach struct_ops — that happens in `run()`.
    ///
    /// # Errors
    /// Fails if kernel requirements are unmet, the skeleton can't be
    /// opened/loaded, sysfs reads fail, or arena setup fails.
    fn new(
        args: Args,
        open_object: &'a mut std::mem::MaybeUninit<libbpf_rs::OpenObject>,
    ) -> Result<Self> {
        use libbpf_rs::skel::{OpenSkel, Skel, SkelBuilder};

        // ═══ scx_ops_open! equivalent ═══
        // Matches scx_ops_open!(skel_builder, open_object, cake_ops, None)
        // Cake can't use the macro directly (custom arena architecture),
        // so we inline the critical functionality.
        scx_utils::compat::check_min_requirements()?;

        let skel_builder = BpfSkelBuilder::default();
        let mut open_skel = skel_builder
            .open(open_object)
            .context("Failed to open BPF skeleton")?;

        // Inject version suffix into ops name: "cake" → "cake_1.1.0_g<hash>_<target>"
        // This is what scx_loader reads from /sys/kernel/sched_ext/root/ops
        {
            let ops = open_skel.struct_ops.cake_ops_mut();
            let name_field = &mut ops.name;

            let version_suffix = scx_utils::build_id::ops_version_suffix(env!("CARGO_PKG_VERSION"));
            let bytes = version_suffix.as_bytes();
            let mut i = 0;
            let mut bytes_idx = 0;
            let mut found_null = false;

            // Walk to the first NUL in the existing name, then append suffix
            // bytes from there. The loop bound (len - 1) always reserves the
            // final slot for the terminating NUL written after the loop.
            while i < name_field.len() - 1 {
                found_null |= name_field[i] == 0;
                if !found_null {
                    i += 1;
                    continue;
                }

                if bytes_idx < bytes.len() {
                    name_field[i] = bytes[bytes_idx] as i8;
                    bytes_idx += 1;
                } else {
                    break;
                }
                i += 1;
            }
            name_field[i] = 0;
        }

        // Read hotplug sequence number — enables kernel-requested restarts on CPU hotplug
        {
            let path = std::path::Path::new("/sys/kernel/sched_ext/hotplug_seq");
            let val = std::fs::read_to_string(path)
                .context("Failed to read /sys/kernel/sched_ext/hotplug_seq")?;
            open_skel.struct_ops.cake_ops_mut().hotplug_seq = val
                .trim()
                .parse::<u64>()
                .context("Failed to parse hotplug_seq")?;
        }

        // Honor SCX_TIMEOUT_MS environment variable (matches scx_ops_open! behavior)
        if let Ok(s) = std::env::var("SCX_TIMEOUT_MS") {
            let ms: u32 = s.parse().context("SCX_TIMEOUT_MS has invalid value")?;
            info!("Setting timeout_ms to {} based on environment", ms);
            open_skel.struct_ops.cake_ops_mut().timeout_ms = ms;
        }

        // Populate SCX enum RODATA from kernel BTF (SCX_DSQ_LOCAL_ON, SCX_KICK_PREEMPT, etc.)
        scx_utils::import_enums!(open_skel);

        // Detect system topology (CCDs, P/E cores)
        let topo = topology::detect()?;

        // Get effective values (profile + CLI overrides)
        // (starvation is unused here — presumably consumed by the BPF side or
        // elsewhere; TODO confirm.)
        let (quantum, new_flow_bonus, _starvation) = args.effective_values();

        // Latency matrix: zeroed, populated by TUI Topology tab if --verbose
        let latency_matrix = vec![vec![0.0; topo.nr_cpus]; topo.nr_cpus];

        // Configure the scheduler via rodata (read-only data)
        if let Some(rodata) = &mut open_skel.maps.rodata_data {
            // CLI/profile values are in µs; BPF consumes ns.
            rodata.quantum_ns = quantum * 1000;
            rodata.new_flow_bonus_ns = new_flow_bonus * 1000;
            // Stats/telemetry: only available in debug builds (CAKE_RELEASE omits the field).
            // In release, --verbose is silently ignored — zero overhead for production gaming.
            #[cfg(debug_assertions)]
            {
                rodata.enable_stats = args.verbose || args.testing;
            }

            // has_hybrid removed: smt_sibling now uses pre-filled cpu_sibling_map only
            // Per-LLC DSQ partitioning: populate CPU→LLC mapping
            let llc_count = topo.llc_cpu_mask.iter().filter(|&&m| m != 0).count() as u32;
            rodata.nr_llcs = llc_count.max(1);
            rodata.nr_cpus = topo.nr_cpus.min(256) as u32; // F2: widened from 64→256 for Threadripper
            rodata.nr_phys_cpus = topo.nr_phys_cpus.min(256) as u32; // V3: PHYS_FIRST scan mask

            // Ferry explicit 64-bit topology arrays down into BPF (O(1) execution replacements)

            // Heterogeneous Gaming Topology — u64[4] arrays (F2: 256-bit masks)
            // NOTE(review): only word [0] of each 4-word mask is populated here,
            // i.e. CPUs 0-63 — confirm whether topology should fill words 1-3
            // on >64-CPU systems.
            rodata.big_core_phys_mask[0] = topo.big_core_phys_mask;
            rodata.big_core_smt_mask[0] = topo.big_core_smt_mask;
            rodata.little_core_mask[0] = topo.little_core_mask;
            rodata.vcache_llc_mask[0] = topo.vcache_llc_mask;
            rodata.has_vcache = topo.has_vcache;
            rodata.has_hybrid_cores = topo.big_core_phys_mask != 0;

            for i in 0..topo.cpu_sibling_map.len() {
                rodata.cpu_sibling_map[i] = topo.cpu_sibling_map[i];
            }
            for i in 0..topo.llc_cpu_mask.len().min(8) {
                rodata.llc_cpu_mask[i] = topo.llc_cpu_mask[i];
            }
            for i in 0..topo.core_cpu_mask.len().min(32) {
                rodata.core_cpu_mask[i] = topo.core_cpu_mask[i];
            }

            for (i, &llc_id) in topo.cpu_llc_id.iter().enumerate() {
                rodata.cpu_llc_id[i] = llc_id as u32;
            }

            // Performance-ordered CPU arrays: read prefcore ranking from sysfs,
            // sort by performance, group SMT pairs together.
            // GAME tasks scan fast→slow, non-GAME scans slow→fast.
            {
                let nr = topo.nr_cpus.min(256);
                // Read prefcore ranking per CPU (higher = faster)
                let mut rankings: Vec<(usize, u32)> = (0..nr)
                    .map(|cpu| {
                        let path = format!(
                            "/sys/devices/system/cpu/cpu{}/cpufreq/amd_pstate_prefcore_ranking",
                            cpu
                        );
                        let rank = std::fs::read_to_string(&path)
                            .ok()
                            .and_then(|s| s.trim().parse::<u32>().ok())
                            .unwrap_or(100); // fallback: equal ranking
                        (cpu, rank)
                    })
                    .collect();

                // Sort by descending rank (fastest first), stable for SMT grouping
                rankings.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));

                // Build fast→slow array with SMT pairs grouped together:
                // [best_phys, best_smt, second_phys, second_smt, ...]
                let mut fast_to_slow: Vec<u8> = Vec::with_capacity(nr);
                let mut used = vec![false; nr];
                for &(cpu, _) in &rankings {
                    if used[cpu] {
                        continue;
                    }
                    fast_to_slow.push(cpu as u8);
                    used[cpu] = true;
                    // Add SMT sibling immediately after
                    let sib = topo.cpu_sibling_map.get(cpu).copied().unwrap_or(0xFF);
                    if (sib as usize) < nr && !used[sib as usize] {
                        fast_to_slow.push(sib);
                        used[sib as usize] = true;
                    }
                }

                // Populate RODATA arrays
                // NOTE(review): only the first 64 slots are filled although nr
                // can reach 256 (nr_cpus widened above) — confirm the length of
                // cpus_fast_to_slow/cpus_slow_to_fast and whether CPUs ≥ 64
                // should participate in the scan order.
                for i in 0..64usize {
                    if i < fast_to_slow.len() {
                        rodata.cpus_fast_to_slow[i] = fast_to_slow[i];
                        // Reverse for slow→fast
                        rodata.cpus_slow_to_fast[i] = fast_to_slow[fast_to_slow.len() - 1 - i];
                    } else {
                        rodata.cpus_fast_to_slow[i] = 0xFF; // sentinel
                        rodata.cpus_slow_to_fast[i] = 0xFF;
                    }
                }

                let top_cpus: Vec<_> = fast_to_slow.iter().take(4).collect();
                info!(
                    "Core steering: fast→slow order {:?} ({} CPUs)",
                    top_cpus, nr
                );
            }

            // ═══ Per-CPU capacity table (F1 correctness fix) ═══
            // Read arch_scale_cpu_capacity from sysfs for P/E core vruntime scaling.
            // Scale: 0-1024, where 1024 = fastest core. On SMP all = 1024 → JIT folds.
            // Intel hybrid: P-cores ~1024, E-cores ~600-700.
            // AMD SMP: all 1024 → cap > 0 && cap < 1024 is always false → zero overhead.
            {
                let nr = topo.nr_cpus.min(256);
                let mut all_equal = true;
                let mut first_cap: u32 = 0;

                for cpu in 0..nr {
                    let path = format!("/sys/devices/system/cpu/cpu{}/cpu_capacity", cpu);
                    let cap = std::fs::read_to_string(&path)
                        .ok()
                        .and_then(|s| s.trim().parse::<u32>().ok())
                        .unwrap_or(1024);

                    rodata.cpuperf_cap_table[cpu] = cap;

                    if cpu == 0 {
                        first_cap = cap;
                    } else if cap != first_cap {
                        all_equal = false;
                    }
                }

                if !all_equal {
                    info!(
                        "Capacity scaling: heterogeneous (P/E cores, range {}-{})",
                        rodata.cpuperf_cap_table[..nr].iter().min().unwrap_or(&0),
                        rodata.cpuperf_cap_table[..nr].iter().max().unwrap_or(&1024)
                    );
                }
            }

            // Arena library: nr_cpu_ids must be set before load() — arena_init
            // checks this and returns -ENODEV (errno 19) if uninitialized.
            rodata.nr_cpu_ids = *NR_CPU_IDS as u32;

            // ═══ Audio stack detection ═══
            // Phase 1: Core audio daemons by comm name.
            // Phase 2: PipeWire socket clients (mixers like goxlr-daemon).
            // Both are session-persistent → bake into RODATA.
            {
                use std::collections::HashSet;

                const AUDIO_COMMS: &[&str] = &[
                    "pipewire",
                    "wireplumber",
                    "pipewire-pulse",
                    "pulseaudio",
                    "jackd",
                    "jackdbus",
                ];
                let mut audio_tgids: Vec<u32> = Vec::new();
                let mut audio_tgid_set: HashSet<u32> = HashSet::new();

                // Phase 1: comm-based detection
                if let Ok(entries) = std::fs::read_dir("/proc") {
                    for entry in entries.flatten() {
                        let name = entry.file_name();
                        let name_str = name.to_string_lossy();
                        // Numeric directory names in /proc are PIDs.
                        if !name_str.chars().all(|c| c.is_ascii_digit()) {
                            continue;
                        }
                        let pid: u32 = match name_str.parse() {
                            Ok(p) => p,
                            Err(_) => continue,
                        };
                        if let Ok(comm) = std::fs::read_to_string(format!("/proc/{}/comm", pid)) {
                            let comm = comm.trim();
                            if AUDIO_COMMS.contains(&comm) && audio_tgid_set.insert(pid) {
                                audio_tgids.push(pid);
                            }
                        }
                    }
                }

                // Phase 2: PipeWire socket client detection.
                // Scan /proc/net/unix for pipewire-0 socket inodes, then find
                // processes with fds pointing to those inodes. This catches any
                // audio mixer daemon (goxlr-daemon, easyeffects, etc.) without
                // brittle comm lists.
                let core_count = audio_tgids.len();
                'pw_detect: {
                    let uid = unsafe { libc::getuid() };
                    let pw_socket_path = format!("/run/user/{}/pipewire-0", uid);

                    // Collect inodes for the PipeWire socket
                    let unix_content = match std::fs::read_to_string("/proc/net/unix") {
                        Ok(c) => c,
                        Err(_) => break 'pw_detect,
                    };
                    let mut pw_inodes: HashSet<u64> = HashSet::new();
                    for line in unix_content.lines().skip(1) {
                        if line.ends_with(&pw_socket_path)
                            || line.contains(&format!("{} ", pw_socket_path))
                        {
                            // Format: Num RefCount Protocol Flags Type St Inode Path
                            let fields: Vec<&str> = line.split_whitespace().collect();
                            if fields.len() >= 7 {
                                if let Ok(inode) = fields[6].parse::<u64>() {
                                    if inode > 0 {
                                        pw_inodes.insert(inode);
                                    }
                                }
                            }
                        }
                    }
                    if pw_inodes.is_empty() {
                        break 'pw_detect;
                    }

                    // Scan /proc/*/fd for socket links matching PipeWire inodes.
                    // Only check thread-group leaders (dirs in /proc with numeric names).
                    if let Ok(proc_entries) = std::fs::read_dir("/proc") {
                        for entry in proc_entries.flatten() {
                            // Cap: audio_tgids RODATA array holds at most 8 entries.
                            if audio_tgids.len() >= 8 {
                                break;
                            }
                            let name = entry.file_name();
                            let name_str = name.to_string_lossy();
                            if !name_str.chars().all(|c| c.is_ascii_digit()) {
                                continue;
                            }
                            let pid: u32 = match name_str.parse() {
                                Ok(p) => p,
                                Err(_) => continue,
                            };
                            // Skip PIDs already detected as core audio
                            if audio_tgid_set.contains(&pid) {
                                continue;
                            }
                            let fd_dir = format!("/proc/{}/fd", pid);
                            let fd_entries = match std::fs::read_dir(&fd_dir) {
                                Ok(e) => e,
                                Err(_) => continue,
                            };
                            for fd_entry in fd_entries.flatten() {
                                if let Ok(link) = std::fs::read_link(fd_entry.path()) {
                                    let link_str = link.to_string_lossy();
                                    // Socket links look like "socket:[12345]"
                                    if let Some(inode_str) = link_str
                                        .strip_prefix("socket:[")
                                        .and_then(|s| s.strip_suffix(']'))
                                    {
                                        if let Ok(inode) = inode_str.parse::<u64>() {
                                            if pw_inodes.contains(&inode) {
                                                if audio_tgid_set.insert(pid) {
                                                    audio_tgids.push(pid);
                                                }
                                                break; // Found one match, move to next PID
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                rodata.nr_audio_tgids = audio_tgids.len() as u32;
                for (i, &tgid) in audio_tgids.iter().enumerate() {
                    rodata.audio_tgids[i] = tgid;
                }
                let client_count = audio_tgids.len() - core_count;
                if !audio_tgids.is_empty() {
                    info!(
                        "Audio stack detected: {} daemons{} (TGIDs: {:?})",
                        audio_tgids.len(),
                        if client_count > 0 {
                            format!(
                                ", {} PipeWire client{}",
                                client_count,
                                if client_count == 1 { "" } else { "s" }
                            )
                        } else {
                            String::new()
                        },
                        audio_tgids
                    );
                }
            }

            // ═══ Compositor detection ═══
            // Wayland compositors present every frame to the display.
            // Session-persistent → bake into RODATA.
            {
                const COMPOSITOR_COMMS: &[&str] = &[
                    "kwin_wayland",
                    "kwin_x11",
                    "mutter",
                    "gnome-shell",
                    "sway",
                    "Hyprland",
                    "weston",
                    "labwc",
                    "wayfire",
                    "river",
                    "gamescope",
                ];
                let mut compositor_tgids: Vec<u32> = Vec::new();
                if let Ok(entries) = std::fs::read_dir("/proc") {
                    for entry in entries.flatten() {
                        let name = entry.file_name();
                        let name_str = name.to_string_lossy();
                        if !name_str.chars().all(|c| c.is_ascii_digit()) {
                            continue;
                        }
                        let pid: u32 = match name_str.parse() {
                            Ok(p) => p,
                            Err(_) => continue,
                        };
                        if let Ok(comm) = std::fs::read_to_string(format!("/proc/{}/comm", pid)) {
                            let comm = comm.trim();
                            if COMPOSITOR_COMMS.contains(&comm) {
                                compositor_tgids.push(pid);
                                // Cap: compositor_tgids RODATA array holds 4 entries.
                                if compositor_tgids.len() >= 4 {
                                    break;
                                }
                            }
                        }
                    }
                }
                rodata.nr_compositor_tgids = compositor_tgids.len() as u32;
                for (i, &tgid) in compositor_tgids.iter().enumerate() {
                    rodata.compositor_tgids[i] = tgid;
                }
                if !compositor_tgids.is_empty() {
                    info!(
                        "Compositor detected: {} (TGIDs: {:?})",
                        compositor_tgids.len(),
                        compositor_tgids
                    );
                }
            }
        }

        // ═══ scx_ops_load! equivalent ═══
        // Set UEI dump buffer size before load (matches scx_ops_load! behavior)
        scx_utils::uei_set_size!(open_skel, cake_ops, uei);

        let mut skel = open_skel.load().context("Failed to load BPF program")?;

        // Initialize the BPF arena library.
        // Must happen after load() (BPF maps are now live) but before attach_struct_ops()
        // (scheduler not yet running, so init_task hasn't fired yet).
        // ArenaLib::setup() runs SEC("syscall") probes:
        //   1. arena_init: allocates static pages, inits task stack allocator
        //   2. arena_topology_node_init: registers topology nodes for arena traversal
        let task_ctx_size = std::mem::size_of::<bpf_intf::cake_task_ctx>();
        let arena = ArenaLib::init(skel.object_mut(), task_ctx_size, topo.nr_cpus)
            .context("Failed to create ArenaLib")?;
        arena.setup().context("Failed to initialize BPF arena")?;
        info!(
            "BPF arena initialized (task_ctx_size={}B, nr_cpus={})",
            task_ctx_size, topo.nr_cpus
        );

        // Set initial BSS values before attach (zero-init'd in BPF for BSS placement).
        // quantum_ceiling_ns: default IDLE/GAMING → 2ms. TUI updates at ~2Hz.
        if let Some(bss) = &mut skel.maps.bss_data {
            bss.quantum_ceiling_ns = 2_000_000; // AQ_BULK_CEILING_NS
        }

        Ok(Self {
            skel,
            args,
            topology: topo,
            latency_matrix,
            struct_ops: None,
        })
    }
654
655    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
656        use libbpf_rs::skel::Skel;
657
658        // ═══ scx_ops_attach! equivalent ═══
659        // Guard: prevent loading if another sched_ext scheduler is already active
660        if scx_utils::compat::is_sched_ext_enabled().unwrap_or(false) {
661            anyhow::bail!("another sched_ext scheduler is already running");
662        }
663
664        // Attach non-struct_ops BPF programs first, then struct_ops
665        self.skel
666            .attach()
667            .context("Failed to attach non-struct_ops BPF programs")?;
668        self.struct_ops = Some(
669            self.skel
670                .maps
671                .cake_ops
672                .attach_struct_ops()
673                .context("Failed to attach struct_ops BPF programs")?,
674        );
675
676        // Release builds: --verbose and --testing are unavailable (stats compiled out).
677        // Warn early so user knows these flags require a debug build.
678        #[cfg(not(debug_assertions))]
679        if self.args.verbose || self.args.testing {
680            warn!("--verbose and --testing require a debug build (telemetry is compiled out in release).");
681            warn!("Rebuild without --release: cargo build -p scx_cake");
682            self.args.verbose = false;
683            self.args.testing = false;
684        }
685
686        // Standard startup banner: follows scx_cosmos/scx_bpfland convention
687        info!(
688            "{} {} {}",
689            SCHEDULER_NAME,
690            build_id::full_version(env!("CARGO_PKG_VERSION")),
691            if self.topology.smt_enabled {
692                "SMT on"
693            } else {
694                "SMT off"
695            }
696        );
697
698        // Print command line.
699        info!(
700            "scheduler options: {}",
701            std::env::args().collect::<Vec<_>>().join(" ")
702        );
703
704        info!(
705            "{} CPUs, {} LLCs, profile: {:?}",
706            self.topology.nr_cpus,
707            self.topology
708                .llc_cpu_mask
709                .iter()
710                .filter(|&&m| m != 0)
711                .count()
712                .max(1),
713            self.args.profile
714        );
715        if self.args.testing {
716            info!("Running in benchmarking mode for 10 seconds...");
717            std::thread::sleep(std::time::Duration::from_secs(1)); // Warmup
718
719            let mut start_dispatches = 0u64;
720            for cpu in 0..self.topology.nr_cpus {
721                let stats = &self.skel.maps.bss_data.as_ref().unwrap().global_stats[cpu];
722                start_dispatches += stats.nr_new_flow_dispatches + stats.nr_old_flow_dispatches;
723            }
724
725            let start_time = std::time::Instant::now();
726            let mut elapsed = 0;
727            while elapsed < 10 && !shutdown.load(Ordering::Relaxed) {
728                std::thread::sleep(std::time::Duration::from_secs(1));
729                elapsed += 1;
730            }
731            let duration = start_time.elapsed().as_secs_f64();
732
733            let mut end_dispatches = 0u64;
734            for cpu in 0..self.topology.nr_cpus {
735                let stats = &self.skel.maps.bss_data.as_ref().unwrap().global_stats[cpu];
736                end_dispatches += stats.nr_new_flow_dispatches + stats.nr_old_flow_dispatches;
737            }
738
739            let delta = end_dispatches.saturating_sub(start_dispatches);
740            let throughput = delta as f64 / duration;
741            println!("{{\"duration_sec\": {:.2}, \"total_dispatches\": {}, \"dispatches_per_sec\": {:.2}}}",
742                     duration, delta, throughput);
743
744            shutdown.store(true, Ordering::Relaxed);
745        } else if self.args.verbose && std::io::stdout().is_terminal() {
746            // Run TUI mode
747            tui::run_tui(
748                &mut self.skel,
749                shutdown.clone(),
750                self.args.interval,
751                self.topology.clone(),
752                self.latency_matrix.clone(),
753            )?;
754        } else {
755            if self.args.verbose && !std::io::stdout().is_terminal() {
756                warn!("TUI disabled: no terminal detected (headless mode)");
757            }
758            // Simple headless mode: matches cosmos/bpfland pattern exactly.
759            // ctrlc handler sets shutdown on SIGINT/SIGTERM.
760            // 1-second sleep + UEI check = responsive shutdown.
761            while !shutdown.load(Ordering::Relaxed) {
762                std::thread::sleep(std::time::Duration::from_secs(1));
763                if scx_utils::uei_exited!(&self.skel, uei) {
764                    break;
765                }
766            }
767        }
768
769        info!("{SCHEDULER_NAME} scheduler shutting down");
770
771        // Drop struct_ops link BEFORE uei_report — this triggers the kernel to
772        // set UEI kind=SCX_EXIT_UNREG. Matches scx_bpfland/scx_cosmos/scx_lavd
773        // pattern: `let _ = self.struct_ops.take(); uei_report!(...)`
774        let _ = self.struct_ops.take();
775
776        // Standard UEI exit report — returns UserExitInfo for should_restart().
777        scx_utils::uei_report!(&self.skel, uei)
778    }
779}
780
// Teardown log for the scheduler. Fires on every drop path — including
// early-error unwinds where `run()`'s explicit struct_ops detach was never
// reached — so the unregister message always appears exactly once.
impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {SCHEDULER_NAME} scheduler");
    }
}
786
787fn main() -> Result<()> {
788    env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
789
790    let args = Args::parse();
791
792    // Handle --version before anything else (matches cosmos/bpfland)
793    if args.version {
794        println!(
795            "{} {}",
796            SCHEDULER_NAME,
797            build_id::full_version(env!("CARGO_PKG_VERSION"))
798        );
799        return Ok(());
800    }
801
802    // Set up signal handler: ctrlc handles both SIGINT and SIGTERM on Linux.
803    // This is the same pattern cosmos/bpfland use — no SigSet blocking or
804    // SignalFd complexity needed.
805    let shutdown = Arc::new(AtomicBool::new(false));
806    let shutdown_clone = shutdown.clone();
807
808    ctrlc::set_handler(move || {
809        info!("Received shutdown signal");
810        shutdown_clone.store(true, Ordering::Relaxed);
811    })?;
812
813    // Create open object for BPF - needs to outlive scheduler
814    let mut open_object = std::mem::MaybeUninit::uninit();
815
816    // Restart loop: matches cosmos/bpfland pattern.
817    // Kernel can request restart via UEI (e.g., CPU hotplug).
818    loop {
819        let mut scheduler = Scheduler::new(args.clone(), &mut open_object)?;
820        if !scheduler.run(shutdown.clone())?.should_restart() {
821            break;
822        }
823    }
824
825    Ok(())
826}