Skip to main content

scx_pandemonium/
scheduler.rs

1// PANDEMONIUM SCHEDULER
2// WRAPS THE BPF SKELETON: OPEN, CONFIGURE, LOAD, ATTACH, SHUTDOWN
3// MONITORING AND ADAPTIVE CONTROL LIVE IN adaptive.rs
4
5use std::mem::MaybeUninit;
6
7use anyhow::Result;
8use libbpf_rs::skel::{OpenSkel, SkelBuilder};
9use libbpf_rs::MapCore;
10
11use crate::bpf_skel::*;
12use crate::tuning::{OscillatorState, TuningKnobs};
13use scx_pandemonium::event::EventLog;
14
// SCX EXIT CODES (FROM KERNEL)
// uei.kind VALUE THAT exited() AND read_exit_info() TREAT AS "NOT EXITED".
const SCX_EXIT_NONE: i32 = 0;
// BIT TESTED IN uei.exit_code BY read_exit_info() TO DECIDE ON A RESTART.
// NOTE(review): THE BIT POSITION IS TAKEN ON TRUST HERE -- CONFIRM IT MATCHES
// THE EXIT CODE THE BPF SIDE / KERNEL ACTUALLY RAISES.
const SCX_ECODE_RST_MASK: u64 = 1 << 16;

// SCX DSQ FLAGS (STABLE KERNEL ABI -- sched_ext/sched.h)
// TOP TWO BITS OF A 64-BIT DSQ ID. init() COMBINES THEM INTO THE
// __SCX_DSQ_* RODATA VALUES BEFORE THE PROGRAM IS LOADED.
const SCX_DSQ_FLAG_BUILTIN: u64 = 1u64 << 63;
const SCX_DSQ_FLAG_LOCAL_ON: u64 = 1u64 << 62;
22
23// MATCHES struct pandemonium_stats IN BPF (intf.h)
// #[repr(C)] KEEPS DECLARATION ORDER SO THE RAW BYTES read_stats() GETS FROM
// stats_map CAN BE REINTERPRETED AS THIS STRUCT. 25 u64 FIELDS = 200 BYTES,
// WHICH THE COMPILE-TIME ASSERT BELOW PINS. DO NOT REORDER OR RETYPE FIELDS
// WITHOUT CHANGING THE BPF SIDE IN LOCKSTEP.
#[repr(C)]
#[derive(Default, Clone, Copy)]
pub struct PandemoniumStats {
    // UNLESS NOTED, read_stats() SUMS EACH FIELD ACROSS ALL CPUs.
    pub nr_dispatches: u64,
    pub nr_idle_hits: u64,
    pub nr_shared: u64,
    pub nr_preempt: u64,
    pub wake_lat_sum: u64,
    pub wake_lat_samples: u64,
    pub nr_keep_running: u64,
    pub nr_hard_kicks: u64,
    pub nr_soft_kicks: u64,
    pub nr_enq_wakeup: u64,
    pub nr_enq_requeue: u64,
    pub wake_lat_idle_sum: u64,
    pub wake_lat_idle_cnt: u64,
    pub wake_lat_kick_sum: u64,
    pub wake_lat_kick_cnt: u64,
    pub nr_l2_hit_batch: u64,
    pub nr_l2_miss_batch: u64,
    pub nr_l2_hit_interactive: u64,
    pub nr_l2_miss_interactive: u64,
    pub nr_l2_hit_lat_crit: u64,
    pub nr_l2_miss_lat_crit: u64,
    pub nr_reenqueue: u64,
    // NOT SUMMED: read_stats() KEEPS THE LARGEST PER-CPU VALUE.
    pub batch_sojourn_ns: u64,
    // NOT SUMMED: read_stats() KEEPS THE LARGEST PER-CPU VALUE.
    pub longrun_mode_active: u64,
    pub nr_overflow_rescue: u64,
}
53
// COMPILE-TIME ABI SAFETY: MUST MATCH STRUCT LAYOUTS IN intf.h
// A SIZE CHANGE ON THE RUST SIDE FAILS THE BUILD HERE INSTEAD OF CORRUPTING
// MAP READS/WRITES AT RUNTIME. 200 = 25 u64 FIELDS OF PandemoniumStats.
const _: () = assert!(std::mem::size_of::<PandemoniumStats>() == 200);
const _: () = assert!(std::mem::size_of::<TuningKnobs>() == 88);

// MAX_AFFINITY_CANDIDATES IS DEFINED IN intf.h. THE RUST MIRROR IN
// bpf_intf.rs MUST KEEP THE SAME VALUE; IF THE TWO SIDES DRIFT, THE
// BPF MAP STRIDE AND THE RUST WRITER STRIDE DISAGREE AND THE TABLE
// IS SILENTLY MIS-POPULATED.
// NOTE(review): THIS ASSERT ONLY RELATES TWO RUST CONSTANTS FROM bpf_intf.rs
// (MAX_AFFINITY_CANDIDATES AND MAX_CPUS >> 3). WHETHER bpf_intf.rs IS
// GENERATED FROM intf.h IS NOT VISIBLE FROM THIS FILE -- CONFIRM.
const _: () = assert!(crate::bpf_intf::MAX_AFFINITY_CANDIDATES == crate::bpf_intf::MAX_CPUS >> 3);
63
64// TuningKnobs LIVES IN tuning.rs (ZERO BPF DEPENDENCIES, TESTABLE OFFLINE)
65
// BPFFS PATH WHERE init() PINS tuning_knobs_map AND Drop UNPINS IT AGAIN.
const KNOBS_PIN: &str = "/sys/fs/bpf/pandemonium/tuning_knobs";
67
// OWNER OF THE LOADED BPF SKELETON. BUILT BY Scheduler::init(); THE Drop IMPL
// UNPINS THE MAPS THAT init() PINNED.
pub struct Scheduler<'a> {
    // LOADED SKELETON; ALL MAP READS AND WRITES GO THROUGH skel.maps.
    skel: MainSkel<'a>,
    // LINK RETURNED BY attach_struct_ops() IN init(). NEVER READ IN THIS FILE;
    // PRESUMABLY HELD SO THE STRUCT_OPS STAYS ATTACHED WHILE THE SCHEDULER
    // LIVES -- CONFIRM AGAINST libbpf-rs Link DROP SEMANTICS.
    _link: libbpf_rs::Link,
    // EVENT LOG, CREATED EMPTY BY init() VIA EventLog::new().
    pub log: EventLog,
}
73
74impl<'a> Scheduler<'a> {
75    pub fn init(
76        open_object: &'a mut MaybeUninit<libbpf_rs::OpenObject>,
77        nr_cpus_override: Option<u64>,
78    ) -> Result<Self> {
79        // OPEN
80        let builder = MainSkelBuilder::default();
81        let mut open_skel = builder.open(open_object)?;
82
83        // INJECT VERSION SUFFIX INTO OPS NAME FOR scx_loader GUI
84        {
85            let ops = open_skel.struct_ops.pandemonium_ops_mut();
86            let name_field = &mut ops.name;
87            let version_suffix = scx_utils::build_id::ops_version_suffix(env!("CARGO_PKG_VERSION"));
88            let bytes = version_suffix.as_bytes();
89            let mut i = 0;
90            let mut bytes_idx = 0;
91            let mut found_null = false;
92            while i < name_field.len() - 1 {
93                found_null |= name_field[i] == 0;
94                if !found_null {
95                    i += 1;
96                    continue;
97                }
98                if bytes_idx < bytes.len() {
99                    name_field[i] = bytes[bytes_idx] as i8;
100                    bytes_idx += 1;
101                } else {
102                    break;
103                }
104                i += 1;
105            }
106            name_field[i] = 0;
107        }
108
109        // CONFIGURE RODATA (BEFORE LOAD)
110        let rodata = open_skel.maps.rodata_data.as_mut().unwrap();
111
112        let possible = libbpf_rs::num_possible_cpus()? as u64;
113        rodata.nr_cpu_ids = nr_cpus_override.unwrap_or(possible);
114
115        // POPULATE SCX ENUM VALUES
116        rodata.__SCX_DSQ_FLAG_BUILTIN = SCX_DSQ_FLAG_BUILTIN;
117        rodata.__SCX_DSQ_FLAG_LOCAL_ON = SCX_DSQ_FLAG_LOCAL_ON;
118        rodata.__SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN;
119        rodata.__SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1;
120        rodata.__SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON;
121        rodata.__SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON | 1;
122        rodata.__SCX_DSQ_LOCAL_CPU_MASK = 0xFFFFFFFF;
123
124        // POPULATE SCX_KICK_* ENUM VALUES
125        rodata.__SCX_KICK_IDLE = 1;
126        rodata.__SCX_KICK_PREEMPT = 2;
127        rodata.__SCX_KICK_WAIT = 4;
128
129        // LOAD (VALIDATES BPF WITH KERNEL)
130        let mut skel = open_skel.load()?;
131
132        // ATTACH STRUCT_OPS
133        let link = skel.maps.pandemonium_ops.attach_struct_ops()?;
134
135        // PIN MAPS FOR USERSPACE ACCESS (NON-FATAL: bpffs MAY NOT BE MOUNTED)
136        let pin_dir = "/sys/fs/bpf/pandemonium";
137        let bpffs_ok = std::fs::create_dir_all(pin_dir).is_ok();
138        if bpffs_ok {
139            std::fs::remove_file(KNOBS_PIN).ok();
140            skel.maps.tuning_knobs_map.pin(KNOBS_PIN).ok();
141
142            let cache_pin = "/sys/fs/bpf/pandemonium/cache_domain";
143            std::fs::remove_file(cache_pin).ok();
144            skel.maps.cache_domain.pin(cache_pin).ok();
145
146            let observe_pin = "/sys/fs/bpf/pandemonium/task_class_observe";
147            std::fs::remove_file(observe_pin).ok();
148            skel.maps.task_class_observe.pin(observe_pin).ok();
149
150            let init_pin = "/sys/fs/bpf/pandemonium/task_class_init";
151            std::fs::remove_file(init_pin).ok();
152            skel.maps.task_class_init.pin(init_pin).ok();
153
154            let compositor_pin = "/sys/fs/bpf/pandemonium/compositor_map";
155            std::fs::remove_file(compositor_pin).ok();
156            skel.maps.compositor_map.pin(compositor_pin).ok();
157        } else {
158            log_warn!("BPFFS NOT AVAILABLE: map pinning skipped (scheduler still functional)");
159        }
160
161        Ok(Self {
162            skel,
163            _link: link,
164            log: EventLog::new(),
165        })
166    }
167
168    // SUM PER-CPU STATS INTO A SINGLE TOTAL
169    pub fn read_stats(&self) -> PandemoniumStats {
170        let key = 0u32.to_ne_bytes();
171        let mut total = PandemoniumStats::default();
172
173        let percpu_vals = match self
174            .skel
175            .maps
176            .stats_map
177            .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
178        {
179            Ok(Some(v)) => v,
180            _ => return total,
181        };
182
183        for cpu_val in &percpu_vals {
184            if cpu_val.len() >= std::mem::size_of::<PandemoniumStats>() {
185                let stats: PandemoniumStats = unsafe {
186                    std::ptr::read_unaligned(cpu_val.as_ptr() as *const PandemoniumStats)
187                };
188                total.nr_dispatches += stats.nr_dispatches;
189                total.nr_idle_hits += stats.nr_idle_hits;
190                total.nr_shared += stats.nr_shared;
191                total.nr_preempt += stats.nr_preempt;
192                total.wake_lat_sum += stats.wake_lat_sum;
193                total.wake_lat_samples += stats.wake_lat_samples;
194                total.nr_keep_running += stats.nr_keep_running;
195                total.nr_hard_kicks += stats.nr_hard_kicks;
196                total.nr_soft_kicks += stats.nr_soft_kicks;
197                total.nr_enq_wakeup += stats.nr_enq_wakeup;
198                total.nr_enq_requeue += stats.nr_enq_requeue;
199                total.wake_lat_idle_sum += stats.wake_lat_idle_sum;
200                total.wake_lat_idle_cnt += stats.wake_lat_idle_cnt;
201                total.wake_lat_kick_sum += stats.wake_lat_kick_sum;
202                total.wake_lat_kick_cnt += stats.wake_lat_kick_cnt;
203                total.nr_l2_hit_batch += stats.nr_l2_hit_batch;
204                total.nr_l2_miss_batch += stats.nr_l2_miss_batch;
205                total.nr_l2_hit_interactive += stats.nr_l2_hit_interactive;
206                total.nr_l2_miss_interactive += stats.nr_l2_miss_interactive;
207                total.nr_l2_hit_lat_crit += stats.nr_l2_hit_lat_crit;
208                total.nr_l2_miss_lat_crit += stats.nr_l2_miss_lat_crit;
209                total.nr_reenqueue += stats.nr_reenqueue;
210                if stats.batch_sojourn_ns > total.batch_sojourn_ns {
211                    total.batch_sojourn_ns = stats.batch_sojourn_ns;
212                }
213                if stats.longrun_mode_active > total.longrun_mode_active {
214                    total.longrun_mode_active = stats.longrun_mode_active;
215                }
216                total.nr_overflow_rescue += stats.nr_overflow_rescue;
217            }
218        }
219
220        total
221    }
222
223    // WRITE TUNING KNOBS TO BPF MAP -- CALLED BY MONITOR THREAD
224    pub fn write_tuning_knobs(&self, knobs: &TuningKnobs) -> Result<()> {
225        let key = 0u32.to_ne_bytes();
226        let value = unsafe {
227            std::slice::from_raw_parts(
228                knobs as *const TuningKnobs as *const u8,
229                std::mem::size_of::<TuningKnobs>(),
230            )
231        };
232        self.skel
233            .maps
234            .tuning_knobs_map
235            .update(&key, value, libbpf_rs::MapFlags::ANY)?;
236        Ok(())
237    }
238
239    // WRITE TOPOLOGY-OWNED FIELDS (tau_ns + codel_eq_ns), PRESERVING OTHERS.
240    // CALLED AT TOPOLOGY DETECT AND ON HOTPLUG. READ-MODIFY-WRITE BECAUSE THE
241    // tuning_knobs_map IS A SINGLE-ENTRY STRUCT AND PARTIAL UPDATES AREN'T A
242    // libbpf CONCEPT -- BUT WE NEED A NARROW SETTER SO TOPOLOGY CHANGES DON'T
243    // STOMP ON WHATEVER THE ADAPTIVE LOOP'S LATEST KNOB VALUES ARE.
244    pub fn write_topology_fields(&self, tau_ns: u64, codel_eq_ns: u64) -> Result<()> {
245        let mut knobs = self.read_tuning_knobs();
246        knobs.topology_tau_ns = tau_ns;
247        knobs.codel_eq_ns = codel_eq_ns;
248        self.write_tuning_knobs(&knobs)
249    }
250
251    // READ BPF OSCILLATOR STATE FROM BSS/DATA SECTIONS.
252    // MWU GATES ITS RESCUE-DRIVEN PATHWAYS ON THIS SO IT DOESN'T
253    // DOUBLE-CORRECT WHEN THE BPF DAMPED OSCILLATOR HAS ALREADY MOVED.
254    pub fn read_oscillator_state(&self) -> OscillatorState {
255        let bss = match self.skel.maps.bss_data.as_ref() {
256            Some(b) => b,
257            None => return OscillatorState::default(),
258        };
259        let data = match self.skel.maps.data_data.as_ref() {
260            Some(d) => d,
261            None => return OscillatorState::default(),
262        };
263        OscillatorState {
264            codel_target_ns: bss.codel_target_ns,
265            codel_target_floor_ns: bss.codel_target_floor_ns,
266            codel_target_max_ns: data.codel_target_max_ns,
267        }
268    }
269
270    // READ CURRENT TUNING KNOBS FROM BPF MAP
271    pub fn read_tuning_knobs(&self) -> TuningKnobs {
272        let key = 0u32.to_ne_bytes();
273        match self
274            .skel
275            .maps
276            .tuning_knobs_map
277            .lookup(&key, libbpf_rs::MapFlags::ANY)
278        {
279            Ok(Some(v)) if v.len() >= std::mem::size_of::<TuningKnobs>() => unsafe {
280                std::ptr::read_unaligned(v.as_ptr() as *const TuningKnobs)
281            },
282            _ => TuningKnobs::default(),
283        }
284    }
285
286    // READ WAKEUP LATENCY HISTOGRAM: 3 TIERS x 12 BUCKETS
287    // SUMS ACROSS ALL CPUs (PERCPU_ARRAY). RETURNS CUMULATIVE COUNTS.
288    pub fn read_wake_lat_hist(&self) -> [[u64; 12]; 3] {
289        let mut result = [[0u64; 12]; 3];
290        for key_idx in 0u32..36 {
291            let key = key_idx.to_ne_bytes();
292            if let Ok(Some(percpu_vals)) = self
293                .skel
294                .maps
295                .wake_lat_hist
296                .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
297            {
298                let tier = (key_idx / 12) as usize;
299                let bucket = (key_idx % 12) as usize;
300                for cpu_val in &percpu_vals {
301                    if cpu_val.len() >= std::mem::size_of::<u64>() {
302                        let val: u64 =
303                            unsafe { std::ptr::read_unaligned(cpu_val.as_ptr() as *const u64) };
304                        result[tier][bucket] += val;
305                    }
306                }
307            }
308        }
309        result
310    }
311
312    // READ SLEEP DURATION HISTOGRAM: 4 BUCKETS
313    // SUMS ACROSS ALL CPUs (PERCPU_ARRAY). RETURNS CUMULATIVE COUNTS.
314    pub fn read_sleep_hist(&self) -> [u64; 4] {
315        let mut result = [0u64; 4];
316        for key_idx in 0u32..4 {
317            let key = key_idx.to_ne_bytes();
318            if let Ok(Some(percpu_vals)) = self
319                .skel
320                .maps
321                .sleep_hist
322                .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
323            {
324                for cpu_val in &percpu_vals {
325                    if cpu_val.len() >= std::mem::size_of::<u64>() {
326                        let val: u64 =
327                            unsafe { std::ptr::read_unaligned(cpu_val.as_ptr() as *const u64) };
328                        result[key_idx as usize] += val;
329                    }
330                }
331            }
332        }
333        result
334    }
335
336    // POPULATE CACHE DOMAIN MAP FROM TOPOLOGY DATA AT STARTUP
337    pub fn write_cache_domain(&self, cpu: u32, l2_group: u32) -> Result<()> {
338        let key = cpu.to_ne_bytes();
339        let val = l2_group.to_ne_bytes();
340        self.skel
341            .maps
342            .cache_domain
343            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
344        Ok(())
345    }
346
347    // POPULATE L2 SIBLINGS MAP ENTRY
348    pub fn write_l2_sibling(&self, group_id: u32, slot: u32, cpu: u32) -> Result<()> {
349        let key = (group_id * 8 + slot).to_ne_bytes();
350        let val = cpu.to_ne_bytes();
351        self.skel
352            .maps
353            .l2_siblings
354            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
355        Ok(())
356    }
357
358    // POPULATE RESISTANCE AFFINITY RANK MAP
359    // affinity_rank[cpu * MAX_AFFINITY_CANDIDATES + slot] = target_cpu
360    // SORTED BY ASCENDING R_EFF FROM LAPLACIAN PSEUDOINVERSE
361    pub fn write_affinity_rank(&self, cpu: u32, slot: u32, target_cpu: u32) -> Result<()> {
362        // Stride = MAX_AFFINITY_CANDIDATES. Single source of truth is the
363        // C macro in src/bpf/intf.h, mirrored in bpf_intf.rs. The
364        // static_assert above catches drift at compile time.
365        let stride = crate::bpf_intf::MAX_AFFINITY_CANDIDATES;
366        let key = (cpu * stride + slot).to_ne_bytes();
367        let val = target_cpu.to_ne_bytes();
368        self.skel
369            .maps
370            .affinity_rank
371            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
372        Ok(())
373    }
374
375    // POPULATE COMPOSITOR MAP ENTRY
376    pub fn write_compositor(&self, name: &str) -> Result<()> {
377        let mut key = [0u8; 16];
378        let bytes = name.as_bytes();
379        let len = bytes.len().min(15);
380        key[..len].copy_from_slice(&bytes[..len]);
381        let val = [1u8];
382        self.skel
383            .maps
384            .compositor_map
385            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
386        Ok(())
387    }
388
389    // READ UEI EXIT INFO. RETURNS (should_restart).
390    pub fn read_exit_info(&self) -> bool {
391        let data = self.skel.maps.data_data.as_ref().unwrap();
392        let kind = data.uei.kind;
393        let exit_code = data.uei.exit_code;
394
395        if kind != SCX_EXIT_NONE {
396            let reason_bytes: &[u8] =
397                unsafe { std::slice::from_raw_parts(data.uei.reason.as_ptr() as *const u8, 128) };
398            let msg_bytes: &[u8] =
399                unsafe { std::slice::from_raw_parts(data.uei.msg.as_ptr() as *const u8, 1024) };
400
401            let reason = std::str::from_utf8(reason_bytes)
402                .unwrap_or("unknown")
403                .trim_end_matches('\0');
404            let msg = std::str::from_utf8(msg_bytes)
405                .unwrap_or("")
406                .trim_end_matches('\0');
407
408            log_warn!("BPF exit: kind={} code={}", kind, exit_code);
409            if !reason.is_empty() {
410                log_warn!("BPF exit reason: {}", reason);
411            }
412            if !msg.is_empty() {
413                log_warn!("BPF exit msg: {}", msg);
414            }
415        }
416
417        (exit_code as u64 & SCX_ECODE_RST_MASK) != 0
418    }
419
420    pub fn exited(&self) -> bool {
421        self.skel.maps.data_data.as_ref().unwrap().uei.kind != SCX_EXIT_NONE
422    }
423}
424
425impl Drop for Scheduler<'_> {
426    fn drop(&mut self) {
427        let _ = self.skel.maps.tuning_knobs_map.unpin(KNOBS_PIN);
428        let _ = self
429            .skel
430            .maps
431            .cache_domain
432            .unpin("/sys/fs/bpf/pandemonium/cache_domain");
433        let _ = self
434            .skel
435            .maps
436            .task_class_observe
437            .unpin("/sys/fs/bpf/pandemonium/task_class_observe");
438        let _ = self
439            .skel
440            .maps
441            .task_class_init
442            .unpin("/sys/fs/bpf/pandemonium/task_class_init");
443        let _ = self
444            .skel
445            .maps
446            .compositor_map
447            .unpin("/sys/fs/bpf/pandemonium/compositor_map");
448        let _ = std::fs::remove_dir("/sys/fs/bpf/pandemonium");
449    }
450}