// scx_pandemonium/scheduler.rs
1// PANDEMONIUM SCHEDULER
2// WRAPS THE BPF SKELETON: OPEN, CONFIGURE, LOAD, ATTACH, SHUTDOWN
3// MONITORING AND ADAPTIVE CONTROL LIVE IN adaptive.rs
4
5use std::mem::MaybeUninit;
6
7use anyhow::Result;
8use libbpf_rs::skel::{OpenSkel, SkelBuilder};
9use libbpf_rs::MapCore;
10
11use crate::bpf_skel::*;
12use crate::tuning::TuningKnobs;
13use scx_pandemonium::event::EventLog;
14
// SCX EXIT CODES (FROM KERNEL)

/// VALUE OF `uei.kind` WHILE THE BPF SIDE HAS RECORDED NO EXIT.
const SCX_EXIT_NONE: i32 = 0;

/// RESTART-REQUEST BIT INSIDE THE SCX EXIT CODE; WHEN SET, USERSPACE SHOULD
/// RE-INITIALIZE THE SCHEDULER RATHER THAN SHUT DOWN (SEE read_exit_info).
const SCX_ECODE_RST_MASK: u64 = 0x1_0000; // == 1 << 16

// SCX DSQ FLAGS (STABLE KERNEL ABI -- sched_ext/sched.h)

/// TOP BIT: MARKS A DSQ ID AS ONE OF THE KERNEL'S BUILT-IN QUEUES.
const SCX_DSQ_FLAG_BUILTIN: u64 = 0x8000_0000_0000_0000; // == 1 << 63

/// BIT 62: SELECTS THE "LOCAL DSQ ON A SPECIFIC CPU" ADDRESSING FORM.
const SCX_DSQ_FLAG_LOCAL_ON: u64 = 0x4000_0000_0000_0000; // == 1 << 62
// MATCHES struct pandemonium_stats IN BPF (intf.h).
// ONE INSTANCE PER CPU IN A PERCPU MAP; Scheduler::read_stats() AGGREGATES:
// EVERY FIELD IS SUMMED EXCEPT wake_lat_max, batch_sojourn_ns AND
// longrun_mode_active, WHICH TAKE THE PER-CPU MAXIMUM.
// #[repr(C)] + ALL-u64 FIELDS: LAYOUT MUST STAY IN LOCKSTEP WITH intf.h
// (TOTAL SIZE IS ASSERTED AT COMPILE TIME BELOW).
#[repr(C)]
#[derive(Default, Clone, Copy)]
pub struct PandemoniumStats {
    // DISPATCH-PATH COUNTERS
    pub nr_dispatches: u64,
    pub nr_idle_hits: u64,
    pub nr_shared: u64,
    pub nr_preempt: u64,
    // OVERALL WAKEUP-LATENCY AGGREGATE (SUM / PEAK / SAMPLE COUNT)
    pub wake_lat_sum: u64,
    pub wake_lat_max: u64,
    pub wake_lat_samples: u64,
    pub nr_keep_running: u64,
    // CPU-KICK COUNTERS
    pub nr_hard_kicks: u64,
    pub nr_soft_kicks: u64,
    // ENQUEUE-PATH COUNTERS (WAKEUP vs REQUEUE)
    pub nr_enq_wakeup: u64,
    pub nr_enq_requeue: u64,
    // WAKEUP LATENCY SPLIT BY WAKE PATH (idle vs kick)
    pub wake_lat_idle_sum: u64,
    pub wake_lat_idle_cnt: u64,
    pub wake_lat_kick_sum: u64,
    pub wake_lat_kick_cnt: u64,
    pub nr_procdb_hits: u64,
    // L2 CACHE-AFFINITY HIT/MISS COUNTERS PER TASK CLASS
    pub nr_l2_hit_batch: u64,
    pub nr_l2_miss_batch: u64,
    pub nr_l2_hit_interactive: u64,
    pub nr_l2_miss_interactive: u64,
    pub nr_l2_hit_lat_crit: u64,
    pub nr_l2_miss_lat_crit: u64,
    pub nr_reenqueue: u64,
    // PEAK BATCH-QUEUE SOJOURN TIME (ns, AGGREGATED BY MAX)
    pub batch_sojourn_ns: u64,
    // MODE INDICATORS -- NOTE(review): exact semantics (flag vs counter) live
    // on the BPF side; burst is summed, longrun is max'd in read_stats
    pub burst_mode_active: u64,
    pub longrun_mode_active: u64,
    pub nr_overflow_rescue: u64,
}
56
// COMPILE-TIME ABI SAFETY: MUST MATCH STRUCT LAYOUTS IN intf.h.
// THE RAW-BYTE MAP READS/WRITES BELOW (read_stats / write_tuning_knobs /
// read_tuning_knobs) DEPEND ON THESE EXACT SIZES BEING CORRECT.
const _: () = assert!(std::mem::size_of::<PandemoniumStats>() == 224);
const _: () = assert!(std::mem::size_of::<TuningKnobs>() == 80);

// TuningKnobs lives in tuning.rs (zero BPF dependencies, testable offline)

// bpffs PIN PATH FOR THE TUNING KNOBS MAP; PINNED IN init(), UNPINNED ON Drop
const KNOBS_PIN: &str = "/sys/fs/bpf/pandemonium/tuning_knobs";
64
/// USERSPACE HANDLE FOR THE LOADED-AND-ATTACHED PANDEMONIUM BPF SCHEDULER.
pub struct Scheduler<'a> {
    // GENERATED SKELETON: OWNS THE BPF OBJECT, ITS MAPS AND DATA SECTIONS
    skel: MainSkel<'a>,
    // struct_ops ATTACHMENT LINK; HELD (NEVER READ) SO THE SCHEDULER STAYS
    // ATTACHED FOR THE LIFETIME OF THIS STRUCT
    _link: libbpf_rs::Link,
    // USERSPACE EVENT LOG (SEE event.rs)
    pub log: EventLog,
}
70
71impl<'a> Scheduler<'a> {
72    pub fn init(
73        open_object: &'a mut MaybeUninit<libbpf_rs::OpenObject>,
74        nr_cpus_override: Option<u64>,
75    ) -> Result<Self> {
76        // OPEN
77        let builder = MainSkelBuilder::default();
78        let mut open_skel = builder.open(open_object)?;
79
80        // INJECT VERSION SUFFIX INTO OPS NAME FOR scx_loader GUI
81        {
82            let ops = open_skel.struct_ops.pandemonium_ops_mut();
83            let name_field = &mut ops.name;
84            let version_suffix = scx_utils::build_id::ops_version_suffix(env!("CARGO_PKG_VERSION"));
85            let bytes = version_suffix.as_bytes();
86            let mut i = 0;
87            let mut bytes_idx = 0;
88            let mut found_null = false;
89            while i < name_field.len() - 1 {
90                found_null |= name_field[i] == 0;
91                if !found_null {
92                    i += 1;
93                    continue;
94                }
95                if bytes_idx < bytes.len() {
96                    name_field[i] = bytes[bytes_idx] as i8;
97                    bytes_idx += 1;
98                } else {
99                    break;
100                }
101                i += 1;
102            }
103            name_field[i] = 0;
104        }
105
106        // CONFIGURE RODATA (BEFORE LOAD)
107        let rodata = open_skel.maps.rodata_data.as_mut().unwrap();
108
109        let possible = libbpf_rs::num_possible_cpus()? as u64;
110        rodata.nr_cpu_ids = nr_cpus_override.unwrap_or(possible);
111
112        // POPULATE SCX ENUM VALUES
113        rodata.__SCX_DSQ_FLAG_BUILTIN = SCX_DSQ_FLAG_BUILTIN;
114        rodata.__SCX_DSQ_FLAG_LOCAL_ON = SCX_DSQ_FLAG_LOCAL_ON;
115        rodata.__SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN;
116        rodata.__SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1;
117        rodata.__SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON;
118        rodata.__SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON | 1;
119        rodata.__SCX_DSQ_LOCAL_CPU_MASK = 0xFFFFFFFF;
120
121        // POPULATE SCX_KICK_* ENUM VALUES
122        rodata.__SCX_KICK_IDLE = 1;
123        rodata.__SCX_KICK_PREEMPT = 2;
124        rodata.__SCX_KICK_WAIT = 4;
125
126        // LOAD (VALIDATES BPF WITH KERNEL)
127        let mut skel = open_skel.load()?;
128
129        // ATTACH STRUCT_OPS
130        let link = skel.maps.pandemonium_ops.attach_struct_ops()?;
131
132        // PIN MAPS FOR USERSPACE ACCESS (NON-FATAL: bpffs may not be mounted)
133        let pin_dir = "/sys/fs/bpf/pandemonium";
134        let bpffs_ok = std::fs::create_dir_all(pin_dir).is_ok();
135        if bpffs_ok {
136            std::fs::remove_file(KNOBS_PIN).ok();
137            skel.maps.tuning_knobs_map.pin(KNOBS_PIN).ok();
138
139            let cache_pin = "/sys/fs/bpf/pandemonium/cache_domain";
140            std::fs::remove_file(cache_pin).ok();
141            skel.maps.cache_domain.pin(cache_pin).ok();
142
143            let observe_pin = "/sys/fs/bpf/pandemonium/task_class_observe";
144            std::fs::remove_file(observe_pin).ok();
145            skel.maps.task_class_observe.pin(observe_pin).ok();
146
147            let init_pin = "/sys/fs/bpf/pandemonium/task_class_init";
148            std::fs::remove_file(init_pin).ok();
149            skel.maps.task_class_init.pin(init_pin).ok();
150
151            let compositor_pin = "/sys/fs/bpf/pandemonium/compositor_map";
152            std::fs::remove_file(compositor_pin).ok();
153            skel.maps.compositor_map.pin(compositor_pin).ok();
154        } else {
155            log_warn!("BPFFS NOT AVAILABLE: map pinning skipped (scheduler still functional)");
156        }
157
158        Ok(Self {
159            skel,
160            _link: link,
161            log: EventLog::new(),
162        })
163    }
164
165    // SUM PER-CPU STATS INTO A SINGLE TOTAL
166    pub fn read_stats(&self) -> PandemoniumStats {
167        let key = 0u32.to_ne_bytes();
168        let mut total = PandemoniumStats::default();
169
170        let percpu_vals = match self
171            .skel
172            .maps
173            .stats_map
174            .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
175        {
176            Ok(Some(v)) => v,
177            _ => return total,
178        };
179
180        for cpu_val in &percpu_vals {
181            if cpu_val.len() >= std::mem::size_of::<PandemoniumStats>() {
182                let stats: PandemoniumStats = unsafe {
183                    std::ptr::read_unaligned(cpu_val.as_ptr() as *const PandemoniumStats)
184                };
185                total.nr_dispatches += stats.nr_dispatches;
186                total.nr_idle_hits += stats.nr_idle_hits;
187                total.nr_shared += stats.nr_shared;
188                total.nr_preempt += stats.nr_preempt;
189                total.wake_lat_sum += stats.wake_lat_sum;
190                if stats.wake_lat_max > total.wake_lat_max {
191                    total.wake_lat_max = stats.wake_lat_max;
192                }
193                total.wake_lat_samples += stats.wake_lat_samples;
194                total.nr_keep_running += stats.nr_keep_running;
195                total.nr_hard_kicks += stats.nr_hard_kicks;
196                total.nr_soft_kicks += stats.nr_soft_kicks;
197                total.nr_enq_wakeup += stats.nr_enq_wakeup;
198                total.nr_enq_requeue += stats.nr_enq_requeue;
199                total.wake_lat_idle_sum += stats.wake_lat_idle_sum;
200                total.wake_lat_idle_cnt += stats.wake_lat_idle_cnt;
201                total.wake_lat_kick_sum += stats.wake_lat_kick_sum;
202                total.wake_lat_kick_cnt += stats.wake_lat_kick_cnt;
203                total.nr_procdb_hits += stats.nr_procdb_hits;
204                total.nr_l2_hit_batch += stats.nr_l2_hit_batch;
205                total.nr_l2_miss_batch += stats.nr_l2_miss_batch;
206                total.nr_l2_hit_interactive += stats.nr_l2_hit_interactive;
207                total.nr_l2_miss_interactive += stats.nr_l2_miss_interactive;
208                total.nr_l2_hit_lat_crit += stats.nr_l2_hit_lat_crit;
209                total.nr_l2_miss_lat_crit += stats.nr_l2_miss_lat_crit;
210                total.nr_reenqueue += stats.nr_reenqueue;
211                if stats.batch_sojourn_ns > total.batch_sojourn_ns {
212                    total.batch_sojourn_ns = stats.batch_sojourn_ns;
213                }
214                total.burst_mode_active += stats.burst_mode_active;
215                if stats.longrun_mode_active > total.longrun_mode_active {
216                    total.longrun_mode_active = stats.longrun_mode_active;
217                }
218                total.nr_overflow_rescue += stats.nr_overflow_rescue;
219            }
220        }
221
222        total
223    }
224
225    // WRITE TUNING KNOBS TO BPF MAP -- CALLED BY MONITOR THREAD
226    pub fn write_tuning_knobs(&self, knobs: &TuningKnobs) -> Result<()> {
227        let key = 0u32.to_ne_bytes();
228        let value = unsafe {
229            std::slice::from_raw_parts(
230                knobs as *const TuningKnobs as *const u8,
231                std::mem::size_of::<TuningKnobs>(),
232            )
233        };
234        self.skel
235            .maps
236            .tuning_knobs_map
237            .update(&key, value, libbpf_rs::MapFlags::ANY)?;
238        Ok(())
239    }
240
241    // READ CURRENT TUNING KNOBS FROM BPF MAP
242    pub fn read_tuning_knobs(&self) -> TuningKnobs {
243        let key = 0u32.to_ne_bytes();
244        match self
245            .skel
246            .maps
247            .tuning_knobs_map
248            .lookup(&key, libbpf_rs::MapFlags::ANY)
249        {
250            Ok(Some(v)) if v.len() >= std::mem::size_of::<TuningKnobs>() => unsafe {
251                std::ptr::read_unaligned(v.as_ptr() as *const TuningKnobs)
252            },
253            _ => TuningKnobs::default(),
254        }
255    }
256
257    // READ WAKEUP LATENCY HISTOGRAM: 3 TIERS x 12 BUCKETS
258    // SUMS ACROSS ALL CPUs (PERCPU_ARRAY). RETURNS CUMULATIVE COUNTS.
259    pub fn read_wake_lat_hist(&self) -> [[u64; 12]; 3] {
260        let mut result = [[0u64; 12]; 3];
261        for key_idx in 0u32..36 {
262            let key = key_idx.to_ne_bytes();
263            if let Ok(Some(percpu_vals)) = self
264                .skel
265                .maps
266                .wake_lat_hist
267                .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
268            {
269                let tier = (key_idx / 12) as usize;
270                let bucket = (key_idx % 12) as usize;
271                for cpu_val in &percpu_vals {
272                    if cpu_val.len() >= std::mem::size_of::<u64>() {
273                        let val: u64 =
274                            unsafe { std::ptr::read_unaligned(cpu_val.as_ptr() as *const u64) };
275                        result[tier][bucket] += val;
276                    }
277                }
278            }
279        }
280        result
281    }
282
283    // READ SLEEP DURATION HISTOGRAM: 4 BUCKETS
284    // SUMS ACROSS ALL CPUs (PERCPU_ARRAY). RETURNS CUMULATIVE COUNTS.
285    pub fn read_sleep_hist(&self) -> [u64; 4] {
286        let mut result = [0u64; 4];
287        for key_idx in 0u32..4 {
288            let key = key_idx.to_ne_bytes();
289            if let Ok(Some(percpu_vals)) = self
290                .skel
291                .maps
292                .sleep_hist
293                .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
294            {
295                for cpu_val in &percpu_vals {
296                    if cpu_val.len() >= std::mem::size_of::<u64>() {
297                        let val: u64 =
298                            unsafe { std::ptr::read_unaligned(cpu_val.as_ptr() as *const u64) };
299                        result[key_idx as usize] += val;
300                    }
301                }
302            }
303        }
304        result
305    }
306
307    // POPULATE CACHE DOMAIN MAP FROM TOPOLOGY DATA AT STARTUP
308    pub fn write_cache_domain(&self, cpu: u32, l2_group: u32) -> Result<()> {
309        let key = cpu.to_ne_bytes();
310        let val = l2_group.to_ne_bytes();
311        self.skel
312            .maps
313            .cache_domain
314            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
315        Ok(())
316    }
317
318    // POPULATE L2 SIBLINGS MAP ENTRY
319    pub fn write_l2_sibling(&self, group_id: u32, slot: u32, cpu: u32) -> Result<()> {
320        let key = (group_id * 8 + slot).to_ne_bytes();
321        let val = cpu.to_ne_bytes();
322        self.skel
323            .maps
324            .l2_siblings
325            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
326        Ok(())
327    }
328
329    // POPULATE RESISTANCE AFFINITY RANK MAP
330    // affinity_rank[cpu * MAX_AFFINITY_CANDIDATES + slot] = target_cpu
331    // SORTED BY ASCENDING R_EFF FROM LAPLACIAN PSEUDOINVERSE
332    pub fn write_affinity_rank(&self, cpu: u32, slot: u32, target_cpu: u32) -> Result<()> {
333        let key = (cpu * 16 + slot).to_ne_bytes(); // MAX_AFFINITY_CANDIDATES = 16
334        let val = target_cpu.to_ne_bytes();
335        self.skel
336            .maps
337            .affinity_rank
338            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
339        Ok(())
340    }
341
342    // POPULATE COMPOSITOR MAP ENTRY
343    pub fn write_compositor(&self, name: &str) -> Result<()> {
344        let mut key = [0u8; 16];
345        let bytes = name.as_bytes();
346        let len = bytes.len().min(15);
347        key[..len].copy_from_slice(&bytes[..len]);
348        let val = [1u8];
349        self.skel
350            .maps
351            .compositor_map
352            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
353        Ok(())
354    }
355
356    // READ UEI EXIT INFO. RETURNS (should_restart).
357    pub fn read_exit_info(&self) -> bool {
358        let data = self.skel.maps.data_data.as_ref().unwrap();
359        let kind = data.uei.kind;
360        let exit_code = data.uei.exit_code;
361
362        if kind != SCX_EXIT_NONE {
363            let reason_bytes: &[u8] =
364                unsafe { std::slice::from_raw_parts(data.uei.reason.as_ptr() as *const u8, 128) };
365            let msg_bytes: &[u8] =
366                unsafe { std::slice::from_raw_parts(data.uei.msg.as_ptr() as *const u8, 1024) };
367
368            let reason = std::str::from_utf8(reason_bytes)
369                .unwrap_or("unknown")
370                .trim_end_matches('\0');
371            let msg = std::str::from_utf8(msg_bytes)
372                .unwrap_or("")
373                .trim_end_matches('\0');
374
375            log_warn!("BPF exit: kind={} code={}", kind, exit_code);
376            if !reason.is_empty() {
377                log_warn!("BPF exit reason: {}", reason);
378            }
379            if !msg.is_empty() {
380                log_warn!("BPF exit msg: {}", msg);
381            }
382        }
383
384        (exit_code as u64 & SCX_ECODE_RST_MASK) != 0
385    }
386
387    pub fn exited(&self) -> bool {
388        self.skel.maps.data_data.as_ref().unwrap().uei.kind != SCX_EXIT_NONE
389    }
390}
391
392impl Drop for Scheduler<'_> {
393    fn drop(&mut self) {
394        let _ = self.skel.maps.tuning_knobs_map.unpin(KNOBS_PIN);
395        let _ = self
396            .skel
397            .maps
398            .cache_domain
399            .unpin("/sys/fs/bpf/pandemonium/cache_domain");
400        let _ = self
401            .skel
402            .maps
403            .task_class_observe
404            .unpin("/sys/fs/bpf/pandemonium/task_class_observe");
405        let _ = self
406            .skel
407            .maps
408            .task_class_init
409            .unpin("/sys/fs/bpf/pandemonium/task_class_init");
410        let _ = self
411            .skel
412            .maps
413            .compositor_map
414            .unpin("/sys/fs/bpf/pandemonium/compositor_map");
415        let _ = std::fs::remove_dir("/sys/fs/bpf/pandemonium");
416    }
417}