scx_pandemonium/
scheduler.rs1use std::mem::MaybeUninit;
6
7use anyhow::Result;
8use libbpf_rs::skel::{OpenSkel, SkelBuilder};
9use libbpf_rs::MapCore;
10
11use crate::bpf_skel::*;
12use crate::tuning::{OscillatorState, TuningKnobs};
13use scx_pandemonium::event::EventLog;
14
/// Value of `uei.kind` while the BPF side has not recorded an exit; `exited`
/// and `read_exit_info` compare against it.
const SCX_EXIT_NONE: i32 = 0;
/// Bit 16 of the exit code. `read_exit_info` returns `true` when it is set
/// (presumably "restart requested", going by the name — confirm against the
/// sched_ext headers).
const SCX_ECODE_RST_MASK: u64 = 1 << 16;

/// Bit 63: flag OR-ed into every `__SCX_DSQ_*` id that `init` writes to rodata.
const SCX_DSQ_FLAG_BUILTIN: u64 = 1u64 << 63;
/// Bit 62: additional flag used by `init` for the `LOCAL` / `LOCAL_ON` ids.
const SCX_DSQ_FLAG_LOCAL_ON: u64 = 1u64 << 62;
22
/// Statistics record read back from the BPF `stats_map`.
///
/// `read_stats` reinterprets each per-CPU value of the map as one of these
/// records, so the layout must stay exactly as is: `#[repr(C)]`, 25 `u64`
/// fields in this order, 200 bytes in total (enforced by a compile-time
/// assertion elsewhere in this file). Do not reorder, insert or remove fields
/// without changing the BPF side as well.
///
/// When aggregating across CPUs, `read_stats` sums every field except
/// `batch_sojourn_ns` and `longrun_mode_active`, for which it keeps the
/// maximum.
#[repr(C)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct PandemoniumStats {
    pub nr_dispatches: u64,
    pub nr_idle_hits: u64,
    pub nr_shared: u64,
    pub nr_preempt: u64,
    pub wake_lat_sum: u64,
    pub wake_lat_samples: u64,
    pub nr_keep_running: u64,
    pub nr_hard_kicks: u64,
    pub nr_soft_kicks: u64,
    pub nr_enq_wakeup: u64,
    pub nr_enq_requeue: u64,
    pub wake_lat_idle_sum: u64,
    pub wake_lat_idle_cnt: u64,
    pub wake_lat_kick_sum: u64,
    pub wake_lat_kick_cnt: u64,
    pub nr_l2_hit_batch: u64,
    pub nr_l2_miss_batch: u64,
    pub nr_l2_hit_interactive: u64,
    pub nr_l2_miss_interactive: u64,
    pub nr_l2_hit_lat_crit: u64,
    pub nr_l2_miss_lat_crit: u64,
    pub nr_reenqueue: u64,
    /// Aggregated with `max` (not summed) by `read_stats`.
    pub batch_sojourn_ns: u64,
    /// Aggregated with `max` (not summed) by `read_stats`.
    pub longrun_mode_active: u64,
    pub nr_overflow_rescue: u64,
}
53
// Compile-time layout guards: `read_stats` reinterprets raw map bytes as
// `PandemoniumStats`, and `write_tuning_knobs` / `read_tuning_knobs` copy
// `TuningKnobs` to and from the map as raw bytes, so the sizes are pinned here.
const _: () = assert!(std::mem::size_of::<PandemoniumStats>() == 200);
const _: () = assert!(std::mem::size_of::<TuningKnobs>() == 88);

// `write_affinity_rank` uses MAX_AFFINITY_CANDIDATES as the per-CPU key stride;
// this pins its relation to MAX_CPUS (the reason for the `>> 3` ratio is not
// visible in this file — confirm against the BPF header).
const _: () = assert!(crate::bpf_intf::MAX_AFFINITY_CANDIDATES == crate::bpf_intf::MAX_CPUS >> 3);
63
/// bpffs path at which `Scheduler::init` pins `tuning_knobs_map`; the `Drop`
/// impl unpins the same path.
const KNOBS_PIN: &str = "/sys/fs/bpf/pandemonium/tuning_knobs";
67
/// Userspace handle on the loaded PANDEMONIUM BPF scheduler.
///
/// Built by `Scheduler::init`; the `Drop` impl unpins the maps that `init`
/// pinned under `/sys/fs/bpf/pandemonium`.
pub struct Scheduler<'a> {
    /// Loaded BPF skeleton; every map and data-section access goes through it.
    skel: MainSkel<'a>,
    /// Link returned by `attach_struct_ops` in `init`. Never read; it is only
    /// held for the lifetime of the scheduler (presumably dropping it detaches
    /// the struct_ops — confirm against libbpf-rs `Link` semantics).
    _link: libbpf_rs::Link,
    /// Event log, created empty by `init` and exposed to callers.
    pub log: EventLog,
}
73
74impl<'a> Scheduler<'a> {
75 pub fn init(
76 open_object: &'a mut MaybeUninit<libbpf_rs::OpenObject>,
77 nr_cpus_override: Option<u64>,
78 ) -> Result<Self> {
79 let builder = MainSkelBuilder::default();
81 let mut open_skel = builder.open(open_object)?;
82
83 {
85 let ops = open_skel.struct_ops.pandemonium_ops_mut();
86 let name_field = &mut ops.name;
87 let version_suffix = scx_utils::build_id::ops_version_suffix(env!("CARGO_PKG_VERSION"));
88 let bytes = version_suffix.as_bytes();
89 let mut i = 0;
90 let mut bytes_idx = 0;
91 let mut found_null = false;
92 while i < name_field.len() - 1 {
93 found_null |= name_field[i] == 0;
94 if !found_null {
95 i += 1;
96 continue;
97 }
98 if bytes_idx < bytes.len() {
99 name_field[i] = bytes[bytes_idx] as i8;
100 bytes_idx += 1;
101 } else {
102 break;
103 }
104 i += 1;
105 }
106 name_field[i] = 0;
107 }
108
109 let rodata = open_skel.maps.rodata_data.as_mut().unwrap();
111
112 let possible = libbpf_rs::num_possible_cpus()? as u64;
113 rodata.nr_cpu_ids = nr_cpus_override.unwrap_or(possible);
114
115 rodata.__SCX_DSQ_FLAG_BUILTIN = SCX_DSQ_FLAG_BUILTIN;
117 rodata.__SCX_DSQ_FLAG_LOCAL_ON = SCX_DSQ_FLAG_LOCAL_ON;
118 rodata.__SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN;
119 rodata.__SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1;
120 rodata.__SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON;
121 rodata.__SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON | 1;
122 rodata.__SCX_DSQ_LOCAL_CPU_MASK = 0xFFFFFFFF;
123
124 rodata.__SCX_KICK_IDLE = 1;
126 rodata.__SCX_KICK_PREEMPT = 2;
127 rodata.__SCX_KICK_WAIT = 4;
128
129 let mut skel = open_skel.load()?;
131
132 let link = skel.maps.pandemonium_ops.attach_struct_ops()?;
134
135 let pin_dir = "/sys/fs/bpf/pandemonium";
137 let bpffs_ok = std::fs::create_dir_all(pin_dir).is_ok();
138 if bpffs_ok {
139 std::fs::remove_file(KNOBS_PIN).ok();
140 skel.maps.tuning_knobs_map.pin(KNOBS_PIN).ok();
141
142 let cache_pin = "/sys/fs/bpf/pandemonium/cache_domain";
143 std::fs::remove_file(cache_pin).ok();
144 skel.maps.cache_domain.pin(cache_pin).ok();
145
146 let observe_pin = "/sys/fs/bpf/pandemonium/task_class_observe";
147 std::fs::remove_file(observe_pin).ok();
148 skel.maps.task_class_observe.pin(observe_pin).ok();
149
150 let init_pin = "/sys/fs/bpf/pandemonium/task_class_init";
151 std::fs::remove_file(init_pin).ok();
152 skel.maps.task_class_init.pin(init_pin).ok();
153
154 let compositor_pin = "/sys/fs/bpf/pandemonium/compositor_map";
155 std::fs::remove_file(compositor_pin).ok();
156 skel.maps.compositor_map.pin(compositor_pin).ok();
157 } else {
158 log_warn!("BPFFS NOT AVAILABLE: map pinning skipped (scheduler still functional)");
159 }
160
161 Ok(Self {
162 skel,
163 _link: link,
164 log: EventLog::new(),
165 })
166 }
167
168 pub fn read_stats(&self) -> PandemoniumStats {
170 let key = 0u32.to_ne_bytes();
171 let mut total = PandemoniumStats::default();
172
173 let percpu_vals = match self
174 .skel
175 .maps
176 .stats_map
177 .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
178 {
179 Ok(Some(v)) => v,
180 _ => return total,
181 };
182
183 for cpu_val in &percpu_vals {
184 if cpu_val.len() >= std::mem::size_of::<PandemoniumStats>() {
185 let stats: PandemoniumStats = unsafe {
186 std::ptr::read_unaligned(cpu_val.as_ptr() as *const PandemoniumStats)
187 };
188 total.nr_dispatches += stats.nr_dispatches;
189 total.nr_idle_hits += stats.nr_idle_hits;
190 total.nr_shared += stats.nr_shared;
191 total.nr_preempt += stats.nr_preempt;
192 total.wake_lat_sum += stats.wake_lat_sum;
193 total.wake_lat_samples += stats.wake_lat_samples;
194 total.nr_keep_running += stats.nr_keep_running;
195 total.nr_hard_kicks += stats.nr_hard_kicks;
196 total.nr_soft_kicks += stats.nr_soft_kicks;
197 total.nr_enq_wakeup += stats.nr_enq_wakeup;
198 total.nr_enq_requeue += stats.nr_enq_requeue;
199 total.wake_lat_idle_sum += stats.wake_lat_idle_sum;
200 total.wake_lat_idle_cnt += stats.wake_lat_idle_cnt;
201 total.wake_lat_kick_sum += stats.wake_lat_kick_sum;
202 total.wake_lat_kick_cnt += stats.wake_lat_kick_cnt;
203 total.nr_l2_hit_batch += stats.nr_l2_hit_batch;
204 total.nr_l2_miss_batch += stats.nr_l2_miss_batch;
205 total.nr_l2_hit_interactive += stats.nr_l2_hit_interactive;
206 total.nr_l2_miss_interactive += stats.nr_l2_miss_interactive;
207 total.nr_l2_hit_lat_crit += stats.nr_l2_hit_lat_crit;
208 total.nr_l2_miss_lat_crit += stats.nr_l2_miss_lat_crit;
209 total.nr_reenqueue += stats.nr_reenqueue;
210 if stats.batch_sojourn_ns > total.batch_sojourn_ns {
211 total.batch_sojourn_ns = stats.batch_sojourn_ns;
212 }
213 if stats.longrun_mode_active > total.longrun_mode_active {
214 total.longrun_mode_active = stats.longrun_mode_active;
215 }
216 total.nr_overflow_rescue += stats.nr_overflow_rescue;
217 }
218 }
219
220 total
221 }
222
223 pub fn write_tuning_knobs(&self, knobs: &TuningKnobs) -> Result<()> {
225 let key = 0u32.to_ne_bytes();
226 let value = unsafe {
227 std::slice::from_raw_parts(
228 knobs as *const TuningKnobs as *const u8,
229 std::mem::size_of::<TuningKnobs>(),
230 )
231 };
232 self.skel
233 .maps
234 .tuning_knobs_map
235 .update(&key, value, libbpf_rs::MapFlags::ANY)?;
236 Ok(())
237 }
238
239 pub fn write_topology_fields(&self, tau_ns: u64, codel_eq_ns: u64) -> Result<()> {
245 let mut knobs = self.read_tuning_knobs();
246 knobs.topology_tau_ns = tau_ns;
247 knobs.codel_eq_ns = codel_eq_ns;
248 self.write_tuning_knobs(&knobs)
249 }
250
251 pub fn read_oscillator_state(&self) -> OscillatorState {
255 let bss = match self.skel.maps.bss_data.as_ref() {
256 Some(b) => b,
257 None => return OscillatorState::default(),
258 };
259 let data = match self.skel.maps.data_data.as_ref() {
260 Some(d) => d,
261 None => return OscillatorState::default(),
262 };
263 OscillatorState {
264 codel_target_ns: bss.codel_target_ns,
265 codel_target_floor_ns: bss.codel_target_floor_ns,
266 codel_target_max_ns: data.codel_target_max_ns,
267 }
268 }
269
270 pub fn read_tuning_knobs(&self) -> TuningKnobs {
272 let key = 0u32.to_ne_bytes();
273 match self
274 .skel
275 .maps
276 .tuning_knobs_map
277 .lookup(&key, libbpf_rs::MapFlags::ANY)
278 {
279 Ok(Some(v)) if v.len() >= std::mem::size_of::<TuningKnobs>() => unsafe {
280 std::ptr::read_unaligned(v.as_ptr() as *const TuningKnobs)
281 },
282 _ => TuningKnobs::default(),
283 }
284 }
285
286 pub fn read_wake_lat_hist(&self) -> [[u64; 12]; 3] {
289 let mut result = [[0u64; 12]; 3];
290 for key_idx in 0u32..36 {
291 let key = key_idx.to_ne_bytes();
292 if let Ok(Some(percpu_vals)) = self
293 .skel
294 .maps
295 .wake_lat_hist
296 .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
297 {
298 let tier = (key_idx / 12) as usize;
299 let bucket = (key_idx % 12) as usize;
300 for cpu_val in &percpu_vals {
301 if cpu_val.len() >= std::mem::size_of::<u64>() {
302 let val: u64 =
303 unsafe { std::ptr::read_unaligned(cpu_val.as_ptr() as *const u64) };
304 result[tier][bucket] += val;
305 }
306 }
307 }
308 }
309 result
310 }
311
312 pub fn read_sleep_hist(&self) -> [u64; 4] {
315 let mut result = [0u64; 4];
316 for key_idx in 0u32..4 {
317 let key = key_idx.to_ne_bytes();
318 if let Ok(Some(percpu_vals)) = self
319 .skel
320 .maps
321 .sleep_hist
322 .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
323 {
324 for cpu_val in &percpu_vals {
325 if cpu_val.len() >= std::mem::size_of::<u64>() {
326 let val: u64 =
327 unsafe { std::ptr::read_unaligned(cpu_val.as_ptr() as *const u64) };
328 result[key_idx as usize] += val;
329 }
330 }
331 }
332 }
333 result
334 }
335
336 pub fn write_cache_domain(&self, cpu: u32, l2_group: u32) -> Result<()> {
338 let key = cpu.to_ne_bytes();
339 let val = l2_group.to_ne_bytes();
340 self.skel
341 .maps
342 .cache_domain
343 .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
344 Ok(())
345 }
346
347 pub fn write_l2_sibling(&self, group_id: u32, slot: u32, cpu: u32) -> Result<()> {
349 let key = (group_id * 8 + slot).to_ne_bytes();
350 let val = cpu.to_ne_bytes();
351 self.skel
352 .maps
353 .l2_siblings
354 .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
355 Ok(())
356 }
357
358 pub fn write_affinity_rank(&self, cpu: u32, slot: u32, target_cpu: u32) -> Result<()> {
362 let stride = crate::bpf_intf::MAX_AFFINITY_CANDIDATES;
366 let key = (cpu * stride + slot).to_ne_bytes();
367 let val = target_cpu.to_ne_bytes();
368 self.skel
369 .maps
370 .affinity_rank
371 .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
372 Ok(())
373 }
374
375 pub fn write_compositor(&self, name: &str) -> Result<()> {
377 let mut key = [0u8; 16];
378 let bytes = name.as_bytes();
379 let len = bytes.len().min(15);
380 key[..len].copy_from_slice(&bytes[..len]);
381 let val = [1u8];
382 self.skel
383 .maps
384 .compositor_map
385 .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
386 Ok(())
387 }
388
389 pub fn read_exit_info(&self) -> bool {
391 let data = self.skel.maps.data_data.as_ref().unwrap();
392 let kind = data.uei.kind;
393 let exit_code = data.uei.exit_code;
394
395 if kind != SCX_EXIT_NONE {
396 let reason_bytes: &[u8] =
397 unsafe { std::slice::from_raw_parts(data.uei.reason.as_ptr() as *const u8, 128) };
398 let msg_bytes: &[u8] =
399 unsafe { std::slice::from_raw_parts(data.uei.msg.as_ptr() as *const u8, 1024) };
400
401 let reason = std::str::from_utf8(reason_bytes)
402 .unwrap_or("unknown")
403 .trim_end_matches('\0');
404 let msg = std::str::from_utf8(msg_bytes)
405 .unwrap_or("")
406 .trim_end_matches('\0');
407
408 log_warn!("BPF exit: kind={} code={}", kind, exit_code);
409 if !reason.is_empty() {
410 log_warn!("BPF exit reason: {}", reason);
411 }
412 if !msg.is_empty() {
413 log_warn!("BPF exit msg: {}", msg);
414 }
415 }
416
417 (exit_code as u64 & SCX_ECODE_RST_MASK) != 0
418 }
419
420 pub fn exited(&self) -> bool {
421 self.skel.maps.data_data.as_ref().unwrap().uei.kind != SCX_EXIT_NONE
422 }
423}
424
425impl Drop for Scheduler<'_> {
426 fn drop(&mut self) {
427 let _ = self.skel.maps.tuning_knobs_map.unpin(KNOBS_PIN);
428 let _ = self
429 .skel
430 .maps
431 .cache_domain
432 .unpin("/sys/fs/bpf/pandemonium/cache_domain");
433 let _ = self
434 .skel
435 .maps
436 .task_class_observe
437 .unpin("/sys/fs/bpf/pandemonium/task_class_observe");
438 let _ = self
439 .skel
440 .maps
441 .task_class_init
442 .unpin("/sys/fs/bpf/pandemonium/task_class_init");
443 let _ = self
444 .skel
445 .maps
446 .compositor_map
447 .unpin("/sys/fs/bpf/pandemonium/compositor_map");
448 let _ = std::fs::remove_dir("/sys/fs/bpf/pandemonium");
449 }
450}