// scx_pandemonium/scheduler.rs
use std::mem::MaybeUninit;
6
7use anyhow::Result;
8use libbpf_rs::skel::{OpenSkel, SkelBuilder};
9use libbpf_rs::MapCore;
10
11use crate::bpf_skel::*;
12use crate::tuning::TuningKnobs;
13use scx_pandemonium::event::EventLog;
14
// `uei.kind` value meaning "the BPF scheduler has not exited".
const SCX_EXIT_NONE: i32 = 0;
// Bit in the exit code that asks userspace to restart the scheduler
// (checked in `read_exit_info`).
const SCX_ECODE_RST_MASK: u64 = 1 << 16;

// Userspace mirrors of the kernel's SCX_DSQ_* flag bits. These are written
// into BPF rodata in `Scheduler::init` so the BPF side composes DSQ ids with
// the same encoding the running kernel uses.
const SCX_DSQ_FLAG_BUILTIN: u64 = 1u64 << 63;
const SCX_DSQ_FLAG_LOCAL_ON: u64 = 1u64 << 62;
22
/// Userspace mirror of the BPF-side per-CPU statistics record.
///
/// Field order, types and total size must match the BPF struct exactly:
/// `read_stats` reinterprets raw per-CPU map bytes with
/// `ptr::read_unaligned`, hence `#[repr(C)]` and the compile-time size
/// assertion that follows this definition. All fields are `u64` counters or
/// gauges; most are summed across CPUs, a few (`wake_lat_max`,
/// `batch_sojourn_ns`, `longrun_mode_active`) are max-reduced.
#[repr(C)]
#[derive(Default, Clone, Copy)]
pub struct PandemoniumStats {
    // Dispatch / enqueue event counters.
    pub nr_dispatches: u64,
    pub nr_idle_hits: u64,
    pub nr_shared: u64,
    pub nr_preempt: u64,
    // Wakeup latency accounting (sums, max and sample counts; units are
    // presumably nanoseconds — not confirmed from this file).
    pub wake_lat_sum: u64,
    pub wake_lat_max: u64,
    pub wake_lat_samples: u64,
    pub nr_keep_running: u64,
    pub nr_hard_kicks: u64,
    pub nr_soft_kicks: u64,
    pub nr_enq_wakeup: u64,
    pub nr_enq_requeue: u64,
    pub wake_lat_idle_sum: u64,
    pub wake_lat_idle_cnt: u64,
    pub wake_lat_kick_sum: u64,
    pub wake_lat_kick_cnt: u64,
    pub nr_procdb_hits: u64,
    // L2 cache-domain hit/miss counters, split by task class.
    pub nr_l2_hit_batch: u64,
    pub nr_l2_miss_batch: u64,
    pub nr_l2_hit_interactive: u64,
    pub nr_l2_miss_interactive: u64,
    pub nr_l2_hit_lat_crit: u64,
    pub nr_l2_miss_lat_crit: u64,
    pub nr_reenqueue: u64,
    pub batch_sojourn_ns: u64,
    // Mode gauges (aggregated with `+=` / max in `read_stats`).
    pub burst_mode_active: u64,
    pub longrun_mode_active: u64,
    pub nr_overflow_rescue: u64,
}
56
// Compile-time ABI guards: the raw-byte reads in `read_stats` /
// `read_tuning_knobs` and the raw-byte write in `write_tuning_knobs` are only
// sound if the Rust structs keep exactly the size the BPF side uses
// (28 u64 fields * 8 = 224 bytes for the stats record).
const _: () = assert!(std::mem::size_of::<PandemoniumStats>() == 224);
const _: () = assert!(std::mem::size_of::<TuningKnobs>() == 80);

// bpffs pin path for the tuning-knobs map (pinned in `init`, unpinned in
// `Drop`), allowing external tools to adjust knobs at runtime.
const KNOBS_PIN: &str = "/sys/fs/bpf/pandemonium/tuning_knobs";
64
/// Owner of the loaded BPF scheduler.
///
/// Holds the skeleton, the struct_ops attach link (kept alive only so the
/// scheduler stays attached; dropping it detaches), and an in-memory event
/// log. Pinned maps are cleaned up in the `Drop` impl.
pub struct Scheduler<'a> {
    skel: MainSkel<'a>,
    // Never read — exists to keep the struct_ops attachment alive.
    _link: libbpf_rs::Link,
    pub log: EventLog,
}
70
impl<'a> Scheduler<'a> {
    /// Open, configure, load and attach the BPF scheduler.
    ///
    /// `open_object` is backing storage for the libbpf object (must outlive
    /// the skeleton). `nr_cpus_override`, when set, replaces the detected
    /// possible-CPU count written into BPF rodata.
    ///
    /// Map pinning under /sys/fs/bpf/pandemonium is best-effort: if bpffs is
    /// unavailable the scheduler still runs, just without pinned maps.
    ///
    /// # Errors
    /// Propagates failures from skeleton open/load, CPU-count detection and
    /// struct_ops attach.
    pub fn init(
        open_object: &'a mut MaybeUninit<libbpf_rs::OpenObject>,
        nr_cpus_override: Option<u64>,
    ) -> Result<Self> {
        let builder = MainSkelBuilder::default();
        let mut open_skel = builder.open(open_object)?;

        // Append the version suffix to the struct_ops `name` field so the
        // kernel-visible scheduler name carries the build version.
        {
            let ops = open_skel.struct_ops.pandemonium_ops_mut();
            let name_field = &mut ops.name;
            let version_suffix = scx_utils::build_id::ops_version_suffix(env!("CARGO_PKG_VERSION"));
            let bytes = version_suffix.as_bytes();
            let mut i = 0;
            let mut bytes_idx = 0;
            let mut found_null = false;
            // Walk to the existing NUL terminator, then overwrite from there
            // with the suffix bytes. The `len() - 1` bound always leaves one
            // slot free for the final terminator.
            while i < name_field.len() - 1 {
                found_null |= name_field[i] == 0;
                if !found_null {
                    i += 1;
                    continue;
                }
                if bytes_idx < bytes.len() {
                    name_field[i] = bytes[bytes_idx] as i8;
                    bytes_idx += 1;
                } else {
                    // Suffix fully copied.
                    break;
                }
                i += 1;
            }
            // `i` is one past the last written byte (or the last usable slot
            // if the suffix was truncated) — re-terminate there.
            name_field[i] = 0;
        }

        let rodata = open_skel.maps.rodata_data.as_mut().unwrap();

        // CPU count baked into rodata before load.
        let possible = libbpf_rs::num_possible_cpus()? as u64;
        rodata.nr_cpu_ids = nr_cpus_override.unwrap_or(possible);

        // Export the kernel's SCX_DSQ_* id encoding to the BPF side via
        // rodata, so the BPF program composes DSQ ids the kernel accepts.
        rodata.__SCX_DSQ_FLAG_BUILTIN = SCX_DSQ_FLAG_BUILTIN;
        rodata.__SCX_DSQ_FLAG_LOCAL_ON = SCX_DSQ_FLAG_LOCAL_ON;
        rodata.__SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN;
        rodata.__SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1;
        rodata.__SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON;
        rodata.__SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON | 1;
        rodata.__SCX_DSQ_LOCAL_CPU_MASK = 0xFFFFFFFF;

        // SCX kick-flag values, likewise mirrored for the BPF side.
        rodata.__SCX_KICK_IDLE = 1;
        rodata.__SCX_KICK_PREEMPT = 2;
        rodata.__SCX_KICK_WAIT = 4;

        let mut skel = open_skel.load()?;

        // Attaching the struct_ops registers the scheduler with the kernel;
        // the returned link must stay alive for as long as we want it active.
        let link = skel.maps.pandemonium_ops.attach_struct_ops()?;

        // Best-effort map pinning: stale pins from a previous run are removed
        // first, and every failure is deliberately ignored.
        let pin_dir = "/sys/fs/bpf/pandemonium";
        let bpffs_ok = std::fs::create_dir_all(pin_dir).is_ok();
        if bpffs_ok {
            std::fs::remove_file(KNOBS_PIN).ok();
            skel.maps.tuning_knobs_map.pin(KNOBS_PIN).ok();

            let cache_pin = "/sys/fs/bpf/pandemonium/cache_domain";
            std::fs::remove_file(cache_pin).ok();
            skel.maps.cache_domain.pin(cache_pin).ok();

            let observe_pin = "/sys/fs/bpf/pandemonium/task_class_observe";
            std::fs::remove_file(observe_pin).ok();
            skel.maps.task_class_observe.pin(observe_pin).ok();

            let init_pin = "/sys/fs/bpf/pandemonium/task_class_init";
            std::fs::remove_file(init_pin).ok();
            skel.maps.task_class_init.pin(init_pin).ok();

            let compositor_pin = "/sys/fs/bpf/pandemonium/compositor_map";
            std::fs::remove_file(compositor_pin).ok();
            skel.maps.compositor_map.pin(compositor_pin).ok();
        } else {
            log_warn!("BPFFS NOT AVAILABLE: map pinning skipped (scheduler still functional)");
        }

        Ok(Self {
            skel,
            _link: link,
            log: EventLog::new(),
        })
    }

    /// Read and aggregate the per-CPU stats record (key 0 of `stats_map`).
    ///
    /// Counters are summed across CPUs; `wake_lat_max`, `batch_sojourn_ns`
    /// and `longrun_mode_active` are max-reduced instead. Any lookup failure
    /// yields an all-zero record rather than an error.
    pub fn read_stats(&self) -> PandemoniumStats {
        let key = 0u32.to_ne_bytes();
        let mut total = PandemoniumStats::default();

        let percpu_vals = match self
            .skel
            .maps
            .stats_map
            .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
        {
            Ok(Some(v)) => v,
            _ => return total,
        };

        for cpu_val in &percpu_vals {
            // Guard the raw read: only reinterpret slices that are at least
            // one full record long.
            if cpu_val.len() >= std::mem::size_of::<PandemoniumStats>() {
                // SAFETY: the slice is at least size_of::<PandemoniumStats>()
                // bytes (checked above); the struct is #[repr(C)] with only
                // u64 fields (any bit pattern is valid) and read_unaligned
                // tolerates the map buffer's alignment.
                let stats: PandemoniumStats = unsafe {
                    std::ptr::read_unaligned(cpu_val.as_ptr() as *const PandemoniumStats)
                };
                total.nr_dispatches += stats.nr_dispatches;
                total.nr_idle_hits += stats.nr_idle_hits;
                total.nr_shared += stats.nr_shared;
                total.nr_preempt += stats.nr_preempt;
                total.wake_lat_sum += stats.wake_lat_sum;
                // Max-reduce: a global maximum, not a sum.
                if stats.wake_lat_max > total.wake_lat_max {
                    total.wake_lat_max = stats.wake_lat_max;
                }
                total.wake_lat_samples += stats.wake_lat_samples;
                total.nr_keep_running += stats.nr_keep_running;
                total.nr_hard_kicks += stats.nr_hard_kicks;
                total.nr_soft_kicks += stats.nr_soft_kicks;
                total.nr_enq_wakeup += stats.nr_enq_wakeup;
                total.nr_enq_requeue += stats.nr_enq_requeue;
                total.wake_lat_idle_sum += stats.wake_lat_idle_sum;
                total.wake_lat_idle_cnt += stats.wake_lat_idle_cnt;
                total.wake_lat_kick_sum += stats.wake_lat_kick_sum;
                total.wake_lat_kick_cnt += stats.wake_lat_kick_cnt;
                total.nr_procdb_hits += stats.nr_procdb_hits;
                total.nr_l2_hit_batch += stats.nr_l2_hit_batch;
                total.nr_l2_miss_batch += stats.nr_l2_miss_batch;
                total.nr_l2_hit_interactive += stats.nr_l2_hit_interactive;
                total.nr_l2_miss_interactive += stats.nr_l2_miss_interactive;
                total.nr_l2_hit_lat_crit += stats.nr_l2_hit_lat_crit;
                total.nr_l2_miss_lat_crit += stats.nr_l2_miss_lat_crit;
                total.nr_reenqueue += stats.nr_reenqueue;
                // Max-reduce.
                if stats.batch_sojourn_ns > total.batch_sojourn_ns {
                    total.batch_sojourn_ns = stats.batch_sojourn_ns;
                }
                total.burst_mode_active += stats.burst_mode_active;
                // Max-reduce.
                if stats.longrun_mode_active > total.longrun_mode_active {
                    total.longrun_mode_active = stats.longrun_mode_active;
                }
                total.nr_overflow_rescue += stats.nr_overflow_rescue;
            }
        }

        total
    }

    /// Write the tuning knobs struct into slot 0 of `tuning_knobs_map` as
    /// raw bytes (layout pinned by the size assertion above).
    ///
    /// # Errors
    /// Propagates map update failures.
    pub fn write_tuning_knobs(&self, knobs: &TuningKnobs) -> Result<()> {
        let key = 0u32.to_ne_bytes();
        // SAFETY: `knobs` is a valid reference for the duration of the call
        // and the slice covers exactly size_of::<TuningKnobs>() bytes of it.
        let value = unsafe {
            std::slice::from_raw_parts(
                knobs as *const TuningKnobs as *const u8,
                std::mem::size_of::<TuningKnobs>(),
            )
        };
        self.skel
            .maps
            .tuning_knobs_map
            .update(&key, value, libbpf_rs::MapFlags::ANY)?;
        Ok(())
    }

    /// Read the tuning knobs back from slot 0 of `tuning_knobs_map`,
    /// falling back to `TuningKnobs::default()` on any lookup failure or
    /// short value.
    pub fn read_tuning_knobs(&self) -> TuningKnobs {
        let key = 0u32.to_ne_bytes();
        match self
            .skel
            .maps
            .tuning_knobs_map
            .lookup(&key, libbpf_rs::MapFlags::ANY)
        {
            // SAFETY: guard ensures the buffer holds at least a full
            // TuningKnobs; read_unaligned tolerates arbitrary alignment.
            // Assumes any bit pattern forms a valid TuningKnobs (holds for
            // plain-data fields — confirm if the struct gains enums/bools).
            Ok(Some(v)) if v.len() >= std::mem::size_of::<TuningKnobs>() => unsafe {
                std::ptr::read_unaligned(v.as_ptr() as *const TuningKnobs)
            },
            _ => TuningKnobs::default(),
        }
    }

    /// Read the wakeup-latency histogram: 3 tiers x 12 buckets, flat-keyed
    /// as `tier * 12 + bucket` in a per-CPU map; values are summed over CPUs.
    pub fn read_wake_lat_hist(&self) -> [[u64; 12]; 3] {
        let mut result = [[0u64; 12]; 3];
        for key_idx in 0u32..36 {
            let key = key_idx.to_ne_bytes();
            if let Ok(Some(percpu_vals)) = self
                .skel
                .maps
                .wake_lat_hist
                .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
            {
                let tier = (key_idx / 12) as usize;
                let bucket = (key_idx % 12) as usize;
                for cpu_val in &percpu_vals {
                    if cpu_val.len() >= std::mem::size_of::<u64>() {
                        // SAFETY: length checked; u64 accepts any bit pattern.
                        let val: u64 =
                            unsafe { std::ptr::read_unaligned(cpu_val.as_ptr() as *const u64) };
                        result[tier][bucket] += val;
                    }
                }
            }
        }
        result
    }

    /// Read the 4-bucket sleep-duration histogram, summing per-CPU values.
    pub fn read_sleep_hist(&self) -> [u64; 4] {
        let mut result = [0u64; 4];
        for key_idx in 0u32..4 {
            let key = key_idx.to_ne_bytes();
            if let Ok(Some(percpu_vals)) = self
                .skel
                .maps
                .sleep_hist
                .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
            {
                for cpu_val in &percpu_vals {
                    if cpu_val.len() >= std::mem::size_of::<u64>() {
                        // SAFETY: length checked; u64 accepts any bit pattern.
                        let val: u64 =
                            unsafe { std::ptr::read_unaligned(cpu_val.as_ptr() as *const u64) };
                        result[key_idx as usize] += val;
                    }
                }
            }
        }
        result
    }

    /// Map `cpu` -> `l2_group` in the `cache_domain` map (pinned for
    /// external tools in `init`).
    ///
    /// # Errors
    /// Propagates map update failures.
    pub fn write_cache_domain(&self, cpu: u32, l2_group: u32) -> Result<()> {
        let key = cpu.to_ne_bytes();
        let val = l2_group.to_ne_bytes();
        self.skel
            .maps
            .cache_domain
            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
        Ok(())
    }

    /// Record sibling `cpu` for an L2 group; flat key is
    /// `group_id * 8 + slot`, so each group holds up to 8 sibling slots.
    ///
    /// # Errors
    /// Propagates map update failures.
    pub fn write_l2_sibling(&self, group_id: u32, slot: u32, cpu: u32) -> Result<()> {
        let key = (group_id * 8 + slot).to_ne_bytes();
        let val = cpu.to_ne_bytes();
        self.skel
            .maps
            .l2_siblings
            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
        Ok(())
    }

    /// Record the `slot`-th ranked affinity target for `cpu`; flat key is
    /// `cpu * 16 + slot`, so each CPU holds up to 16 ranked entries.
    ///
    /// # Errors
    /// Propagates map update failures.
    pub fn write_affinity_rank(&self, cpu: u32, slot: u32, target_cpu: u32) -> Result<()> {
        let key = (cpu * 16 + slot).to_ne_bytes();
        let val = target_cpu.to_ne_bytes();
        self.skel
            .maps
            .affinity_rank
            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
        Ok(())
    }

    /// Mark `name` as a compositor in `compositor_map`. The key is the name
    /// truncated to 15 bytes and NUL-padded to a fixed 16-byte buffer
    /// (presumably matching the BPF side's comm-style key — confirm there);
    /// the value is a constant 1.
    ///
    /// # Errors
    /// Propagates map update failures.
    pub fn write_compositor(&self, name: &str) -> Result<()> {
        let mut key = [0u8; 16];
        let bytes = name.as_bytes();
        let len = bytes.len().min(15);
        key[..len].copy_from_slice(&bytes[..len]);
        let val = [1u8];
        self.skel
            .maps
            .compositor_map
            .update(&key, &val, libbpf_rs::MapFlags::ANY)?;
        Ok(())
    }

    /// Inspect the BPF-side exit info (`uei`), logging reason/message if the
    /// scheduler has exited.
    ///
    /// Returns `true` when the exit code carries the RST bit, i.e. the
    /// scheduler should be restarted.
    pub fn read_exit_info(&self) -> bool {
        let data = self.skel.maps.data_data.as_ref().unwrap();
        let kind = data.uei.kind;
        let exit_code = data.uei.exit_code;

        if kind != SCX_EXIT_NONE {
            // SAFETY: reinterprets the uei reason/msg char arrays as byte
            // slices. Assumes `reason` is 128 bytes and `msg` is 1024 bytes,
            // matching the BPF-side layout — confirm if that struct changes.
            let reason_bytes: &[u8] =
                unsafe { std::slice::from_raw_parts(data.uei.reason.as_ptr() as *const u8, 128) };
            let msg_bytes: &[u8] =
                unsafe { std::slice::from_raw_parts(data.uei.msg.as_ptr() as *const u8, 1024) };

            // NOTE(review): from_utf8 runs over the full buffer, so a single
            // invalid byte anywhere (even past the NUL) degrades the whole
            // string to the fallback.
            let reason = std::str::from_utf8(reason_bytes)
                .unwrap_or("unknown")
                .trim_end_matches('\0');
            let msg = std::str::from_utf8(msg_bytes)
                .unwrap_or("")
                .trim_end_matches('\0');

            log_warn!("BPF exit: kind={} code={}", kind, exit_code);
            if !reason.is_empty() {
                log_warn!("BPF exit reason: {}", reason);
            }
            if !msg.is_empty() {
                log_warn!("BPF exit msg: {}", msg);
            }
        }

        (exit_code as u64 & SCX_ECODE_RST_MASK) != 0
    }

    /// `true` once the BPF scheduler has reported any exit kind.
    pub fn exited(&self) -> bool {
        self.skel.maps.data_data.as_ref().unwrap().uei.kind != SCX_EXIT_NONE
    }
}
391
392impl Drop for Scheduler<'_> {
393 fn drop(&mut self) {
394 let _ = self.skel.maps.tuning_knobs_map.unpin(KNOBS_PIN);
395 let _ = self
396 .skel
397 .maps
398 .cache_domain
399 .unpin("/sys/fs/bpf/pandemonium/cache_domain");
400 let _ = self
401 .skel
402 .maps
403 .task_class_observe
404 .unpin("/sys/fs/bpf/pandemonium/task_class_observe");
405 let _ = self
406 .skel
407 .maps
408 .task_class_init
409 .unpin("/sys/fs/bpf/pandemonium/task_class_init");
410 let _ = self
411 .skel
412 .maps
413 .compositor_map
414 .unpin("/sys/fs/bpf/pandemonium/compositor_map");
415 let _ = std::fs::remove_dir("/sys/fs/bpf/pandemonium");
416 }
417}